
Constant memory usage growth in service consuming and producing via Kafka


Description

We have a group of 2 services listening to 6 topics (4 partitions each), and each service produces messages to another topic. The average consumption rate is ~1M messages per hour, aggregated across the 2 instances. The average message size is ~100-200 bytes, rarely 300 bytes (a serialized DTO). Each service creates 2 Consumer instances, since we subscribe to events from 2 broker clusters (5 topic subscriptions on the first consumer instance and 1 on the second). The producer is currently idle most of the time, sending only a few messages per day.

The code is as follows:

=========================== Configuration ==============================

var consumer_1 = new Consumer(new Dictionary<string, object> // consumer_2 is the same
{
	{ "group.id", appSettings.KafkaConsumerGroup },
	{ "bootstrap.servers", string.Join(",", appSettings.KafkaBootstrapServers_1) },
	{ "queued.max.messages.kbytes", appSettings.TopicPartitionMaxQueueKb }, // 131072 Kb = 128 Mb
	{ "queued.min.messages", appSettings.TopicPartitionMinQueueMessages }, // 10000
	{ "fetch.message.max.bytes", appSettings.TopicPartitionMaxFetchBytes } // 131072 bytes = 128 Kb
});

var producerConfig = new KafkaPushNotificationProducerConfig(new Dictionary<string, object>
{
	{ "bootstrap.servers", string.Join(",", appSettings.KafkaBootstrapServers_3) }
});
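
One note on these numbers: librdkafka of this era documents "queued.max.messages.kbytes" as a per topic+partition limit on the local pre-fetch queue, not a per-consumer total, so the configured 128 MB can multiply across all assigned partitions. A back-of-the-envelope sketch, assuming one instance ends up with all 6 topics × 4 partitions:

// Hedged worst-case estimate of native pre-fetch buffering, assuming the
// per topic+partition interpretation from the librdkafka 0.11.x docs.
const long queuedMaxKbytesPerPartition = 131072; // 128 MB, from the config above
const int totalPartitions = 6 * 4;               // 6 topics x 4 partitions each

long worstCaseBytes = queuedMaxKbytesPerPartition * 1024 * totalPartitions;
Console.WriteLine($"Worst-case pre-fetch buffers: {worstCaseBytes / (1024.0 * 1024 * 1024):F1} GB");
// ~3.0 GB of native memory that managed-heap profilers will not see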

===================== Consumer extensions ===============================

public static class KafkaConsumerExtensions
{
	private static readonly ILogger Log = Serilog.Log.Logger;

	public static void AddKafkaSubscription<TKafkaNotification, TNotification>(
		this Consumer consumer,
		IMediator mediator,
		string topic,
		IDeserializer<TKafkaNotification> deserializer,
		bool enableInternalLogging)
		where TKafkaNotification : class
		where TNotification : INotification
	{
		consumer.OnMessage += async (sender, message) =>
			{
				try
				{
					// Note: each AddKafkaSubscription call adds another OnMessage
					// handler to the same Consumer, so this line logs once per
					// registered topic for every message received.
					Log.Information($"[Confluent.Kafka.Consumer.OnMessage] Message received for topic {topic}{Environment.NewLine}" +
									$"Key bytes: {message.Key?.Length ?? 0}; Value bytes: {message.Value?.Length ?? 0}");

					if (message.Topic == topic) // disregard messages from other topics
					{
						var model = deserializer.Deserialize(message.Value);
						var request = model.MapTo<TNotification>();
						await mediator.Publish(request).ConfigureAwait(false);
					}
				}
				catch (Exception ex)
				{
					Log.Error(
						ex,
						$"[Confluent.Kafka.Consumer.OnMessage] Exception happened while processing Kafka Subscription delegate for topic: {topic} and model: {typeof(TKafkaNotification).Name}.");
				}
			};

		consumer.OnPartitionsAssigned += (sender, list) =>
			{
				Log.Debug($"[Confluent.Kafka.Consumer.OnPartitionsAssigned] #{list.Count} Partitions were assigned for Topic: {topic} and model: {typeof(TKafkaNotification).Name}.");
				consumer.Assign(list);
			};

		consumer.OnPartitionsRevoked += (sender, list) =>
			{
				Log.Debug($"[Confluent.Kafka.Consumer.OnPartitionsRevoked] #{list.Count} Partitions were revoked for Topic: {topic} and model: {typeof(TKafkaNotification).Name}.");
				consumer.Unassign();
			};

		consumer.OnError += (_, error)
			=> Log.Debug($"[Confluent.Kafka.Consumer.OnError] Kafka error. Error: {error}. Topic: {topic} and model: {typeof(TKafkaNotification).Name}");

		consumer.OnConsumeError += (sender, code)
			=> Log.Debug($"[Confluent.Kafka.Consumer.OnConsumeError] Kafka consumer error. Code: {code}. Topic: {topic} and model: {typeof(TKafkaNotification).Name}");

		if (enableInternalLogging)
		{
			consumer.OnLog += (sender, message) =>
				Log.Information(
					$"[Confluent.Kafka.Consumer.OnLog] Level: {message.Level}; Name: {message.Name}; Facility: {message.Facility}; Message: {message.Message}; ");
		}

		Log.Debug($"[Confluent.Kafka.Consumer] Added Kafka subscription for topic: {topic} and model: {typeof(TKafkaNotification).Name}.");
	}

	public static void Start(this Consumer consumer, int pollingIntervalMs)
	{
		while (true)
		{
			try
			{
				consumer.Poll(TimeSpan.FromMilliseconds(pollingIntervalMs));
			}
			catch (Exception ex)
			{
				// Serilog expects the exception as the first argument.
				Log.Error(ex, "[Confluent.Kafka.Consumer] Kafka polling error.");
			}
		}
	}
}
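
Incidentally, the Start loop above can never exit, so the consumer is never disposed and librdkafka's native handles are never released. A minimal sketch of a cancellable variant (the CancellationToken parameter is my addition, not part of the original code):

	public static void Start(this Consumer consumer, int pollingIntervalMs, CancellationToken cancellationToken)
	{
		// Poll until cancellation is requested; this lets the caller dispose
		// the consumer, which leaves the group and frees native buffers.
		while (!cancellationToken.IsCancellationRequested)
		{
			try
			{
				consumer.Poll(TimeSpan.FromMilliseconds(pollingIntervalMs));
			}
			catch (Exception ex)
			{
				Log.Error(ex, "[Confluent.Kafka.Consumer] Kafka polling error.");
			}
		}
	}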

========================= Consumer startup ==============================

_consumer_1.Subscribe(
	new List<string>
		{
			_appSettings.KafkaTopic_1_1,
			_appSettings.KafkaTopic_1_2,
			_appSettings.KafkaTopic_1_3,
			_appSettings.KafkaTopic_1_4,
			_appSettings.KafkaTopic_1_5
		});

_consumer_2.Subscribe(
	new List<string>
		{
			_appSettings.KafkaTopic_2_1
		});

_consumerTask_1 = Task.Run(() => _consumer_1.Start(_appSettings.KafkaPollingIntervalMs));
_consumerTask_2 = Task.Run(() => _consumer_2.Start(_appSettings.KafkaPollingIntervalMs));
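
Since each poll loop blocks its thread indefinitely, Task.Run will pin two thread-pool threads for the lifetime of the process; Task.Factory.StartNew with TaskCreationOptions.LongRunning gives each loop a dedicated thread instead. A sketch under that assumption:

_consumerTask_1 = Task.Factory.StartNew(
	() => _consumer_1.Start(_appSettings.KafkaPollingIntervalMs),
	CancellationToken.None,
	TaskCreationOptions.LongRunning, // dedicated thread instead of a pool thread
	TaskScheduler.Default);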

============================== Producer ==============================

public class SendMessageManager : ISendMessageManager
{
	private static readonly ILogger Log = Serilog.Log.ForContext<SendMessageManager>();

	private readonly AppSettings _settings;
	private readonly IKafkaProducerConfig _producerConfig;

	public SendMessageManager(IKafkaProducerConfig producerConfig, AppSettings settings)
	{
		_settings = settings;
		_producerConfig = producerConfig;
	}

	public async Task SendMessage(SendMessageRequest request)
	{
		try
		{
			var settings = new AvroSerializerSettings { UsePosixTime = true };
			var avroSerializer = AvroSerializer.Create<SendMessageRequest>(settings);

			using (var stream = new MemoryStream())
			{
				avroSerializer.Serialize(stream, request);
				var payload = stream.ToArray();
				using (var producer = new Producer(_producerConfig.Instance))
				{
					if (_settings.EnableInternalLogging.HasValue && _settings.EnableInternalLogging.Value)
					{
						// Attach the handler before producing, otherwise log events
						// emitted during the send are missed.
						producer.OnLog += (sender, message) =>
							Log.Debug(
								$"[Confluent.Kafka.Producer] Level: {message.Level}; Name: {message.Name}; Facility: {message.Facility}; Message: {message.Message}; ");
					}

					var report = await producer.ProduceAsync(_settings.KafkaSendMessageTopic, null, payload);

					Log.Debug($"[{nameof(SendMessage)}] 'SendMessage' request to Kafka succeeded.{Environment.NewLine}" +
									$"SendMessageRequest: {JsonConvert.SerializeObject(request)}{Environment.NewLine}" +
									$"Delivery report: Topic = {_settings.KafkaSendMessageTopic}; Partition = {report.Partition}; Offset = {report.Offset}");
				}
			}
		}
		catch (Exception e)
		{
			Log.Error(e, $"[{nameof(SendMessage)}] 'SendMessage' request to Kafka failed.{Environment.NewLine}" +
					  $"SendMessageRequest: {JsonConvert.SerializeObject(request)}");
			throw;
		}
	}
}
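
Separately, constructing and disposing a Producer inside every SendMessage call is costly: each instance spins up its own librdkafka handle and background threads. The usual pattern is one long-lived producer shared across sends. A minimal sketch under that assumption (the hypothetical KafkaSender and its byte[] payload stand in for the class and Avro serialization above):

public class KafkaSender : IDisposable
{
	private readonly Producer _producer; // one instance for the process lifetime
	private readonly string _topic;

	public KafkaSender(IKafkaProducerConfig producerConfig, string topic)
	{
		_producer = new Producer(producerConfig.Instance);
		_topic = topic;
	}

	public Task<Message> SendAsync(byte[] payload)
		=> _producer.ProduceAsync(_topic, null, payload);

	public void Dispose() => _producer.Dispose(); // flushes pending messages, frees native resources
}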

Service memory usage reached ~5 GB in around half a day and keeps growing until OOM. It rises by ~1 MB per second, sometimes stays flat for a while, then starts growing again. After some investigation I lowered some settings as per the wiki explanation (each ~10 times lower): "queued.max.messages.kbytes", "queued.min.messages" and "fetch.message.max.bytes", since I assumed message consumption was the most likely source of the leak. This may have slowed the growth a bit but didn't stop it. Attaching the DebugDiag tool to the process didn't show any obvious memory leak, except that Committed Virtual Memory is high and roughly equal to RAM usage. So I'm currently struggling to find the guilty party for this behavior and thought you guys might help with the direction.
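
Given that DebugDiag found no managed leak while committed memory tracks RAM, it may be worth logging the managed heap size next to the process working set: if the working set grows while the GC heap stays flat, the allocations are native (e.g. librdkafka buffers) and no .NET heap profiler will attribute them. A minimal diagnostic sketch, assuming it runs on a timer inside the service:

// Requires: using System; using System.Diagnostics;
var managedMb = GC.GetTotalMemory(forceFullCollection: false) / (1024.0 * 1024);
var workingSetMb = Process.GetCurrentProcess().WorkingSet64 / (1024.0 * 1024);

// Flat managed heap + growing working set => native leak, not a GC heap leak.
Log.Information($"Managed heap: {managedMb:F0} MB; Working set: {workingSetMb:F0} MB");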

How to reproduce

All conditions are pretty much in the description.

Checklist

Please provide the following information:

  • [+] Confluent.Kafka nuget version: 0.11.5
  • [-] Apache Kafka version:
  • [+] Client configuration: check description
  • [+] Operating system: Windows Server 2012 R2
  • [-] Provide logs (with “debug” : “…” as necessary in configuration): I will turn on and update
  • [-] Provide broker log excerpts
  • [+] Critical issue

Issue Analytics

  • State: closed
  • Created 5 years ago
  • Comments: 5 (2 by maintainers)

Top GitHub Comments

1 reaction
vprokopenko commented, Oct 1, 2018

@mhowlett, that seems to be true; the issue is not related to the Kafka client. Thanks for looking into it.

0 reactions
mhowlett commented, Sep 24, 2018

This looks unrelated to Confluent.Kafka to me; I can't think of any reason for the large number of Int32 arrays, and we don't make use of the ASP.NET caching functionality.


