Unexpected metadata causes IllegalStateException
We ran into a java.lang.IllegalStateException
while testing the EoS (exactly-once semantics) implementation and we are not sure what is causing it.
The scenario is as follows: in a loop, we create a KafkaJS client and a producer, write a number of messages in a single transaction, and then discard the client.
What we observe is that after a variable number of iterations the following exception is raised on the broker:
kafka1_1 | [2019-02-28 19:34:07,949] ERROR TransactionMetadata(transactionalId=transactional-id, producerId=382001, producerEpoch=877, txnTimeoutMs=60000, state=Empty, pendingState=Some(Ongoing), topicPartitions=Set(), txnStartTimestamp=1551382447950, txnLastUpdateTimestamp=1551382447936)'s transition to TxnTransitMetadata(producerId=382001, producerEpoch=877, txnTimeoutMs=60000, txnState=Ongoing, topicPartitions=Set(test-topic-1551382421868-1), txnStartTimestamp=1551382447942, txnLastUpdateTimestamp=1551382447942) failed: this should not happen (kafka.coordinator.transaction.TransactionMetadata)
kafka1_1 | [2019-02-28 19:34:07,953] ERROR [KafkaApi-0] Error when handling request {transactional_id=transactional-id,producer_id=382001,producer_epoch=877,topics=[{topic=test-topic-1551382421868,partitions=[1]}]} (kafka.server.KafkaApis)
kafka1_1 | java.lang.IllegalStateException: TransactionalId transactional-id failed transition to state TxnTransitMetadata(producerId=382001, producerEpoch=877, txnTimeoutMs=60000, txnState=Ongoing, topicPartitions=Set(test-topic-1551382421868-1), txnStartTimestamp=1551382447942, txnLastUpdateTimestamp=1551382447942) due to unexpected metadata
(further detail below).
The Kafka cluster remains up; however, a subsequent run of the reproducer fails with KafkaJSNumberOfRetriesExceeded
after a number of retries on the CONCURRENT_TRANSACTIONS
error. This happens regardless of whether we use the same topic.
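The retry count before KafkaJSNumberOfRetriesExceeded is thrown comes from the client-level retry settings; the reproducer below passes no retry option, so it uses the KafkaJS defaults. As a minimal sketch only (the values are hypothetical, and judging by the second-run log the broker keeps answering CONCURRENT_TRANSACTIONS, so extra retries would presumably only delay the failure), the retry behaviour can be widened when constructing the client:

const kafkajs = require("kafkajs");

// Hypothetical retry configuration, not used by the reproducer below.
const client = new kafkajs.Kafka({
  clientId: "reproducer",
  brokers: ["localhost:9092"],
  retry: {
    initialRetryTime: 300, // ms before the first retry
    retries: 8,            // retries per call before KafkaJSNumberOfRetriesExceeded
  },
});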
Reproducer:
const ip = require("ip");
const kafkajs = require("kafkajs");

const id = Date.now();
const transactionalId = "transactional-id";
const host = process.env.HOST_IP || ip.address();
const broker = `${host}:9092`;
const topic = `test-topic-${id}`;
const acks = -1;
const maxInFlightRequests = 1;
const connectionTimeout = 1000;
const key = `sink-1-${id}`;

async function setupAdmin() {
  const client = new kafkajs.Kafka({ clientId: "reproducer", brokers: [broker] });
  const admin = client.admin();
  try {
    await admin.connect();
    await admin.createTopics({
      waitForLeaders: true,
      topics: [{ topic, numPartitions: 4, configEntries: [] }],
    });
  } finally {
    await admin.disconnect();
  }
  console.log("ADMIN SETUP DONE");
}

function createMessages(num) {
  const messages = [];
  for (let ii = 0; ii < num; ii++) {
    messages.push({ key, value: `${ii}` });
  }
  return messages;
}

async function runOneProducer(num) {
  const messages = createMessages(num);
  const client = new kafkajs.Kafka({
    clientId: "reproducer",
    brokers: [broker],
    connectionTimeout,
    maxInFlightRequests,
  });
  const producer = client.producer({
    // An idempotent producer enforces EoS messaging
    // idempotent: transactional ? true : false,
    idempotent: true,
    transactionalId,
  });
  await producer.connect();
  const transaction = await producer.transaction();
  try {
    await transaction.send({
      acks,
      topic,
      messages,
    });
  } catch (e) {
    try {
      await transaction.abort();
    } catch (e) {
      throw e;
    }
    throw e;
  }
  await transaction.commit();
  await producer.disconnect();
}

async function runProducerReplicator() {
  await setupAdmin();
  const repeats = 5000;
  const numMessages = 128;
  for (let ii = 0; ii < repeats; ii++) {
    await runOneProducer(numMessages);
    if (ii % 20 === 0) {
      console.log(ii);
    }
  }
}

runProducerReplicator();
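For comparison only (this is a sketch, not part of the original reproducer, and we have not verified whether it avoids the broker error): the reproducer creates a fresh client and producer with the same transactionalId on every iteration. A variant that reuses a single long-lived producer for all transactions, using the same constants and the createMessages/setupAdmin helpers from above, would look roughly like this:

// Hypothetical variant: one producer instance reused for every transaction.
// Same KafkaJS calls as the reproducer; only the producer lifecycle differs.
async function runOneTransactionReused(producer, num) {
  const transaction = await producer.transaction();
  try {
    await transaction.send({ acks, topic, messages: createMessages(num) });
    await transaction.commit();
  } catch (e) {
    await transaction.abort();
    throw e;
  }
}

async function runProducerReplicatorReused() {
  await setupAdmin();
  const client = new kafkajs.Kafka({
    clientId: "reproducer",
    brokers: [broker],
    connectionTimeout,
    maxInFlightRequests,
  });
  const producer = client.producer({ idempotent: true, transactionalId });
  await producer.connect();
  try {
    for (let ii = 0; ii < 5000; ii++) {
      await runOneTransactionReused(producer, 128);
      if (ii % 20 === 0) console.log(ii);
    }
  } finally {
    await producer.disconnect();
  }
}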
Kafka Log:
kafka1_1 | [2019-02-28 19:34:07,944] INFO [TransactionCoordinator id=0] Initialized transactionalId transactional-id with producerId 382001 and producer epoch 876 on partition __transaction_state-31 (kafka.coordinator.transaction.TransactionCoordinator)
kafka1_1 | [2019-02-28 19:34:07,936] INFO [TransactionCoordinator id=0] Initialized transactionalId transactional-id with producerId 382001 and producer epoch 877 on partition __transaction_state-31 (kafka.coordinator.transaction.TransactionCoordinator)
kafka1_1 | [2019-02-28 19:34:07,949] ERROR TransactionMetadata(transactionalId=transactional-id, producerId=382001, producerEpoch=877, txnTimeoutMs=60000, state=Empty, pendingState=Some(Ongoing), topicPartitions=Set(), txnStartTimestamp=1551382447950, txnLastUpdateTimestamp=1551382447936)'s transition to TxnTransitMetadata(producerId=382001, producerEpoch=877, txnTimeoutMs=60000, txnState=Ongoing, topicPartitions=Set(test-topic-1551382421868-1), txnStartTimestamp=1551382447942, txnLastUpdateTimestamp=1551382447942) failed: this should not happen (kafka.coordinator.transaction.TransactionMetadata)
kafka1_1 | [2019-02-28 19:34:07,953] ERROR [KafkaApi-0] Error when handling request {transactional_id=transactional-id,producer_id=382001,producer_epoch=877,topics=[{topic=test-topic-1551382421868,partitions=[1]}]} (kafka.server.KafkaApis)
kafka1_1 | java.lang.IllegalStateException: TransactionalId transactional-id failed transition to state TxnTransitMetadata(producerId=382001, producerEpoch=877, txnTimeoutMs=60000, txnState=Ongoing, topicPartitions=Set(test-topic-1551382421868-1), txnStartTimestamp=1551382447942, txnLastUpdateTimestamp=1551382447942) due to unexpected metadata
kafka1_1 | at kafka.coordinator.transaction.TransactionMetadata.throwStateTransitionFailure(TransactionMetadata.scala:390)
kafka1_1 | at kafka.coordinator.transaction.TransactionMetadata.completeTransitionTo(TransactionMetadata.scala:326)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$kafka$coordinator$transaction$TransactionStateManager$$updateCacheCallback$1$1.apply$mcV$sp(TransactionStateManager.scala:534)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$kafka$coordinator$transaction$TransactionStateManager$$updateCacheCallback$1$1.apply(TransactionStateManager.scala:526)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$kafka$coordinator$transaction$TransactionStateManager$$updateCacheCallback$1$1.apply(TransactionStateManager.scala:526)
kafka1_1 | at kafka.utils.CoreUtils$.inLock(CoreUtils.scala:250)
kafka1_1 | at kafka.coordinator.transaction.TransactionMetadata.inLock(TransactionMetadata.scala:172)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager.kafka$coordinator$transaction$TransactionStateManager$$updateCacheCallback$1(TransactionStateManager.scala:525)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$appendTransactionToLog$1$$anonfun$apply$mcV$sp$11.apply(TransactionStateManager.scala:620)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$appendTransactionToLog$1$$anonfun$apply$mcV$sp$11.apply(TransactionStateManager.scala:620)
kafka1_1 | at kafka.server.DelayedProduce.onComplete(DelayedProduce.scala:129)
kafka1_1 | at kafka.server.DelayedOperation.forceComplete(DelayedOperation.scala:70)
kafka1_1 | at kafka.server.DelayedProduce.tryComplete(DelayedProduce.scala:111)
kafka1_1 | at kafka.server.DelayedOperationPurgatory.tryCompleteElseWatch(DelayedOperation.scala:232)
kafka1_1 | at kafka.server.ReplicaManager.appendRecords(ReplicaManager.scala:488)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$appendTransactionToLog$1.apply$mcV$sp(TransactionStateManager.scala:614)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$appendTransactionToLog$1.apply(TransactionStateManager.scala:591)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager$$anonfun$appendTransactionToLog$1.apply(TransactionStateManager.scala:591)
kafka1_1 | at kafka.utils.CoreUtils$.inLock(CoreUtils.scala:250)
kafka1_1 | at kafka.utils.CoreUtils$.inReadLock(CoreUtils.scala:256)
kafka1_1 | at kafka.coordinator.transaction.TransactionStateManager.appendTransactionToLog(TransactionStateManager.scala:585)
kafka1_1 | at kafka.coordinator.transaction.TransactionCoordinator.handleAddPartitionsToTransaction(TransactionCoordinator.scala:272)
kafka1_1 | at kafka.server.KafkaApis.handleAddPartitionToTxnRequest(KafkaApis.scala:1744)
kafka1_1 | at kafka.server.KafkaApis.handle(KafkaApis.scala:128)
kafka1_1 | at kafka.server.KafkaRequestHandler.run(KafkaRequestHandler.scala:69)
kafka1_1 | at java.lang.Thread.run(Thread.java:748)
kafka2_1 | waiting for kafka to be ready
kafka3_1 | waiting for kafka to be ready
KafkaJS Log:
ADMIN SETUP DONE
0
{"level":"ERROR","timestamp":"2019-02-28T19:33:42.609Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":5,"size":20}
20
…
180
{"level":"ERROR","timestamp":"2019-02-28T19:33:48.652Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":4,"size":20}
200
…
860
{"level":"ERROR","timestamp":"2019-02-28T19:34:07.963Z","logger":"kafkajs","message":"[Connection] Response AddPartitionsToTxn(key: 24, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The server experienced an unexpected error when processing the request","correlationId":7,"size":48}
{"level":"ERROR","timestamp":"2019-02-28T19:34:07.964Z","logger":"kafkajs","message":"[Producer] The server experienced an unexpected error when processing the request","retryCount":0,"retryTime":342}
{"level":"ERROR","timestamp":"2019-02-28T19:34:07.971Z","logger":"kafkajs","message":"[Connection] Response EndTxn(key: 26, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":8,"size":10}
(node:14623) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 884): KafkaJSProtocolError: The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing
(node:14623) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
KafkaJS Log after second run of the reproducer:
ADMIN SETUP DONE
{"level":"ERROR","timestamp":"2019-02-28T19:35:18.152Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":4,"size":20}
{"level":"ERROR","timestamp":"2019-02-28T19:35:18.525Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":5,"size":20}
{"level":"ERROR","timestamp":"2019-02-28T19:35:19.148Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":7,"size":20}
{"level":"ERROR","timestamp":"2019-02-28T19:35:20.358Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":8,"size":20}
{"level":"ERROR","timestamp":"2019-02-28T19:35:22.333Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":9,"size":20}
{"level":"ERROR","timestamp":"2019-02-28T19:35:26.489Z","logger":"kafkajs","message":"[Connection] Response InitProducerId(key: 22, version: 0)","broker":"10.51.16.56:9092","clientId":"reproducer","error":"The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing","correlationId":11,"size":20}
(node:14652) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 9): KafkaJSNumberOfRetriesExceeded: The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing
(node:14652) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
Maintainer comments:
I will release 1.5.0 then. @sklose @plameniv thanks for shaping up this release!
Agreed, this is an edge case and not a blocker for the 1.5.0 release.