[BUG]: SerializationException when using the Foreach method in streaming
I'm trying to run a streaming application that reads from Kafka and uses the Foreach sink. However, when I run my application, just after I put some data in the Kafka topic, I get this error:
org.apache.spark.api.python.PythonException: System.Runtime.Serialization.SerializationException: Unable to find assembly ‘StreamingDemo, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null’. (StreamingDemo is my project).
The code I'm using is pretty simple; if I remove the line that calls Foreach() and uncomment the Format() line, the application works, printing the data from Kafka in the console (the working variant is also sketched after this listing):
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Streaming;

namespace StreamingDemo
{
    class Program
    {
        static void Main(string[] args)
        {
            string bootstrapServers = "localhost:9092"; //args[0];
            string topics = "test"; //args[1];

            SparkSession spark = SparkSession
                .Builder()
                .AppName("StructuredKafkaWordCount")
                .GetOrCreate();

            // Read the Kafka value column as a string.
            DataFrame lines = spark
                .ReadStream()
                .Format("kafka")
                .Option("kafka.bootstrap.servers", bootstrapServers)
                .Option("subscribe", topics)
                .Load()
                .SelectExpr("CAST(value AS STRING)");

            StreamingQuery query = lines
                .WriteStream()
                .OutputMode(OutputMode.Append)
                //.Format("console")
                .Trigger(Trigger.Continuous(2000))
                .Foreach(new RedisForeachWriter())
                .Start();

            query.AwaitTermination();
        }
    }
}
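For completeness, this is the write side that does work when the Foreach sink is swapped out for the console sink, exactly as described above (a minimal sketch; everything else in Main stays the same):

StreamingQuery query = lines
    .WriteStream()
    .OutputMode(OutputMode.Append)
    .Format("console")
    .Trigger(Trigger.Continuous(2000))
    .Start();

query.AwaitTermination();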
The code of the RedisForeachWriter class is the following (besides the name, there's nothing related to Redis in this code yet):
using System;
using Microsoft.Spark.Sql;

namespace StreamingDemo
{
    [Serializable]
    public class RedisForeachWriter : IForeachWriter
    {
        public void Close(Exception errorOrNull)
        {
            Console.BackgroundColor = ConsoleColor.Yellow;
            Console.WriteLine(errorOrNull);
            Console.ResetColor();
        }

        public bool Open(long partitionId, long epochId)
        {
            Console.BackgroundColor = ConsoleColor.Yellow;
            Console.WriteLine($"Open: {partitionId} - {epochId}");
            Console.ResetColor();
            return true;
        }

        public void Process(Row row)
        {
            Console.BackgroundColor = ConsoleColor.Yellow;
            Console.WriteLine($"Value: {row.Get(0)}");
            Console.ResetColor();
        }
    }
}
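As an aside, once real Redis calls go in, the writer still has to round-trip through BinaryFormatter to reach the worker, so connection state must not be part of the serialized payload. A minimal sketch of that pattern (the FileForeachWriter name and log path are hypothetical, used only to illustrate; a real client would be created the same way inside Open):

using System;
using System.IO;
using Microsoft.Spark.Sql;

namespace StreamingDemo
{
    [Serializable]
    public class FileForeachWriter : IForeachWriter
    {
        // Connection-like state is not serializable; keep it out of the
        // serialized payload and create it on the executor in Open().
        [NonSerialized]
        private StreamWriter _writer;

        public bool Open(long partitionId, long epochId)
        {
            _writer = File.AppendText($"/tmp/partition-{partitionId}.log");
            return true;
        }

        public void Process(Row row) => _writer.WriteLine(row.Get(0));

        public void Close(Exception errorOrNull) => _writer?.Dispose();
    }
}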
This is the full stack trace. What am I missing?
Thank you
[2020-02-25T19:54:57.9586743Z] [KALI] [Error] [JvmBridge] org.apache.spark.sql.streaming.StreamingQueryException: Writing job aborted.
=== Streaming Query ===
Identifier: [id = 421593f9-16f7-45c7-ad79-2221eb9cac1e, runId = 0d05a8d1-44ae-4705-933c-06a28bcddc13]
Current Committed Offsets: {}
Current Available Offsets: {}
Current State: ACTIVE
Thread State: RUNNABLE
Logical Plan:
Project [cast(value#8 as string) AS value#21]
+- ContinuousExecutionRelation org.apache.spark.sql.kafka010.KafkaSourceProvider@640c811c, Map(subscribe -> test, kafka.bootstrap.servers -> localhost:9092), [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:297)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:193)
Caused by: org.apache.spark.SparkException: Writing job aborted.
at org.apache.spark.sql.execution.streaming.continuous.WriteToContinuousDataSourceExec.doExecute(WriteToContinuousDataSourceExec.scala:62)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution$$anonfun$runContinuous$4$$anonfun$apply$1.apply(ContinuousExecution.scala:262)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution$$anonfun$runContinuous$4$$anonfun$apply$1.apply(ContinuousExecution.scala:262)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution$$anonfun$runContinuous$4.apply(ContinuousExecution.scala:261)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution$$anonfun$runContinuous$4.apply(ContinuousExecution.scala:261)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution.runContinuous(ContinuousExecution.scala:260)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution.runActivatedStream(ContinuousExecution.scala:90)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:281)
... 1 more
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): org.apache.spark.api.python.PythonException: System.Runtime.Serialization.SerializationException: Unable to find assembly 'StreamingDemo, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null'.
at System.Runtime.Serialization.Formatters.Binary.BinaryAssemblyInfo.GetAssembly()
at System.Runtime.Serialization.Formatters.Binary.ObjectReader.GetType(BinaryAssemblyInfo assemblyInfo, String name)
at System.Runtime.Serialization.Formatters.Binary.ObjectMap..ctor(String objectName, String[] memberNames, BinaryTypeEnum[] binaryTypeEnumA, Object[] typeInformationA, Int32[] memberAssemIds, ObjectReader objectReader, Int32 objectId, BinaryAssemblyInfo assemblyInfo, SizedArray assemIdToAssemblyTable)
at System.Runtime.Serialization.Formatters.Binary.BinaryParser.ReadObjectWithMapTyped(BinaryObjectWithMapTyped record)
at System.Runtime.Serialization.Formatters.Binary.BinaryParser.ReadObjectWithMapTyped(BinaryHeaderEnum binaryHeaderEnum)
at System.Runtime.Serialization.Formatters.Binary.BinaryParser.Run()
at System.Runtime.Serialization.Formatters.Binary.ObjectReader.Deserialize(BinaryParser serParser, Boolean fCheck)
at System.Runtime.Serialization.Formatters.Binary.BinaryFormatter.Deserialize(Stream serializationStream, Boolean check)
at System.Runtime.Serialization.Formatters.Binary.BinaryFormatter.Deserialize(Stream serializationStream)
at Microsoft.Spark.Utils.CommandSerDe.Deserialize[T](Stream stream, SerializedMode& serializerMode, SerializedMode& deserializerMode, String& runMode) in /_/src/csharp/Microsoft.Spark/Utils/CommandSerDe.cs:line 243
at Microsoft.Spark.Worker.Processor.CommandProcessor.ReadRDDCommand(Stream stream) in D:\a\1\s\src\csharp\Microsoft.Spark.Worker\Processor\CommandProcessor.cs:line 82
at Microsoft.Spark.Worker.Processor.CommandProcessor.Process(Stream stream) in D:\a\1\s\src\csharp\Microsoft.Spark.Worker\Processor\CommandProcessor.cs:line 58
at Microsoft.Spark.Worker.Processor.PayloadProcessor.Process(Stream stream) in D:\a\1\s\src\csharp\Microsoft.Spark.Worker\Processor\PayloadProcessor.cs:line 74
at Microsoft.Spark.Worker.TaskRunner.ProcessStream(Stream inputStream, Stream outputStream, Version version, Boolean& readComplete) in D:\a\1\s\src\csharp\Microsoft.Spark.Worker\TaskRunner.cs:line 143
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at org.apache.spark.sql.execution.python.PythonForeachWriter.close(PythonForeachWriter.scala:66)
at org.apache.spark.sql.execution.streaming.sources.ForeachDataWriter.closeWriter(ForeachWriterProvider.scala:142)
at org.apache.spark.sql.execution.streaming.sources.ForeachDataWriter.commit(ForeachWriterProvider.scala:131)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousWriteRDD$$anonfun$compute$1.apply$mcV$sp(ContinuousWriteRDD.scala:62)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousWriteRDD$$anonfun$compute$1.apply(ContinuousWriteRDD.scala:51)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousWriteRDD$$anonfun$compute$1.apply(ContinuousWriteRDD.scala:51)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
at org.apache.spark.sql.execution.streaming.continuous.ContinuousWriteRDD.compute(ContinuousWriteRDD.scala:76)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
@elvaliuliuliu, it seems that the latest version of the binaries (0.10.0) has corrected the "Could not get or parse batch id from TaskContext" exception.
Can you check if the assembly is available on the worker? https://github.com/dotnet/spark/blob/master/docs/deploy-worker-udf-binaries.md
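For anyone hitting the same SerializationException: the worker deserializes the ForeachWriter with BinaryFormatter, so StreamingDemo.dll has to be resolvable on the worker side. Per the doc above, the DOTNET_ASSEMBLY_SEARCH_PATHS environment variable tells Microsoft.Spark.Worker where to look; a sketch of a local run (all paths below are placeholders for your environment, and the jar version depends on your Spark install):

# Make the worker able to find the app's assemblies, including the
# one that defines RedisForeachWriter (placeholder paths).
export DOTNET_WORKER_DIR=/path/to/Microsoft.Spark.Worker
export DOTNET_ASSEMBLY_SEARCH_PATHS=/path/to/StreamingDemo/bin/Debug/netcoreapp3.1

spark-submit \
  --class org.apache.spark.deploy.dotnet.DotnetRunner \
  --master local \
  /path/to/microsoft-spark-2.4.x-<version>.jar \
  dotnet StreamingDemo.dll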