Runtime error when running in Jupyter: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
See original GitHub issue
I put the spark/mnist_spark.py file into a Jupyter notebook and ran it, but I get a strange error: "An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe." What does this error message mean?
The Jupyter notebook runs on an IPython shell. I import pyspark and set the configuration with pyspark.SparkConf(). Creating the TFCluster works fine, but when it gets to cluster.train, it crashes and raises the error above. My code and its output are below. Thank you for helping!
# %load mnist_spark.py
# Copyright 2017 Yahoo Inc.
# Licensed under the terms of the Apache 2.0 license.
# Please see LICENSE file in the project root for terms.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pyspark
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import argparse
import os
import numpy
import sys
import tensorflow as tf
import threading
import time
from datetime import datetime
from tensorflowonspark import TFCluster
import mnist_dist
sys.argv = sys.argv[:1]
conftfos = pyspark.SparkConf().setAll([
    ('spark.yarn.queue', 'gpu'),
    ('spark.executor.instances', '2'),
    ('spark.executor.cores', '2'),
    ('spark.executor.memory', '10G'),
    ('spark.dynamicAllocation.enabled', 'False'),
    ('spark.yarn.maxAppAttempts', '1'),
    ('spark.executorEnv.LD_LIBRARY_PATH', '/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server')])
sc = pyspark.SparkContext(master="yarn",conf=conftfos)
sc.addFile('hdfs:///user/jupyter/Python.zip')
sc.addPyFile('/home/jupyter/TensorflowOnSpark/TensorFlowOnSpark/examples/mnist/spark/mnist_dist.py')
sc.addPyFile('/home/jupyter/TensorflowOnSpark/TensorFlowOnSpark/tfspark.zip')
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1
parser = argparse.ArgumentParser()
parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="csv")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format",default="mnist/csv/train/images")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format",default="mnist/csv/train/labels")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="mnist_model_csv")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=2)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true",default=False)
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:",args)
args: Namespace(batch_size=100, cluster_size=2, epochs=1, format='csv', images='mnist/csv/train/images', labels='mnist/csv/train/labels', mode='train', model='mnist_model_csv', output='predictions', rdma=False, readers=1, steps=1000, tensorboard=False)
print("{0} ===== Start".format(datetime.now().isoformat()))
if args.format == "tfr":
images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
keyClass="org.apache.hadoop.io.BytesWritable",
valueClass="org.apache.hadoop.io.NullWritable")
def toNumpy(bytestr):
example = tf.train.Example()
example.ParseFromString(bytestr)
features = example.features.feature
image = numpy.array(features['image'].int64_list.value)
label = numpy.array(features['label'].int64_list.value)
return (image, label)
dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
if args.format == "csv":
images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
else: # args.format == "pickle":
images = sc.pickleFile(args.images)
labels = sc.pickleFile(args.labels)
print("zipping images and labels")
dataRDD = images.zip(labels)
2017-06-07T16:30:36.362144 ===== Start
zipping images and labels
dataRDD.count()
60000
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
2017-06-07 16:30:47,037 INFO (MainThread-59203) Reserving TFSparkNodes
2017-06-07 16:30:47,039 INFO (MainThread-59203) listening for reservations at ('gpu3', 27664)
2017-06-07 16:30:47,040 INFO (MainThread-59203) Starting TensorFlow on executors
2017-06-07 16:30:47,043 INFO (MainThread-59203) Waiting for TFSparkNodes to start
2017-06-07 16:30:47,044 INFO (MainThread-59203) waiting for 2 reservations
2017-06-07 16:30:48,046 INFO (MainThread-59203) all reservations completed
2017-06-07 16:30:48,047 INFO (MainThread-59203) All TFSparkNodes started
2017-06-07 16:30:48,048 INFO (MainThread-59203) {'addr': '/tmp/pymp-g8Kby_/listener-_9aUh3', 'task_index': 0, 'port': 22397, 'authkey': '\xf3Z\x01\x18mLF\xe6\x99\xb6_\x84\xfb\xa4\xf7\xa8', 'worker_num': 1, 'host': 'GPU1', 'ppid': 127619, 'job_name': 'worker', 'tb_pid': 0, 'tb_port': 0}
2017-06-07 16:30:48,048 INFO (MainThread-59203) {'addr': ('GPU1', 24716), 'task_index': 0, 'port': 24236, 'authkey': '\r\xdcR\xd3\x17 L5\x84R\x1a\\\x87\xd7\xa4\xc2', 'worker_num': 0, 'host': 'GPU1', 'ppid': 127621, 'job_name': 'ps', 'tb_pid': 0, 'tb_port': 0}
if args.mode == "train":
cluster.train(dataRDD, args.epochs)
else:
labelRDD = cluster.inference(dataRDD)
labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
2017-06-07 16:30:50,717 INFO (MainThread-59203) Feeding training data
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-10-857570569c77> in <module>()
1 if args.mode == "train":
----> 2 cluster.train(dataRDD, args.epochs)
3 else:
4 labelRDD = cluster.inference(dataRDD)
5 labelRDD.saveAsTextFile(args.output)
/usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.pyc in train(self, dataRDD, num_epochs, qname)
83 rdds.append(dataRDD)
84 unionRDD = self.sc.union(rdds)
---> 85 unionRDD.foreachPartition(TFSparkNode.train(self.cluster_info, self.cluster_meta, qname))
86
87 def inference(self, dataRDD, qname='input'):
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py in foreachPartition(self, f)
796 except TypeError:
797 return iter([])
--> 798 self.mapPartitions(func).count() # Force evaluation
799
800 def collect(self):
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py in count(self)
1038 3
1039 """
-> 1040 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1041
1042 def stats(self):
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py in sum(self)
1029 6.0
1030 """
-> 1031 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1032
1033 def count(self):
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py in fold(self, zeroValue, op)
903 # zeroValue provided to each partition is unique from the one provided
904 # to the final reduce call
--> 905 vals = self.mapPartitions(func).collect()
906 return reduce(op, vals, zeroValue)
907
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py in collect(self)
806 """
807 with SCCallSiteSync(self.context) as css:
--> 808 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
809 return list(_load_from_socket(port, self._jrdd_deserializer))
810
/usr/lib/python2.7/site-packages/py4j/java_gateway.pyc in __call__(self, *args)
1152 answer = self.gateway_client.send_command(command)
1153 return_value = get_return_value(
-> 1154 answer, self.gateway_client, self.target_id, self.name)
1155
1156 for temp_arg in temp_args:
/usr/lib/python2.7/site-packages/py4j/protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
318 raise Py4JJavaError(
319 "An error occurred while calling {0}{1}{2}.\n".
--> 320 format(target_id, ".", name), value)
321 else:
322 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 20, GPU1, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/worker.py", line 174, in main
process()
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 345, in func
return f(iterator)
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 793, in func
r = f(it)
File "/usr/lib/python2.7/site-packages/tensorflowonspark/TFSparkNode.py", line 433, in _train
queue = mgr.get_queue(qname)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 667, in temp
token, exp = self._create(typeid, *args, **kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 567, in _create
id, exposed = dispatch(conn, None, 'create', (typeid,)+args, kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 105, in dispatch
raise convert_to_error(kind, result)
RemoteError:
---------------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 207, in handle_request
result = func(c, *args, **kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 386, in create
obj = callable(*args, **kwds)
File "./tfspark.zip/tensorflowonspark/TFManager.py", line 34, in <lambda>
TFManager.register('get_queue', callable=lambda qname: qdict[qname])
KeyError: 'input'
---------------------------------------------------------------------------
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/worker.py", line 174, in main
process()
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 2406, in pipeline_func
return func(split, prev_func(split, iterator))
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 345, in func
return f(iterator)
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/rdd.py", line 793, in func
r = f(it)
File "/usr/lib/python2.7/site-packages/tensorflowonspark/TFSparkNode.py", line 433, in _train
queue = mgr.get_queue(qname)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 667, in temp
token, exp = self._create(typeid, *args, **kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 567, in _create
id, exposed = dispatch(conn, None, 'create', (typeid,)+args, kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 105, in dispatch
raise convert_to_error(kind, result)
RemoteError:
---------------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 207, in handle_request
result = func(c, *args, **kwds)
File "/usr/lib64/python2.7/multiprocessing/managers.py", line 386, in create
obj = callable(*args, **kwds)
File "./tfspark.zip/tensorflowonspark/TFManager.py", line 34, in <lambda>
TFManager.register('get_queue', callable=lambda qname: qdict[qname])
KeyError: 'input'
---------------------------------------------------------------------------
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Top GitHub Comments
Can you try using ('spark.executor.cores', '1')?

@pradeepjadi please create a new git issue, since your symptoms seem different.
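
For reference, a minimal sketch of how that suggestion would look in the notebook setup from the question. Every setting except spark.executor.cores is copied unchanged from the original configuration; the change presumably keeps one concurrent Spark task per executor, which is what TensorFlowOnSpark generally expects (one TF node per executor).

import pyspark

# Same settings as in the question, with one core per executor as suggested above.
conftfos = pyspark.SparkConf().setAll([
    ('spark.yarn.queue', 'gpu'),
    ('spark.executor.instances', '2'),
    ('spark.executor.cores', '1'),  # changed from '2' per the suggestion
    ('spark.executor.memory', '10G'),
    ('spark.dynamicAllocation.enabled', 'False'),
    ('spark.yarn.maxAppAttempts', '1'),
    ('spark.executorEnv.LD_LIBRARY_PATH', '/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server')])
sc = pyspark.SparkContext(master="yarn", conf=conftfos)

After recreating the SparkContext this way, the rest of the notebook (addFile/addPyFile, TFCluster.run, cluster.train) can be rerun unchanged.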