OSError: [WinError 123]: Spark on Windows local
I'm trying to execute an example:

Spark version: 2.3.4
```python
import time

from sklearn import datasets, svm
from skdist.distribute.search import DistGridSearchCV
from pyspark.sql import SparkSession

# instantiate spark session
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC()
param_grid = {
    "C": [0.01, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"]
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
start = time.time()
model = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring,
    verbose=True
)
```
When I try to train the model with `model.fit(X, y)`, it fails with:

```
OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: 'C:\C:\spark\jars\spark-core_2.11-2.3.4.jar'
```

(The German message translates to "The filename, directory name, or volume label syntax is incorrect.")
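The doubled drive letter suggests that one of the `sys.path` entries seen by the Python worker (the Spark jar paths derived from `SPARK_HOME`) is malformed, and `importlib_metadata` trips over it while scanning the path during the sklearn import. As a purely diagnostic sketch (not from the original report), the worker-side `sys.path` can be printed with a trivial job that avoids importing sklearn:

```python
# Diagnostic sketch (assumption, not from the original report): print the
# sys.path entries as seen by a Spark Python worker. If a 'C:\C:\...' entry
# shows up here, the malformed path comes from how the worker environment is
# assembled from SPARK_HOME, not from skdist itself.
import sys

worker_paths = sc.parallelize([0], numSlices=1).map(lambda _: list(sys.path)).collect()
for p in worker_paths[0]:
    print(p)
```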
`SPARK_HOME` is set to `C:\spark` and `%SPARK_HOME%\bin` is on the PATH.
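For comparison, a minimal sketch of setting `SPARK_HOME` from Python before the session is created (illustrative only; the `C:\spark` path is the one given above, but the `findspark` package is my assumption and is not mentioned in the original report):

```python
# Illustrative sketch: point SPARK_HOME at the Spark installation before any
# pyspark import, then let findspark put the matching pyspark libraries on
# sys.path. The path is the one mentioned above.
import os
os.environ["SPARK_HOME"] = r"C:\spark"

import findspark
findspark.init()  # uses SPARK_HOME to locate Spark and extend sys.path

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
```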
Without passing the SparkContext I'm able to run the code, i.e. changing

```python
model = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring)
```

to:

```python
model = DistGridSearchCV(
    classifier, param_grid,
    cv=cv, scoring=scoring)
```
I also tried to pass the `sparkHome` argument when defining the SparkContext, without the drive letter "C:":

```python
sc = SparkContext(appName="Dist_Exmp", sparkHome="spark")
sc.sparkHome  # --> 'spark'
```

But the value is still taken from the environment variable.
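A related sketch, under the assumption that the value really is resolved from the environment when the JVM gateway is launched: override `SPARK_HOME` in the environment itself, before the first `SparkContext` of the process is created, instead of passing `sparkHome`:

```python
# Illustrative sketch: pyspark resolves SPARK_HOME from the environment when
# it launches the JVM gateway, so the override must happen before the first
# SparkContext/SparkSession of the process is created.
import os
os.environ["SPARK_HOME"] = r"C:\spark"   # install location from above

from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("Dist_Exmp").setMaster("local[*]")
sc = SparkContext(conf=conf)
```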
Here is the whole trace:
`--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-72-d768f88d541e> in <module> ----> 1 model.fit(X_train, y_train)
C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py in fit(self, X, y, groups, **fit_params) 367 base_estimator_ = self.sc.broadcast(base_estimator) 368 partitions = _parse_partitions(self.partitions, len(fit_sets)) –> 369 out = self.sc.parallelize(fit_sets, numSlices=partitions).map(lambda x: [x[0], fit_and_score( 370 base_estimator, X, y, scorers, x[2][0], x[2][1], 371 verbose, x[1], fit_params=fit_params,
C:\MeineProgramme\anaconda3\lib\site-packages\pyspark\rdd.py in collect(self) 812 “”" 813 with SCCallSiteSync(self.context) as css: –> 814 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) 815 return list(_load_from_socket(sock_info, self._jrdd_deserializer)) 816
C:\MeineProgramme\anaconda3\lib\site-packages\py4j\java_gateway.py in call(self, *args) 1255 answer = self.gateway_client.send_command(command) 1256 return_value = get_return_value( -> 1257 answer, self.gateway_client, self.target_id, self.name) 1258 1259 for temp_arg in temp_args:
C:\MeineProgramme\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw) 61 def deco(*a, **kw): 62 try: —> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString()
C:\MeineProgramme\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 326 raise Py4JJavaError( 327 “An error occurred while calling {0}{1}{2}.\n”. –> 328 format(target_id, “.”, name), value) 329 else: 330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 1.0 failed 1 times, most recent failure: Lost task 1.0 in stage 1.0 (TID 6, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File “C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py”, line 240, in main File “C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py”, line 60, in read_command File “C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py”, line 171, in read_with_length return self.loads(obj) File “C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py”, line 566, in loads return pickle.loads(obj, encoding=encoding) File “C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py”, line 14, in <module> from sklearn.model_selection import ( File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_init.py", line 19, in <module> from .validation import cross_val_score File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_validation.py”, line 27, in <module> from …metrics.scorer import check_scoring, check_multimetric_scoring File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics_init.py", line 7, in <module> from .ranking import auc File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics\ranking.py”, line 35, in <module> from …preprocessing import label_binarize File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_init.py", line 6, in <module> from .function_transformer import FunctionTransformer File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_function_transformer.py”, line 5, in <module> from …utils.testing import assert_allclose_dense_sparse File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\utils\testing.py”, line 718, in <module> import pytest File “C:\MeineProgramme\anaconda3\lib\site-packages\pytest.py”, line 6, in <module> from pytest.assertion import register_assert_rewrite File "C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion_init.py", line 7, in <module> from pytest.assertion import rewrite File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion\rewrite.py”, line 26, in <module> from pytest.assertion import util File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion\util.py”, line 8, in <module> import pytest.code File "C:\MeineProgramme\anaconda3\lib\site-packages_pytest_code_init.py", line 2, in <module> from .code import Code # noqa File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest_code\code.py”, line 23, in <module> import pluggy File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy_init.py", line 16, in <module> from .manager import PluginManager, PluginValidationError File “C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\manager.py”, line 11, in <module> import importlib_metadata File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 547, in <module> version = version(name) File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 509, in version return distribution(distribution_name).version File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 482, in distribution return Distribution.from_name(distribution_name) File “C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py”, line 183, in from_name dist = next(dists, None) 
File “C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py”, line 425, in <genexpr> for path in map(cls.switch_path, paths) File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 449, in _search_path if not root.is_dir(): File “C:\MeineProgramme\anaconda3\lib\pathlib.py”, line 1358, in is_dir return S_ISDIR(self.stat().st_mode) File “C:\MeineProgramme\anaconda3\lib\pathlib.py”, line 1168, in stat return self._accessor.stat(self) OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: ‘C:\C:\spark\jars\spark-core_2.11-2.3.4.jar’
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:336)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:475)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:458)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:290)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1661) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1649) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1648) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1648) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1882) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1831) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1820) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099) at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:363) at org.apache.spark.rdd.RDD.collect(RDD.scala:944) at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:165) at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748) Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last): File “C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py”, line 240, in main File “C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py”, line 60, in read_command File “C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py”, line 171, in read_with_length return self.loads(obj) File “C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py”, line 566, in loads return pickle.loads(obj, encoding=encoding) File “C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py”, line 14, in <module> from sklearn.model_selection import ( File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_init.py", line 19, in <module> from .validation import 
cross_val_score File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_validation.py”, line 27, in <module> from …metrics.scorer import check_scoring, check_multimetric_scoring File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics_init.py", line 7, in <module> from .ranking import auc File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics\ranking.py”, line 35, in <module> from …preprocessing import label_binarize File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_init.py", line 6, in <module> from .function_transformer import FunctionTransformer File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_function_transformer.py”, line 5, in <module> from …utils.testing import assert_allclose_dense_sparse File “C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\utils\testing.py”, line 718, in <module> import pytest File “C:\MeineProgramme\anaconda3\lib\site-packages\pytest.py”, line 6, in <module> from pytest.assertion import register_assert_rewrite File "C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion_init.py", line 7, in <module> from pytest.assertion import rewrite File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion\rewrite.py”, line 26, in <module> from pytest.assertion import util File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest\assertion\util.py”, line 8, in <module> import pytest.code File "C:\MeineProgramme\anaconda3\lib\site-packages_pytest_code_init.py", line 2, in <module> from .code import Code # noqa File “C:\MeineProgramme\anaconda3\lib\site-packages_pytest_code\code.py”, line 23, in <module> import pluggy File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy_init.py", line 16, in <module> from .manager import PluginManager, PluginValidationError File “C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\manager.py”, line 11, in <module> import importlib_metadata File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 547, in <module> version = version(name) File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 509, in version return distribution(distribution_name).version File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 482, in distribution return Distribution.from_name(distribution_name) File “C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py”, line 183, in from_name dist = next(dists, None) File “C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py”, line 425, in <genexpr> for path in map(cls.switch_path, paths) File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 449, in _search_path if not root.is_dir(): File “C:\MeineProgramme\anaconda3\lib\pathlib.py”, line 1358, in is_dir return S_ISDIR(self.stat().st_mode) File “C:\MeineProgramme\anaconda3\lib\pathlib.py”, line 1168, in stat return self._accessor.stat(self) OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: ‘C:\C:\spark\jars\spark-core_2.11-2.3.4.jar’
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:336)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:475)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:458)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:290)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more`
@progsurfer did you resolve that issue somehow? I have the same problem but with nltk and I don’t know what to do
I’m going to resolve this as it appears the sklearn upgrade helped.
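For anyone landing here later, a quick way to confirm which versions are actually in play (illustrative; the exact versions that fixed it are not recorded in this thread):

```python
# Illustrative version check; the specific versions that resolved the issue
# are not recorded in this thread.
import sklearn
import pyspark

print("scikit-learn:", sklearn.__version__)
print("pyspark:", pyspark.__version__)
```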