Hi folks,
I am a complete beginner to Spark and the Python environment.
I have installed Spark and would like to run a code snippet named
"SpyderSetupForSpark.py":
# -*- coding: utf-8 -*-
"""
Make sure you give execute privileges
-
Spark with Python: Setup Spyder IDE for Spark
Copyright : V2 Maestros @2016
Execute this script once when Spyder is started on Windows
-
"""
import os
import sys
os.chdir(r"C:\Spark\spark-2.0.1-bin-hadoop2.7\python")
print(os.getcwd())  # confirm the working directory
# Configure the environment. Set this up to the directory where
# Spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r'C:\Spark\spark-2.0.1-bin-hadoop2.7'
# Create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']
# Add the following paths to the system path. Please check your installation
# to make sure that these zip files actually exist (a quick existence check is
# sketched after this script). The names might change as versions change.
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.3-src.zip"))
#Initiate Spark context. Once this is done all other applications can run
from pyspark import SparkContext
from pyspark import SparkConf
# Optionally configure Spark Settings
conf=SparkConf()
conf.set("spark.executor.memory", "1g")
conf.set("spark.cores.max", "2")
conf.setAppName("V2 Maestros")
# Initialize SparkContext. Run this only once; otherwise you get a
# "multiple SparkContexts" error.
sc = SparkContext('local', conf=conf)
#Test to make sure everything works.
lines=sc.textFile("auto-data.csv")
lines.count()
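For reference, the script's own comment says to check that those zip files actually exist. A quick way to confirm that (assuming the same C:\Spark install path as above, run from the same Python that Spyder uses) would be something like:

import os

# Assumes the same install location as in the script above.
SPARK_HOME = r"C:\Spark\spark-2.0.1-bin-hadoop2.7"

# Report whether each path the script adds to sys.path actually exists.
for rel in (("python",),
            ("python", "lib", "pyspark.zip"),
            ("python", "lib", "py4j-0.10.3-src.zip")):
    path = os.path.join(SPARK_HOME, *rel)
    print(path, "->", "exists" if os.path.exists(path) else "MISSING")

# If the driver and the workers end up on different Python interpreters, the
# worker can fail to start; pinning the interpreter explicitly is sometimes
# suggested (hypothetical path, adjust to the local install):
# os.environ["PYSPARK_PYTHON"] = r"C:\Python35\python.exe"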
*And when I run the script, I get a very long error trace like the one below:*
Traceback (most recent call last):
File "", line 2, in
lines.count()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 1008, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 999, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 873, in fold
vals = self.mapPartitions(func).collect()
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 776, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Spark\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0
(TID 0, localhost): org.apache.spark.SparkException: Python worker did not
connect back in time
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:138)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:67)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:114)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:128)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:133)
... 12 more
Driver stacktrace:
at org.apache.spark.sche