Hi All, I am trying to run KMeans clustering on a large data set with 12,000 points and 80,000 dimensions. I have a Spark cluster in EC2 standalone mode with 8 workers running on 2 slaves, with 160 GB of RAM and 40 vCPUs.
*My Code is as Follows:* def convert_into_sparse_vector(A): non_nan_indices=np.nonzero(~np.isnan(A) ) non_nan_values=A[non_nan_indices] dictionary=dict(zip(non_nan_indices[0],non_nan_values)) return Vectors.sparse (len(A),dictionary) X=[convert_into_sparse_vector(A) for A in complete_dataframe.values ] sc=SparkContext(appName="parallel_kmeans") data=sc.parallelize(X,10) model = KMeans.train(data, 1000, initializationMode="k-means||") where complete_dataframe is a pandas data frame that has my data. I get the error: Py4JNetworkError: An error occurred while trying to connect to the Java server. / The error trace is as follows: > ---------------------------------------- Exception happened during > processing of request from ('127.0.0.1', 41360) Traceback (most recent > call last): File "/usr/lib64/python2.6/SocketServer.py", line 283, > in _handle_request_noblock > self.process_request(request, client_address) File > "/usr/lib64/python2.6/SocketServer.py", line 309, in process_request > self.finish_request(request, client_address) File > "/usr/lib64/python2.6/SocketServer.py", line 322, in finish_request > self.RequestHandlerClass(request, client_address, self) File > "/usr/lib64/python2.6/SocketServer.py", line 617, in __init__ > self.handle() File "/root/spark/python/pyspark/accumulators.py", > line 235, in handle > num_updates = read_int(self.rfile) File > "/root/spark/python/pyspark/serializers.py", line 544, in read_int > raise EOFError EOFError > ---------------------------------------- > --------------------------------------------------------------------------- > Py4JNetworkError Traceback (most recent call > last) <ipython-input-13-3dd00c2c5e93> in <module>() > ----> 1 model = KMeans.train(data, 1000, initializationMode="k-means||") > > /root/spark/python/pyspark/mllib/clustering.pyc in train(cls, rdd, k, > maxIterations, runs, initializationMode, seed, initializationSteps, > epsilon) > 134 """Train a k-means clustering model.""" > 135 model = 
callMLlibFunc("trainKMeansModel", > rdd.map(_convert_to_vector), k, maxIterations, > --> 136 runs, initializationMode, seed, > initializationSteps, epsilon) > 137 centers = callJavaFunc(rdd.context, model.clusterCenters) > 138 return KMeansModel([c.toArray() for c in centers]) > > /root/spark/python/pyspark/mllib/common.pyc in callMLlibFunc(name, > *args) > 126 sc = SparkContext._active_spark_context > 127 api = getattr(sc._jvm.PythonMLLibAPI(), name) > --> 128 return callJavaFunc(sc, api, *args) > 129 > 130 > > /root/spark/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, > *args) > 119 """ Call Java Function """ > 120 args = [_py2java(sc, a) for a in args] > --> 121 return _java2py(sc, func(*args)) > 122 > 123 > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > __call__(self, *args) > 534 END_COMMAND_PART > 535 > --> 536 answer = self.gateway_client.send_command(command) > 537 return_value = get_return_value(answer, > self.gateway_client, > 538 self.target_id, self.name) > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > send_command(self, command, retry) > 367 if retry: > 368 #print_exc() > --> 369 response = self.send_command(command) > 370 else: > 371 response = ERROR > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > send_command(self, command, retry) > 360 the Py4J protocol. 
> 361 """ > --> 362 connection = self._get_connection() > 363 try: > 364 response = connection.send_command(command) > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > _get_connection(self) > 316 connection = self.deque.pop() > 317 except Exception: > --> 318 connection = self._create_connection() > 319 return connection > 320 > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > _create_connection(self) > 323 connection = GatewayConnection(self.address, self.port, > 324 self.auto_close, self.gateway_property) > --> 325 connection.start() > 326 return connection > 327 > > /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in > start(self) > 430 'server' > 431 logger.exception(msg) > --> 432 raise Py4JNetworkError(msg) > 433 > 434 def close(self): > > Py4JNetworkError: An error occurred while trying to connect to the > Java server/ Please let me know if I am missing something. Thanks and Regards, Rogers Jeffrey L -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/MLIB-KMEANS-Py4JNetworkError-An-error-occurred-while-trying-to-connect-to-the-Java-server-on-a-huge-t-tp23396.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org