I am using pyspark spark-1.6.1-bin-hadoop2.6 and python3. I have a data frame with a column I need to convert to a sparse vector. I get an exception
Any idea what my bug is? Kind regards Andy Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext. : java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522) at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204) Here is my python code fragment with a more complete stack trace # load data set from pyspark.sql import HiveContext #,SQLContext, Row # window functions require HiveContext (spark 2.x will not require hive) #sqlContext = SQLContext(sc) hiveSqlContext = HiveContext(sc) import numpy as np from pyspark.mllib.linalg import Vectors from pyspark.mllib.linalg import VectorUDT #sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0]) # = 3 = size # [0,1] int indices #[1.0, 3.0] values """ root |-- id: string (nullable = true) |-- samples: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- id: long (nullable = false) | | |-- rateStr: string (nullable = false) """ def toSparseVector(pojoList) : indicies = [] for pojo in pojoList : indicies.append(pojo.id) l = np.ones(len(indicies)) v = Vectors.spark(numDimensions, indicies, l) return v myUDF = udf(toSparseVector, VectorUDT()) features = df.withColumn(newColName, myUDF(df["samples"])) Py4JJavaError Traceback (most recent call last) <ipython-input-77-30ab820130a0> in <module>() 30 #myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else "injury", StringType()) 31 ---> 32 myUDF = udf(toSparseVector, VectorUDT()) # 33 features = df.withColumn(newColName, myUDF(df["follows"])) /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in udf(f, returnType) 1595 [Row(slen=5), Row(slen=3)] 1596 """ -> 1597 return UserDefinedFunction(f, returnType) 1598 1599 blacklist = ['map', 'since', 'ignore_unicode_prefix'] 
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in __init__(self, func, returnType, name) 1556 self.returnType = returnType 1557 self._broadcast = None -> 1558 self._judf = self._create_judf(name) 1559 1560 def _create_judf(self, name): /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in _create_judf(self, name) 1567 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) 1568 ctx = SQLContext.getOrCreate(sc) -> 1569 jdt = ctx._ssql_ctx.parseDataType(self.returnType.json()) 1570 if name is None: 1571 name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__ /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _ssql_ctx(self) 681 try: 682 if not hasattr(self, '_scala_HiveContext'): --> 683 self._scala_HiveContext = self._get_hive_ctx() 684 return self._scala_HiveContext 685 except Py4JError as e: /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _get_hive_ctx(self) 690 691 def _get_hive_ctx(self): --> 692 return self._jvm.HiveContext(self._jsc.sc()) 693 694 def refreshTable(self, tableName): /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 1062 answer = self._gateway_client.send_command(command) 1063 return_value = get_return_value( -> 1064 answer, self._gateway_client, None, self._fqn) 1065 1066 for temp_arg in temp_args: /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw) 43 def deco(*a, **kw): 44 try: ---> 45 return f(*a, **kw) 46 except py4j.protocol.Py4JJavaError as e: 47 s = e.java_exception.toString() /Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 306 raise Py4JJavaError( 307 "An error occurred while calling {0}{1}{2}.\n". 
--> 308 format(target_id, ".", name), value) 309 else: 310 raise Py4JError( Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext. : java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522) at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:422) at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:249) at org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:327) at org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:237) at org.apache.spark.sql.hive.HiveContext.setConf(HiveContext.scala:441) at org.apache.spark.sql.hive.HiveContext.defaultOverrides(HiveContext.scala:226) at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:229) at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)