You could also use Spark SQL:
# Spark SQL approach: read a CSV, wrap each record in a Row, and register
# the result as a temporary table so it can be queried with SQL.
# NOTE(review): `sc` (a SparkContext) is assumed to already exist in the
# session — confirm against the surrounding environment.
from pyspark.sql import Row, SQLContext
# Row factory naming the four expected CSV columns.
row = Row('id', 'C1', 'C2', 'C3')
# Split each CSV line into a list of string fields.
data = sc.textFile("test.csv").map(lambda line: line.split(','))
sqlContext = SQLContext(sc)
# Apply the Row factory to every split record, producing an RDD of Rows.
rows = data.map(lambda r: row(*r))
# NOTE(review): this line is truncated in the excerpt — presumably it ends
# with something like registerTempTable("data"); confirm against the
# original post before using.
sqlContext.inferSchema(rows).registerTempTable("da
Hi,
I will write the code in Python:
{code:title=test.py}
# Sum the C3 field (as float) per (id, C1, C2) key using reduceByKey.
# NOTE(review): `sc` (SparkContext) and `add` (presumably operator.add)
# must already be in scope — confirm against the original post.
data = sc.textFile(...).map(...) ## Please make sure that the rdd is
like[[id, c1, c2, c3], [id, c1, c2, c3],...]
# NOTE(review): the line above appears to be the wrapped tail of the
# preceding comment that lost its leading '#' during extraction — it is
# not valid Python as written.
# Key each record by its first three fields; value is the numeric fourth.
keypair = data.map(lambda l: ((l[0],l[1],l[2]), float(l[3])))
# Sum the values for each (id, C1, C2) key.
keypair = keypair.reduceByKey(add)
# NOTE(review): truncated in this excerpt — presumably this maps each
# ((id, c1, c2), total) pair back into a flat record; confirm original.
out = keypair.map(lambda l: l