Hello, I'm obviously new to Spark, and I assume I'm missing something really obvious, but all my map operations run on only one processor even when the RDD has many partitions. I've tried to search for the issue, but everything seems fine: I use local[8], and my file has more than one partition (checked with _jrdd.splits().size(), and I repartitioned to make sure).
# I run my test program using the following command:
#   ./bin/spark-submit --master local[8] session_tracking_spark.py
# The code itself:

"""session_tracking_spark.py -- count parsed log lines with Spark.

Reads a pipe-delimited log file, parses each line into a
(GameSessionId, ParsedLogLine) pair, and prints the record count and
the number of partitions of the input RDD.
"""
from pyspark import SparkContext
from dateutil.parser import parse
from datetime import datetime
from datetime import timedelta
import json

if __name__ == "__main__":

    class ParsedLogLine:
        """Holds the parsed fields of a single pipe-delimited log line."""

        def __init__(self):
            self.logLineColumns = None  # raw "|"-split columns of the line
            self.logTime = None         # parsed timestamp, shifted back 3h
            self.msgType = None         # NOTE(review): never assigned anywhere -- confirm intent
            self.msgContent = None      # JSON payload decoded from column 5

    def parse_line(line):
        """Parse one log line into a ``(GameSessionId, ParsedLogLine)`` pair.

        Expects exactly 6 "|"-separated columns: column 1 is a timestamp
        (3 hours ahead of the desired zone, hence the correction) and
        column 5 is a JSON object containing "GameSessionId".

        NOTE(review): a line with fewer/more than 6 columns leaves
        ``msgContent`` as None and the final lookup raises TypeError --
        decide whether such lines should be skipped instead.
        """
        line = line.rstrip()
        results = ParsedLogLine()
        results.logLineColumns = line.split("|")
        if len(results.logLineColumns) == 6:
            # Log timestamps are 3 hours ahead of the target timezone.
            results.logTime = parse(results.logLineColumns[1]) - timedelta(hours=3)
            results.msgContent = json.loads(results.logLineColumns[5])
            results.hoplonDbLogLine = results.logLineColumns[0]
        return (results.msgContent["GameSessionId"], results)

    # logFile = "YOUR_SPARK_HOME/README.md"  # Should be some file on your system
    logFile = "/home/rdenis/full_sessions.txt"  # Should be some file on your system

    # BUG FIX (this is why only one core was used): passing "local" as the
    # master here hard-codes a single-threaded scheduler and silently
    # overrides the "--master local[8]" given to spark-submit.  Omit the
    # master so the one supplied on the command line takes effect.
    sc = SparkContext(appName="Simple App")
    logData = sc.textFile(logFile)
    countNb = logData.map(parse_line).count()
    # Single pre-formatted string: valid syntax and identical output in
    # both Python 2 and Python 3 (the original used a Py2-only print
    # statement).
    print("count: %s partition nb: %s" % (countNb, logData._jrdd.splits().size()))

# Thanks for the help in advance!
#
# --
# View this message in context:
# http://apache-spark-user-list.1001560.n3.nabble.com/No-parallelism-in-map-transformation-tp9863.html
# Sent from the Apache Spark User List mailing list archive at Nabble.com.