Here is how you would read from Google Cloud Storage (note: you need to create a service account key):
os.environ['PYSPARK_SUBMIT_ARGS'] = """--jars /home/neil/Downloads/gcs-connector-latest-hadoop2.jar pyspark-shell""" from pyspark import SparkContext, SparkConf from pyspark.sql import SparkSession, SQLContext conf = SparkConf()\ .setMaster("local[8]")\ .setAppName("GS") sc = SparkContext(conf=conf) sc._jsc.hadoopConfiguration().set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") sc._jsc.hadoopConfiguration().set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") sc._jsc.hadoopConfiguration().set("fs.gs.project.id", "PUT UR GOOGLE PROJECT ID HERE") sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.email", "testa...@sparkgcs.iam.gserviceaccount.com") sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.enable", "true") sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.keyfile", "sparkgcs-96bd21691c29.p12") spark = SparkSession.builder\ .config(conf=sc.getConf())\ .getOrCreate() dfTermRaw = spark.read.format("csv")\ .option("header", "true")\ .option("delimiter" ,"\t")\ .option("inferSchema", "true")\ .load("gs://bucket_test/sample.tsv") -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Spark-Read-from-Google-store-and-save-in-AWS-s3-tp28278p28286.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe e-mail: user-unsubscr...@spark.apache.org