Looking at the cause of the error ("No FileSystem for scheme: s3n"), it seems hadoop-aws-xx.jar (the one matching the Hadoop version you use) is missing from the classpath; that jar is what provides the S3 filesystem implementations.
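For example, here is a minimal sketch of the two usual ways to fix it (the hadoop-aws version shown is only an example and must match your Hadoop build; the access/secret key values are placeholders, and the snippet reuses the SparkConf/JavaSparkContext setup already in your code):

    // Option 1: add the connector at submit time (example coordinates, adjust the version):
    //   spark-submit --packages org.apache.hadoop:hadoop-aws:2.7.2 ...
    //
    // Option 2: with hadoop-aws (and its aws-java-sdk dependency) on the classpath,
    // register the s3n implementation and the credentials on the Hadoop configuration:
    SparkConf conf = new SparkConf().setAppName("jinantry").setMaster("local");
    JavaSparkContext spark = new JavaSparkContext(conf);
    org.apache.hadoop.conf.Configuration hconf = spark.hadoopConfiguration();
    hconf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    hconf.set("fs.s3n.awsAccessKeyId", "<your-access-key>");        // placeholder
    hconf.set("fs.s3n.awsSecretAccessKey", "<your-secret-key>");    // placeholder

After that, spark.textFile("s3n://...") should be able to resolve the s3n scheme.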
FYI

On Tue, Apr 26, 2016 at 9:06 AM, Jinan Alhajjaj <j.r.alhaj...@hotmail.com> wrote:

> Hi All,
> I am trying to read a file stored in Amazon S3.
> I wrote this code:
>
> import java.util.List;
> import java.util.Scanner;
>
> import org.apache.spark.SparkConf;
> import org.apache.spark.api.java.JavaRDD;
> import org.apache.spark.api.java.JavaSparkContext;
> import org.apache.spark.api.java.function.Function;
> import org.apache.spark.sql.DataFrame;
> import org.apache.spark.sql.Row;
> import org.apache.spark.sql.SQLContext;
>
> public class WordAnalysis {
>
>     public static void main(String[] args) {
>         int startYear = 0;
>         int endyear = 0;
>         Scanner input = new Scanner(System.in);
>         System.out.println("Please, Enter 1 if you want 1599-2008 or enter 2 for specific range: ");
>         int choice = input.nextInt();
>
>         if (choice == 1) {
>             startYear = 1500;
>             endyear = 2008;
>         }
>         if (choice == 2) {
>             System.out.print("please,Enter the start year : ");
>             startYear = input.nextInt();
>             System.out.print("please,Enter the end year : ");
>             endyear = input.nextInt();
>         }
>
>         SparkConf conf = new SparkConf().setAppName("jinantry").setMaster("local");
>         JavaSparkContext spark = new JavaSparkContext(conf);
>         SQLContext sqlContext = new org.apache.spark.sql.SQLContext(spark);
>
>         JavaRDD<Items> ngram = spark.textFile(
>                 "s3n://google-books-ngram/1gram/googlebooks-eng-all-1gram-20120701-x.gz")
>                 .map(new Function<String, Items>() {
>                     public Items call(String line) throws Exception {
>                         String[] parts = line.split("\t");
>                         Items item = new Items();
>                         if (parts.length == 4) {
>                             item.setWord(parts[0]);
>                             item.setYear(Integer.parseInt(parts[1]));
>                             item.setCount(Integer.parseInt(parts[2]));
>                             item.setVolume(Integer.parseInt(parts[3]));
>                             return item;
>                         } else {
>                             item.setWord(" ");
>                             item.setYear(Integer.parseInt(" "));
>                             item.setCount(Integer.parseInt(" "));
>                             item.setVolume(Integer.parseInt(" "));
>                             return item;
>                         }
>                     }
>                 });
>
>         DataFrame schemangram = sqlContext.createDataFrame(ngram, Items.class);
>         schemangram.registerTempTable("ngram");
>
>         String sql = "SELECT word,SUM(count) FROM ngram where year >=" + startYear
>                 + " AND year<=" + endyear
>                 + " And word LIKE '%_NOUN' GROUP BY word ORDER BY SUM(count) DESC";
>         DataFrame matchyear = sqlContext.sql(sql);
>         List<Row> words = matchyear.collectAsList();
>
>         int i = 1;
>         for (Row scholar : words) {
>             System.out.println(scholar);
>             if (i == 10)
>                 break;
>             i++;
>         }
>     }
> }
>
> When I run it this error appear to me:
>
> Exception in thread "main" org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
> Exchange rangepartitioning(aggOrder#5L DESC,200), None
> +- ConvertToSafe
>    +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
>       +- TungstenExchange hashpartitioning(word#2,200), None
>          +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
>             +- Project [word#2,count#0]
>                +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
>                   +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]
>
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
>     at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.ConvertToUnsafe.doExecute(rowFormatConverters.scala:38)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:64)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.Project.doExecute(basicOperators.scala:46)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
>     at org.apache.spark.sql.DataFrame.rdd$lzycompute(DataFrame.scala:1637)
>     at org.apache.spark.sql.DataFrame.rdd(DataFrame.scala:1634)
>     at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
>     at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
>     at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
>     at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
>     at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1492)
>     at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1491)
>     at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
>     at org.apache.spark.sql.DataFrame.collectAsList(DataFrame.scala:1491)
>     at WordAnalysis.main(WordAnalysis.java:60)
> Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
> TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
> +- TungstenExchange hashpartitioning(word#2,200), None
>    +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
>       +- Project [word#2,count#0]
>          +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
>             +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]
>
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
>     at org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:80)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.ConvertToSafe.doExecute(rowFormatConverters.scala:56)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:164)
>     at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
>     at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
>     ... 33 more
> Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
> TungstenExchange hashpartitioning(word#2,200), None
> +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
>    +- Project [word#2,count#0]
>       +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
>          +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]
>
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
>     at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
>     at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:86)
>     at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:80)
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
>     ... 47 more
> Caused by: java.io.IOException: No FileSystem for scheme: s3n
>     at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2584)
>     at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2591)
>     at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91)
>     at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630)
>     at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612)
>     at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370)
>     at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296)
>     at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256)
>     at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
>     at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
>     at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
>     at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
>     at scala.Option.getOrElse(Option.scala:120)
>     at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
>     at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
>     at org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:220)
>     at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
>     at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
>     at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
>     ... 55 more
>
> could any one help me in this.
>
> Thank you