Reading from Amazon S3

Jinan Alhajjaj Tue, 26 Apr 2016 09:06:57 -0700

Hi All,

I am trying to read a file stored in Amazon S3. I wrote this code:

import java.util.List;
import java.util.Scanner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class WordAnalysis {
        public static void main(String[] args) {
            int startYear=0;
            int endyear=0;
            Scanner input = new Scanner(System.in);  
            System.out.println("Please, Enter 1 if you want 1599-2008 or enter 
2 for specific range: ");
            int choice=input.nextInt();
           
                        if(choice==1)
                        {
                                startYear=1500;
                                endyear=2008;
                        }
                        if(choice==2)
                        {
                                System.out.print("please,Enter the start year : 
");
                                startYear = input.nextInt();
                                System.out.print("please,Enter the end year : 
");
                                endyear = input.nextInt();
                        }                       
                SparkConf conf = new 
SparkConf().setAppName("jinantry").setMaster("local");             
                JavaSparkContext spark = new JavaSparkContext(conf);
                SQLContext sqlContext = new 
org.apache.spark.sql.SQLContext(spark);
                JavaRDD<Items> ngram = 
spark.textFile("s3n://google-books-ngram/1gram/googlebooks-eng-all-1gram-20120701-x.gz‏")
                                .map(new Function<String, Items>() {
                                        public Items call(String line) throws 
Exception {
                                                String[] parts = 
line.split("\t");
                                                Items item = new Items();
                                                        if (parts.length == 4) {
                                                                
item.setWord(parts[0]);
                                                                
item.setYear(Integer.parseInt(parts[1]));
                                                                
item.setCount(Integer.parseInt(parts[2]));
                                                                
item.setVolume(Integer.parseInt(parts[3]));
                                                                return item;
                                                        } else {
                                                                item.setWord(" 
");
                                                                
item.setYear(Integer.parseInt(" "));
                                                                
item.setCount(Integer.parseInt(" "));
                                                                
item.setVolume(Integer.parseInt(" "));
                                                                return item;
                                                        }
                                                }                               
                                });
                DataFrame schemangram = sqlContext.createDataFrame(ngram, 
Items.class);
                schemangram.registerTempTable("ngram");
                String sql="SELECT word,SUM(count) FROM ngram where year 
>="+startYear+" AND year<="+endyear+" And word LIKE '%_NOUN' GROUP BY word 
ORDER BY SUM(count) DESC";  
                DataFrame matchyear = sqlContext.sql(sql);
                List<Row> words=matchyear.collectAsList();
                int i=1;
            for (Row scholar : words) { 
                        System.out.println(scholar);                    
                        if(i==10)
                                break;
                        i++;
                   }



        }


}

When I run it, this error appears:

Exception in thread "main" org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange rangepartitioning(aggOrder#5L DESC,200), None
+- ConvertToSafe
   +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as 
bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
      +- TungstenExchange hashpartitioning(word#2,200), None
         +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as 
bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
            +- Project [word#2,count#0]
               +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE 
%_NOUN)
                  +- Scan ExistingRDD[count#0,volume#1,word#2,year#3] 


        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
        at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.ConvertToUnsafe.doExecute(rowFormatConverters.scala:38)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:64)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.Project.doExecute(basicOperators.scala:46)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
        at 
org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
        at org.apache.spark.sql.DataFrame.rdd$lzycompute(DataFrame.scala:1637)
        at org.apache.spark.sql.DataFrame.rdd(DataFrame.scala:1634)
        at 
org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
        at 
org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
        at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
        at 
org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
        at 
org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1492)
        at 
org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1491)
        at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
        at org.apache.spark.sql.DataFrame.collectAsList(DataFrame.scala:1491)
        at WordAnalysis.main(WordAnalysis.java:60)
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: 
execute, tree:
TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as 
bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
+- TungstenExchange hashpartitioning(word#2,200), None
   +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as 
bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
      +- Project [word#2,count#0]
         +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE 
%_NOUN)
            +- Scan ExistingRDD[count#0,volume#1,word#2,year#3] 


        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
        at 
org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:80)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.ConvertToSafe.doExecute(rowFormatConverters.scala:56)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:164)
        at 
org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
        at 
org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
        ... 33 more
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: 
execute, tree:
TungstenExchange hashpartitioning(word#2,200), None
+- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as 
bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
   +- Project [word#2,count#0]
      +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
         +- Scan ExistingRDD[count#0,volume#1,word#2,year#3] 


        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
        at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
        at 
org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:86)
        at 
org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:80)
        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
        ... 47 more
Caused by: java.io.IOException: No FileSystem for scheme: s3n
        at 
org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2584)
        at 
org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2591)
        at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91)
        at 
org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630)
        at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612)
        at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370)
        at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296)
        at 
org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256)
        at 
org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
        at 
org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
        at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
        at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
        at 
org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:220)
        at 
org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
        at 
org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
        ... 55 more

Could anyone help me with this? Thank you.

Reading from Amazon S3

Reply via email to