import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CountDistinct {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("CountDistinct");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        List<Tuple2<String, String>> list = new ArrayList<Tuple2<String, String>>();
        list.add(new Tuple2<String, String>("key1", "val1"));
        list.add(new Tuple2<String, String>("key1", "val1"));
        list.add(new Tuple2<String, String>("key2", "val2"));
        list.add(new Tuple2<String, String>("key2", "val2"));
        list.add(new Tuple2<String, String>("key2", "val22"));

        // Stage 1: use the whole (key, value) pair as the key, so the reduce
        // collapses duplicates -- each distinct pair survives exactly once.
        JavaPairRDD<Tuple2<String, String>, Integer> distinctPairs = jsc.parallelize(list)
                .mapToPair(t -> new Tuple2<Tuple2<String, String>, Integer>(t, 1))
                .reduceByKey((c1, c2) -> c1 + c2);

        // Stage 2: map each surviving pair back to its original key with a
        // count of 1 and sum, giving count(distinct(value)) per key.
        JavaPairRDD<String, Integer> counts = distinctPairs
                .mapToPair(t -> new Tuple2<String, Integer>(t._1()._1(), 1))
                .reduceByKey((c1, c2) -> c1 + c2);

        for (Tuple2<String, Integer> tuple : counts.collect()) {
            System.out.println(tuple._1() + " : " + tuple._2());
        }

        jsc.stop();
    }
}
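Since you'll be running this on a windowed DStream, the same computation can be applied per window with transformToPair. A minimal sketch, assuming an existing JavaPairDStream<String, String> named pairs (hypothetical, as are the window and slide durations); here JavaPairRDD.distinct() does the deduplication that the composite-key reduceByKey does above:

import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;

// pairs is assumed to be a JavaPairDStream<String, String> built elsewhere.
JavaPairDStream<String, Integer> windowedCounts = pairs
        .window(Durations.seconds(30), Durations.seconds(10)) // placeholder durations
        .transformToPair(rdd -> rdd
                .distinct()                                   // drop duplicate (key, value) pairs
                .mapToPair(t -> new Tuple2<String, Integer>(t._1(), 1))
                .reduceByKey((c1, c2) -> c1 + c2));

Each batch would then carry the distinct-value count per key for the current window.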
On Sun, Jul 19, 2015 at 2:28 PM, Jerry Lam <chiling...@gmail.com> wrote:

> You mean this does not work?
>
> SELECT key, count(value) from table group by key
>
>
> On Sun, Jul 19, 2015 at 2:28 PM, N B <nb.nos...@gmail.com> wrote:
>
>> Hello,
>>
>> How do I go about performing the equivalent of the following SQL clause
>> in Spark Streaming? I will be using this on a Windowed DStream.
>>
>> SELECT key, count(distinct(value)) from table group by key;
>>
>> so for example, given the following dataset in the table:
>>
>>  key | value
>> -----+-------
>>  k1  | v1
>>  k1  | v1
>>  k1  | v2
>>  k1  | v3
>>  k1  | v3
>>  k2  | vv1
>>  k2  | vv1
>>  k2  | vv2
>>  k2  | vv2
>>  k2  | vv2
>>  k3  | vvv1
>>  k3  | vvv1
>>
>> the result will be:
>>
>>  key | count
>> -----+-------
>>  k1  | 3
>>  k2  | 2
>>  k3  | 1
>>
>> Thanks
>> Nikunj