Hi All, please find the code below.
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by satyajit on 12/7/16.
 *
 * Tokenizes three sample sentences, computes TF-IDF feature vectors for them
 * with the DataFrame-based `spark.ml` API, and wraps the resulting vectors in
 * an RDD-based `spark.mllib` [[RowMatrix]].
 */
object DIMSUMusingtf extends App {

  import org.apache.spark.ml.feature.Tokenizer
  // RowMatrix belongs to the RDD-based spark.mllib API, so the spark.ml vectors
  // produced by the feature pipeline must be converted before it can use them.
  import org.apache.spark.mllib.linalg.{Vectors => MLlibVectors}

  val conf = new SparkConf()
    .setMaster("local[1]")
    .setAppName("testColsim")
  val sc = new SparkContext(conf)
  val spark = SparkSession
    .builder
    .appName("testColSim")
    .getOrCreate()

  val sentenceData = spark.createDataFrame(Seq(
    (0, "Hi I heard about Spark"),
    (0, "I wish Java could use case classes"),
    (1, "Logistic regression models are neat")
  )).toDF("label", "sentence")

  // Split each sentence into words.
  val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
  val wordsData = tokenizer.transform(sentenceData)

  // Hash the words into a 20-dimensional term-frequency vector.
  val hashingTF = new HashingTF()
    .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
  val featurizedData = hashingTF.transform(wordsData)

  // Rescale the raw term frequencies by inverse document frequency.
  val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
  val idfModel = idf.fit(featurizedData)
  val rescaledData = idfModel.transform(featurizedData)

  rescaledData.show()
  rescaledData.select("features", "label").take(3).foreach(println)

  val check = rescaledData.select("features")
  val row = check.rdd.map(r => r.getAs[SparseVector]("features"))

  // "Cannot resolve constructor" was caused by a vector-type mismatch: the
  // "features" column holds org.apache.spark.ml.linalg vectors, whereas
  // RowMatrix's constructor takes RDD[org.apache.spark.mllib.linalg.Vector].
  // Vectors.fromML converts each spark.ml vector into its spark.mllib
  // counterpart, giving RowMatrix the element type it expects.
  val mllibRows = row.map(v => MLlibVectors.fromML(v))
  val mat = new RowMatrix(mllibRows)

  row.foreach(println)
}

Any help would be appreciated.

Regards,
Satyajit.