I want to get the feature importances of a Random Forest model, as
described in this JIRA: https://issues.apache.org/jira/browse/SPARK-5133
However, I don't see how to call this: I can't find any such method exposed on
org.apache.spark.mllib.tree.RandomForest.
How can I get featureImportances when I generate a RandomForest model in
this code?
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import util.Random
/** Prints the model's debug string (`model.toDebugString`) to standard output.
  *
  * @param model the trained random forest to describe
  */
def displayModel(model: RandomForestModel): Unit = {
  val description = model.toDebugString
  println(s"Learned classification tree model:\n$description")
}
/** Saves the forest to `path` and immediately reloads it as a round-trip check.
  *
  * Relies on a `SparkContext` named `sc` being in scope (it is not defined in
  * this snippet; presumably provided by spark-shell).
  *
  * @param model the trained random forest to persist
  * @param path  destination path handed to `model.save` and `RandomForestModel.load`
  */
def saveModel(model: RandomForestModel, path: String): Unit = {
  model.save(sc, path)
  // Reload with the loader that matches the saved model type. The previous
  // code called DecisionTreeModel.load, which is neither imported here nor
  // the right loader for a random forest.
  val sameModel = RandomForestModel.load(sc, path)
}
/** Scores the model on `testData` and prints the fraction of misclassified points.
  *
  * @param model    the trained random forest used for prediction
  * @param testData labeled points to evaluate against
  */
def testModel(model: RandomForestModel, testData: RDD[LabeledPoint]): Unit = {
  // Pair each true label with the forest's prediction for that point.
  val actualVsPredicted = testData.map(p => (p.label, model.predict(p.features)))
  // Two separate Spark actions: mismatches first, then the total row count.
  val misclassified = actualVsPredicted
    .filter { case (actual, predicted) => actual != predicted }
    .count()
  val total = testData.count()
  val testErr = misclassified.toDouble / total
  println(s"Test Error = $testErr")
}
/** Trains a random-forest classifier via `RandomForest.trainClassifier`.
  *
  * The hyperparameters used to be hard-coded inside the body; they are now
  * trailing parameters whose defaults are the original values, so existing
  * three-argument calls behave exactly as before.
  *
  * @param trainingData            labeled points to train on
  * @param numClasses              number of distinct class labels
  * @param categoricalFeaturesInfo passed straight through to `trainClassifier`;
  *                                per the Spark docs it maps a categorical feature
  *                                index to its number of categories
  * @param numTrees                number of trees in the forest (default 30)
  * @param featureSubsetStrategy   feature-subset strategy name (default "auto")
  * @param impurity                impurity measure name (default "gini")
  * @param maxDepth                maximum depth of each tree (default 4)
  * @param maxBins                 maximum number of bins (default 32)
  * @return the trained model
  */
def buildModel(
    trainingData: RDD[LabeledPoint],
    numClasses: Int,
    categoricalFeaturesInfo: Map[Int, Int],
    numTrees: Int = 30,
    featureSubsetStrategy: String = "auto",
    impurity: String = "gini",
    maxDepth: Int = 4,
    maxBins: Int = 32): RandomForestModel =
  RandomForest.trainClassifier(
    trainingData, numClasses, categoricalFeaturesInfo,
    numTrees, featureSubsetStrategy, impurity, maxDepth,
    maxBins)
// Synthetic input: the integers 0 until 1000 as an RDD.
val rdd = sc.parallelize(Range(0, 1000))
// Turn each integer x into a LabeledPoint with label x % 2 and the two
// features (x % 5, x % 7), stored as a sparse vector of (index, value) pairs.
val data = rdd.map { x =>
  val indexedValues = Seq(x % 5, x % 7)
    .zipWithIndex
    .map { case (value, index) => (index, value.toDouble) }
  LabeledPoint((x % 2).toDouble, Vectors.sparse(indexedValues.size, indexedValues))
}
// Randomly partition the data: 70% for training, 30% held out for testing.
val splits = data.randomSplit(Array(0.7, 0.3))
val trainingData = splits(0)
val testData = splits(1)
// Training parameters: the class count is the number of distinct labels in the
// data, and the categorical-feature map is left empty.
val numClasses = data.map(_.label).distinct().count().toInt
val categoricalFeaturesInfo = Map.empty[Int, Int]
// Train on the training split, then report the error on the held-out split.
val model = buildModel(trainingData, numClasses, categoricalFeaturesInfo)
testModel(model, testData)