MLlib: Feature Importances API

Asim Jalis Wed, 16 Dec 2015 21:42:07 -0800

I wanted to get the feature importances for a Random Forest, as
described in this JIRA: https://issues.apache.org/jira/browse/SPARK-5133


However, I don’t see how to call this. I don't see any such method exposed on

org.apache.spark.mllib.tree.RandomForest

How can I get featureImportances when I generate a RandomForest model in
this code?

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import util.Random

// Prints the forest's debug-string rendering to standard output.
def displayModel(model:RandomForestModel) = {
  val description = model.toDebugString
  println(s"Learned classification tree model:\n$description")
}

// Saves the forest under `path`, then reads it back from the same location.
//
// model: the trained random forest to persist.
// path:  destination passed unchanged to both save and load.
//
// NOTE(review): `sc` is not defined in this snippet; presumably it is the
// SparkContext provided by spark-shell — confirm when running elsewhere.
def saveModel(model:RandomForestModel,path:String) = {
  // Save and load model.
  model.save(sc, path)
  // A forest must be reloaded with RandomForestModel.load; the previous call
  // to DecisionTreeModel.load targeted a different model type (and a class
  // this file never imports).
  val sameModel = RandomForestModel.load(sc, path)
}

// Prints the fraction of points in testData whose predicted label differs
// from the true label.
def testModel(model:RandomForestModel,testData:RDD[LabeledPoint]) = {
  // Pair each true label with the label the forest predicts for that point.
  val labelAndPreds = testData.map(point =>
    (point.label, model.predict(point.features)))
  // Number of pairs where the prediction disagrees with the label.
  val mismatches = labelAndPreds.
    filter { case (actual, predicted) => actual != predicted }.
    count
  val testErr = mismatches.toDouble / testData.count()
  println("Test Error = " + testErr)
}

// Trains a random-forest classifier on trainingData using the fixed
// hyperparameters below and returns the resulting model.
def buildModel(trainingData:RDD[LabeledPoint],
  numClasses:Int,categoricalFeaturesInfo:Map[Int,Int]) = {
  // Fixed training hyperparameters.
  val treeCount = 30
  val subsetStrategy = "auto"
  val impurityMeasure = "gini"
  val depthLimit = 4
  val binLimit = 32

  // The trained forest is the value of this block.
  RandomForest.trainClassifier(
    trainingData,
    numClasses,
    categoricalFeaturesInfo,
    treeCount,
    subsetStrategy,
    impurityMeasure,
    depthLimit,
    binLimit)
}

// Start from a plain RDD holding the integers 0 through 999.
val rdd = sc.parallelize(0 until 1000)

// Turn every integer x into a LabeledPoint: the label is x % 2 and the
// two features are x % 5 (index 0) and x % 7 (index 1).
val data = rdd.map { x =>
  val label = x % 2
  // (index, value) pairs in the form Vectors.sparse expects.
  val indexedValues = Seq(x % 5, x % 7).
    zipWithIndex.
    map { case (value, index) => (index, value.toDouble) }
  new LabeledPoint(label, Vectors.sparse(indexedValues.size, indexedValues))
}

// Randomly split the points with weights 0.7 (training) and 0.3 (test).
val splits = data.randomSplit(Array(0.7, 0.3))
val trainingData = splits(0)
val testData = splits(1)

// Training parameters: the class count is the number of distinct labels,
// and the categorical-feature map is left empty.
val numClasses = data.map(_.label).distinct.count.toInt
val categoricalFeaturesInfo = Map.empty[Int, Int]

// Train on the training split, then report the error on the test split.
val model = buildModel(trainingData, numClasses, categoricalFeaturesInfo)
testModel(model, testData)

MLlib: Feature Importances API

Reply via email to