/**
 * Trains a Spark MLlib decision-tree classifier on a LibSVM-formatted file,
 * reports the error rate on a held-out split, prints the learned tree and
 * saves the model to disk.
 */
object DecisionTreeClassification {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Backslashes must be escaped in a Scala string literal; written with single
    // backslashes, "\t" becomes a TAB and "\h" is rejected by the compiler.
    System.setProperty(
      "hadoop.home.dir",
      "E:\\tools\\hadoop\\hadoop-common-2.6.0-bin-master\\hadoop-common-2.6.0-bin-master")

    // NOTE(review): the app name says "Naive_bayes" although this job trains a
    // decision tree — looks like a copy-paste leftover; left unchanged here.
    val spark = SparkSession.builder().master("local").appName("Naive_bayes").getOrCreate()

    // Stop the session even when loading, training or saving throws.
    try {
      val sc = spark.sparkContext

      // Load and parse the data file.
      val data = MLUtils.loadLibSVMFile(sc, "data/汽车数据样本.txt")

      // Split the data into training and test sets (30% held out for testing).
      val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array(0.7, 0.3))
      val (trainingData, testData) = (splits(0), splits(1))

      // Train a DecisionTree model.

      // Number of target classes.
      val numClasses = 2

      // Categorical (already discretised) features, as featureIndex -> number of
      // categories. Features that are not listed are treated as continuous.
      // Here features 0 and 1 have four categories, features 2 and 3 have three.
      val categoricalFeaturesInfo = Map[Int, Int](0 -> 4, 1 -> 4, 2 -> 3, 3 -> 3)

      // Impurity measure: "gini" (Gini index) or "entropy".
      val impurity = "entropy"

      // Pre-pruning: cap the tree depth to limit over-fitting.
      val maxDepth = 5

      // Number of bins used to discretise continuous features; a smaller value
      // coarsens the candidate splits and so also acts as a form of pruning.
      val maxBins = 32

      val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
        impurity, maxDepth, maxBins)

      // Evaluate the model on the test instances and compute the test error.
      val labelAndPreds: RDD[(Double, Double)] = testData.map { point =>
        val prediction = model.predict(point.features)
        (point.label, prediction)
      }
      val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / testData.count()
      println(s"Test Error = $testErr")

      // A decision tree is essentially a chain of if/else tests.
      println(s"Learned classification tree model:\n ${model.toDebugString}")

      // Save and load model.
      // NOTE(review): presumably this fails when the target directory already
      // exists from an earlier run — confirm and clean it up if re-runs are needed.
      model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
      // val sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    } finally {
      spark.stop()
    }
  }