
// 需要数据源可以留言
/**
 * Trains a Spark MLlib decision-tree classifier on a LibSVM-format file, reports the
 * error rate on a held-out split, prints the learned tree and saves the model.
 */
object DecisionTreeClassification {

  // Imported at object scope so this block is self-contained; the names below are used
  // by `main` and would otherwise be unresolved.
  import org.apache.spark.mllib.regression.LabeledPoint
  import org.apache.spark.mllib.tree.DecisionTree
  import org.apache.spark.mllib.util.MLUtils
  import org.apache.spark.rdd.RDD
  import org.apache.spark.sql.SparkSession

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Backslashes must be escaped: an unescaped "\t" is a TAB and "\h" is not a valid
    // escape sequence, so the unescaped form of this Windows path does not compile.
    System.setProperty(
      "hadoop.home.dir",
      "E:\\tools\\hadoop\\hadoop-common-2.6.0-bin-master\\hadoop-common-2.6.0-bin-master")

    // NOTE(review): the app name says "Naive_bayes" although this job trains a decision
    // tree; kept unchanged in case something keys on it -- consider renaming.
    val spark = SparkSession.builder().master("local").appName("Naive_bayes").getOrCreate()

    // try/finally guarantees the session is stopped even if loading, training or saving throws.
    try {
      val sc = spark.sparkContext

      // Load and parse the data file.
      val data = MLUtils.loadLibSVMFile(sc, "data/汽车数据样本.txt")

      // Split the data into training and test sets (30% held out for testing).
      val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array(0.7, 0.3))
      val (trainingData, testData) = (splits(0), splits(1))

      // Number of target classes.
      val numClasses = 2

      // Declares which features are categorical, as featureIndex -> number of categories.
      // Any feature not listed here is treated as continuous. Here features 0 and 1 have
      // four categories each, and features 2 and 3 have three categories each.
      val categoricalFeaturesInfo = Map[Int, Int](0 -> 4, 1 -> 4, 2 -> 3, 3 -> 3)

      // Impurity measure used to choose splits: "gini" or "entropy".
      val impurity = "entropy"

      // Pre-pruning: cap the tree depth to limit over-fitting.
      val maxDepth = 5

      // Number of bins used when discretizing continuous features; a smaller value gives
      // coarser splits, which also acts as a form of pruning.
      val maxBins = 32

      // Train the decision-tree model.
      val model = DecisionTree.trainClassifier(
        trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

      // Evaluate the model on the test instances: pair each true label with its prediction.
      val labelAndPreds: RDD[(Double, Double)] = testData.map { point =>
        (point.label, model.predict(point.features))
      }

      // Test error = fraction of test points whose prediction differs from the label.
      val misclassified = labelAndPreds.filter { case (label, prediction) => label != prediction }
      val testErr = misclassified.count().toDouble / testData.count()
      println(s"Test Error = $testErr")

      // The debug string renders the tree as a chain of nested if/else rules.
      println(s"Learned classification tree model:\n ${model.toDebugString}")

      // Save the model.
      // NOTE(review): presumably this fails when the target directory already exists
      // (e.g. on a second run) -- confirm and clear the directory beforehand if needed.
      model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
      // val sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    } finally {
      spark.stop()
    }
  }
}
// 欢迎分享,转载请注明来源:内存溢出
// 微信扫一扫
// 支付宝扫一扫
// 评论列表(0条)