以下是我在Linux虚拟机上运行的机器学习代码(参考自林子雨《Spark编程基础》官方网站第八章PPT)
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector,Vectors}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer,HashingTF, Tokenizer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.sql.functions;
import spark.implicits._
// Schema for one sample row: a 3-element numeric feature vector plus the car brand (a Chinese string) as the label.
// NOTE(review): Scala convention is UpperCamelCase for case classes (e.g. `Car`); spark-shell tolerates the lowercase name.
case class cars(features: org.apache.spark.ml.linalg.Vector, label:String)
// Load the tab-separated dataset: column 0 is the brand label, columns 1-3 are numeric features.
// (p(0) is already a String, so the original redundant .toString() call is dropped.)
val data = spark.sparkContext.textFile("file:///usr/local/bigdatacase/dataset/jq1.txt")
  .map(_.split("\t"))
  .map(p => cars(Vectors.dense(p(1).toDouble, p(2).toDouble, p(3).toDouble), p(0)))
  .toDF()
// Register the DataFrame so it can be queried with SQL below.
data.createOrReplaceTempView("cars")
// Keep only the three brands we want the classifier to distinguish.
val df = spark.sql("select * from cars where label ='宝马' or label ='奔驰' or label ='斯柯达'")
// Quick sanity check: print each row as "label:features".
df.map(t => s"${t(1)}:${t(0)}").collect().foreach(println)
// Map the string brand labels to numeric indices (required by the classifier); fit on df so
// the index covers every label present.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
// Index the feature vector; with the default maxCategories, low-cardinality dimensions are
// treated as categorical and the rest stay continuous.
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
// BUG FIX: the original Array(0.3, 0.7) trained on only 30% of the data and tested on 70%.
// The conventional 70/30 train/test split gives the model far more data to learn from, which
// directly helps accuracy; a fixed seed makes runs reproducible.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), seed = 1234L)
// The original regParam=0.3 with elasticNetParam=0.8 is very strong, mostly-L1 regularization;
// it tends to zero out coefficients and underfit, which depresses accuracy. A much milder
// penalty is a better starting point — ideally tune both with CrossValidator/ParamGridBuilder.
val cr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100).setRegParam(0.01).setElasticNetParam(0.8)
// Convert the numeric predictions back to the original brand-name strings.
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
// Chain label indexing, feature indexing, classification and label conversion into one pipeline.
val crPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, cr, labelConverter))
// Fit the full pipeline on the training set.
val crPipelineModel = crPipeline.fit(trainingData)
// Apply the fitted pipeline to the held-out test set.
val crPredictions = crPipelineModel.transform(testData)
// Show each test sample's true label, features, per-class probabilities and predicted label.
crPredictions.select("predictedLabel", "label", "features", "probability").collect().foreach { case Row(predictedLabel: String, label: String, features: Vector, prob: Vector) => println(s"($label, $features) --> prob=$prob, predicted Label=$predictedLabel") }
// BUG FIX: MulticlassClassificationEvaluator's default metric is "f1", not accuracy, so the
// original code reported the F1 score while naming the result crAccuracy. Request "accuracy"
// explicitly so the variable matches what is actually measured.
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val crAccuracy = evaluator.evaluate(crPredictions)
准确率是0.59多,我想提高模型的预测准确率,应该怎么做呢?
当我把val df = spark.sql("select * from cars where label ='宝马' or label ='奔驰' or label ='斯柯达'")换成val df = spark.sql("select * from cars where label ='别克' or label ='宝马' or label ='奔驰' or label ='斯柯达'")时,也就是增多一个标签,准确率只有0.41多,有没有什么办法在增加标签的同时,提高模型的预测准确率呢?