基于XGBoost的PU-Learning
論文:Detecting positive and negative deceptive opinions using PU-learning
PU-learning是一種只有正樣本的半監督二分類方法。在實際工程應用中,有時候我們會遇到只有正樣本而沒有負樣本的分類任務,或者說負樣本是不純的,即負樣本中摻雜有部分正樣本。PU-learning提供了一種選擇可靠負樣本的機制,具體算法如下:
原始的PU-Learning
算法解釋:
    1:先用正樣本(positive)與未標注樣本(或者稱作無標簽樣本,unlabeled)訓練分類器
    2:根據訓練得到的分類器對未標注樣本進行分類
    3:把分類為負樣本的樣本作為可靠的負樣本
    4-14:把剩下的未標注樣本與正樣本再訓練分類器,不斷重復1-3過程,直至沒有更多可靠負樣本
新的PU-Learning
新的PU-Learning在原有的基礎上進行了修正,具體內容請看論文出處
基于XGBoost分類器的代碼如下:
package org.jmlab.ml
 import java.io.{File, PrintWriter}
 import ml.dmlc.xgboost4j.LabeledPoint
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost}
 import org.apache.log4j.{LogManager, Logger}
 import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.SparkSession
 /**
  * Gradual-reduction PU-learning (positive / unlabeled) built on XGBoost.
  *
  * Points labelled 1.0 are treated as positives and points labelled 0.0 as
  * unlabeled. A first classifier is trained on everything; the unlabeled points
  * it scores at or below [[prebiThreshold]] become the initial "reliable
  * negative" set. That set is then repeatedly re-trained against the positives
  * and reduced to the points the newest classifier still scores as negative.
  *
  * Created by jmzhou on 2018/9/18.
  */
 class GradualReductionPULearner {

   val log: Logger = LogManager.getLogger(getClass)

   /** Scores strictly above this value are treated as the positive class. */
   val prebiThreshold = 0.3f

   /** Spark session used by [[loadData]]; must be assigned by the caller before use. */
   var spark: SparkSession = _

   /** Number of boosting rounds passed to every `XGBoost.train` call. */
   val iterationNum = 20

   /**
    * Loads the validation set from `data/data.libsvm` into a DMatrix.
    *
    * @return the collected validation points as a DMatrix
    * @throws IllegalArgumentException if `spark` has not been assigned
    */
   def loadData(): DMatrix = {
     require(spark != null, "spark must be set before calling loadData()")
     val points = MLUtils.loadLibSVMFile(spark.sparkContext, "data/data.libsvm")
       .map { point =>
         // Convert once instead of calling toSparse for indices and values separately.
         val sparse = point.features.toSparse
         LabeledPoint(point.label.toFloat, sparse.indices, sparse.values.map(_.toFloat))
       }
       .collect()
     new DMatrix(points.iterator)
   }

   /**
    * Runs the iterative reduction of the reliable-negative set.
    *
    * Each round trains on positives + current reliable negatives and keeps only
    * the negatives that the new classifier still scores as negative. The loop
    * ends when a round removes nothing, or when the reliable negatives no longer
    * outnumber the positives.
    *
    * @param labeledPoints positives (label 1.0) and unlabeled points (label 0.0)
    * @return the last trained classifier (the initial one if no round ran, never
    *         null) and the positives followed by the final reliable negatives
    */
   def weight(labeledPoints: Array[LabeledPoint]): (Booster, Array[LabeledPoint]) = {
     val posPoint = labeledPoints.filter(_.label == 1.0f)
     val (initialClassifier, initialNegatives) = initialStep(labeledPoints)

     var classifier = initialClassifier
     var relNegPoint = initialNegatives
     var iterNum = 1
     var converged = false

     while (!converged && posPoint.length < relNegPoint.length) {
       iterNum += 1
       println("iterNum: " + iterNum)
       val posNum = posPoint.length
       val negNum = relNegPoint.length
       classifier = withDMatrix(posPoint ++ relNegPoint) { dmat =>
         XGBoost.train(dmat, getParamMap(posNum, negNum), iterationNum)
       }
       // Shrink the reliable negatives to those still classified as negative.
       // (Re-adding the survivors to the unreduced set would leave it unchanged.)
       val stillNegative = predictedNegatives(classifier, relNegPoint)
       converged = stillNegative.length == negNum
       relNegPoint = stillNegative
       println("posNum: " + posNum)
       println("relNegPoint: " + relNegPoint.length)
     }
     (classifier, posPoint ++ relNegPoint)
   }

   /**
    * Initial step: trains on all points and selects the unlabeled points that
    * are scored as negative.
    *
    * @param labeledPoints positives (label 1.0) and unlabeled points (label 0.0)
    * @return the selected points twice, as (reliable negatives, negatives)
    */
   def zeroStep(labeledPoints: Array[LabeledPoint]): (Array[LabeledPoint], Array[LabeledPoint]) = {
     val (_, negPoint) = initialStep(labeledPoints)
     (negPoint, negPoint)
   }

   /**
    * Builds the XGBoost training parameters.
    *
    * @param posNum number of positive training points
    * @param negNum number of negative / unlabeled training points
    * @return parameter map; `scale_pos_weight` is negNum / posNum, or 1.0 when
    *         there are no positives (avoids an infinite weight)
    */
   def getParamMap(posNum: Int, negNum: Int): Map[String, Any] = {
     val scalePosWeight = if (posNum > 0) negNum / posNum.toDouble else 1.0
     Map[String, Any](
       "eta" -> 0.1f,
       "scale_pos_weight" -> scalePosWeight,
       "max_depth" -> 5,
       "silent" -> 0,
       "objective" -> "binary:logistic",
       "lambda" -> 2.5,
       "rate_drop" -> 0.5,
       "alpha" -> 1
     )
   }

   /**
    * Prints AUC, the confusion matrix, precision, recall, accuracy and F1 of
    * `model` on `test_dmat`, thresholding scores with [[prebiThreshold]].
    *
    * @param spark     session used to build the RDDs the Spark metrics need
    * @param model     classifier to evaluate
    * @param test_dmat labelled evaluation data
    */
   def evaluate(spark: SparkSession, model: Booster, test_dmat: DMatrix): Unit = {
     val labels = test_dmat.getLabel.map(_.toDouble)
     val scores = model.predict(test_dmat).flatten

     val scoreAndLabels = spark.sparkContext.makeRDD(scores.map(_.toDouble).zip(labels).toSeq)
     val xgbMetrics = new BinaryClassificationMetrics(scoreAndLabels)
     println("xgboost: Area under ROC = " + xgbMetrics.areaUnderROC())

     // Same strict threshold rule as the one used while selecting negatives.
     val predicts = scores.map(score => if (isPositive(score)) 1.0 else 0.0)
     val predictedAndActual = predicts.zip(labels)

     val metrics = new MulticlassMetrics(spark.sparkContext.makeRDD(predictedAndActual.toSeq))
     println("confusionMatrix: ")
     println(metrics.confusionMatrix)

     // Counted directly rather than read from fixed confusion-matrix cells, which
     // do not exist when only one class is present.
     val tp = predictedAndActual.count { case (p, l) => p == 1.0 && l == 1.0 }.toDouble
     val fp = predictedAndActual.count { case (p, l) => p == 1.0 && l == 0.0 }.toDouble
     val fn = predictedAndActual.count { case (p, l) => p == 0.0 && l == 1.0 }.toDouble
     val precision = safeRatio(tp, tp + fp)
     val recall = safeRatio(tp, tp + fn)

     println("P: " + precision)
     println("R: " + recall)

     val f1 = safeRatio(2 * precision * recall, precision + recall)

     println("accuracy: " + metrics.accuracy)
     println("f1 score: " + f1)
     println("class 1 recall: " + metrics.recall(1.0))
     println("class 0 recall: " + metrics.recall(0.0))
   }

   /** Trains on all points and returns the classifier with the unlabeled points it scores as negative. */
   private def initialStep(labeledPoints: Array[LabeledPoint]): (Booster, Array[LabeledPoint]) = {
     val posNum = labeledPoints.count(_.label == 1.0f)
     val unLabelPoint = labeledPoints.filter(_.label == 0.0f)
     val classifier = withDMatrix(labeledPoints) { dmat =>
       XGBoost.train(dmat, getParamMap(posNum, unLabelPoint.length), iterationNum)
     }
     (classifier, predictedNegatives(classifier, unLabelPoint))
   }

   /** Returns the points of `points` that `model` does not score as positive. */
   private def predictedNegatives(model: Booster, points: Array[LabeledPoint]): Array[LabeledPoint] =
     if (points.isEmpty) points
     else withDMatrix(points) { dmat =>
       val scores = model.predict(dmat).flatten
       points.zip(scores).collect { case (point, score) if !isPositive(score) => point }
     }

   /** Single definition of the decision rule so training and evaluation agree. */
   private def isPositive(score: Float): Boolean = score > prebiThreshold

   /** Returns numerator / denominator, or 0.0 when the denominator is zero. */
   private def safeRatio(numerator: Double, denominator: Double): Double =
     if (denominator == 0.0) 0.0 else numerator / denominator

   /** Builds a DMatrix from `points`, runs `body`, and always frees the native matrix. */
   private def withDMatrix[T](points: Array[LabeledPoint])(body: DMatrix => T): T = {
     val dmat = new DMatrix(points.iterator)
     try body(dmat) finally dmat.delete()
   }

 }
總結
以上是生活随笔為你收集整理的基于XGBoost的PU-Learning的全部內容,希望文章能夠幫你解決所遇到的問題。
                            
                        - 上一篇: 推荐系统(工程方向)-策略平台
 - 下一篇: PU learning学习笔记