import org.apache.spark.ml.feature.Word2Vec

// Word2Vec: learn an embedding per word, then average them into one vector per document.
// The rows are pre-segmented documents, e.g. "傳奇 游戲 戰士" = "legend game warrior",
// "蘋果 梨 香蕉" = "apple pear banana".
val documentDF = spark.createDataFrame(Seq(
  "傳奇 游戲 戰士".split(" "),
  "蘋果 梨 香蕉".split(" "),
  "傳奇 游戲 種類多".split(" "),
  "蘋果 手機 流暢".split(" ")).map(Tuple1.apply)).toDF("text")
val word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(10).setMinCount(2) // drop words seen fewer than 2 times
val model = word2Vec.fit(documentDF)
val result = model.transform(documentDF)
result.show(false)
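The fitted Word2VecModel can also be queried directly for nearest neighbours in the learned embedding space; a minimal sketch using findSynonyms:

// Top 2 words closest to "傳奇" by cosine similarity (result columns: word, similarity)
model.findSynonyms("傳奇", 2).show(false)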
import org.apache.spark.ml.feature.Bucketizer

// Bucketizer: map continuous values into buckets defined by explicit split points.
val data = Array(-8.0, -0.5, -0.3, 0.0, 0.2, 9.0)
// n+1 splits define n buckets; the infinities catch out-of-range values.
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures").setSplits(splits)
bucketizer.transform(dataFrame).show(false)
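If the input column may contain NaN values, Bucketizer fails by default; in recent Spark versions setHandleInvalid lets you keep them in an extra bucket or drop those rows. A minimal sketch:

// "keep" puts NaN values in an additional bucket; "skip" drops the rows,
// "error" (the default) throws.
val nanSafeBucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures").setSplits(splits).setHandleInvalid("keep")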
import org.apache.spark.ml.feature.QuantileDiscretizer

// QuantileDiscretizer: like Bucketizer, but learns the split points from the data's quantiles.
val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
val df = spark.createDataFrame(data).toDF("id", "hour")
val discretizer = new QuantileDiscretizer().setInputCol("hour").setOutputCol("result").setNumBuckets(3) // aim for 3 roughly equal-frequency buckets
val result = discretizer.fit(df).transform(df)
result.show()
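fit returns an ordinary Bucketizer, so the learned quantile boundaries can be inspected directly; a minimal sketch:

// getSplits exposes the boundaries the discretizer learned from the data.
val fittedBucketizer = discretizer.fit(df)
println(fittedBucketizer.getSplits.mkString(", "))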
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.{Vector, Vectors}

// Normalizer: rescale each row vector to unit p-norm (here p = 1).
val data = Seq(Vectors.dense(-1, 1, 1, 8, 56), Vectors.dense(-1, 3, -1, -9, 88),
  Vectors.dense(0, 5, 1, 10, 96), Vectors.dense(0, 5, 1, 11, 589), Vectors.dense(0, 5, 1, 11, 688))
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0)
normalizer.transform(df).show(false)
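The norm can also be overridden per call, without building a new transformer, by passing the parameter directly to transform; a minimal sketch using the L∞ norm:

// Divide each vector by its maximum absolute entry, for this call only.
normalizer.transform(df, normalizer.p -> Double.PositiveInfinity).show(false)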
import org.apache.spark.ml.feature.{PCA, StandardScaler}
import org.apache.spark.ml.linalg.{Vector, Vectors}

val data = Array(Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
// Standardize to unit variance first so no single dimension dominates the components.
val scaledDataFrame = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").fit(df).transform(df)
// PCA: project the scaled 5-dimensional vectors onto their top 3 principal components.
val pca = new PCA().setInputCol("scaledFeatures").setOutputCol("pcaFeatures").setK(3).fit(scaledDataFrame)
val pcaDF = pca.transform(scaledDataFrame)
pcaDF.select("features", "pcaFeatures").show(false)
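Since Spark 2.0 the fitted PCAModel also reports how much variance each component captures, which helps in choosing k; a minimal sketch:

// Proportion of variance explained by each of the 3 principal components.
println(pca.explainedVariance)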
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.linalg.Vectors

val data = Seq(Vectors.dense(-1, 1, 1, 8, 56), Vectors.dense(-1, 3, -1, -9, 88),
  Vectors.dense(0, 5, 1, 10, 96), Vectors.dense(0, 5, 1, 11, 589))
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val indexer = new VectorIndexer().setInputCol("features").setOutputCol("indexed").setMaxCategories(3)
val indexerModel = indexer.fit(df)
indexerModel.transform(df).show(false)
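VectorIndexer treats any vector dimension with at most maxCategories distinct values as categorical and re-codes it; the fitted model records which dimensions those were. A minimal sketch:

// Feature indices the indexer decided are categorical (here dimensions 0, 1 and 2),
// each mapped to its (original value -> category index) dictionary.
println(indexerModel.categoryMaps.keys.toSeq.sorted.mkString(", "))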