It works fine on my machine.

Your code in IntelliJ:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object MllibError {

  val spark = SparkSession
    .builder()
    .appName("MllibError")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4") // Change to a more reasonable default number of partitions for our data
    .config("spark.app.id", "MllibError")        // To silence Metrics warning
    .getOrCreate()

  def main(args: Array[String]): Unit = {
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    output.show(truncate = false)
  }
}
Output:
+---+----+------+--------------+-------+-----------------------+
|id |hour|mobile|userFeatures |clicked|features |
+---+----+------+--------------+-------+-----------------------+
|0 |18 |1.0 |[0.0,10.0,0.5]|1.0 |[18.0,1.0,0.0,10.0,0.5]|
+---+----+------+--------------+-------+-----------------------+
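For what it's worth, the assembled "features" column is exactly what downstream MLlib estimators expect. Here is a minimal sketch of that hand-off, meant to run inside main above; the second row and the LogisticRegression stage are my own illustration, not part of your original code:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression

// Hypothetical follow-up: a second row so both label values are present,
// then a pipeline chaining the assembler into a classifier.
val training = spark.createDataFrame(Seq(
  (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
  (1, 9, 0.0, Vectors.dense(1.0, 2.0, 0.1), 0.0)
)).toDF("id", "hour", "mobile", "userFeatures", "clicked")

val lr = new LogisticRegression()
  .setFeaturesCol("features") // consumes the VectorAssembler output
  .setLabelCol("clicked")

val pipeline = new Pipeline().setStages(Array(assembler, lr))
val model = pipeline.fit(training)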
Perhaps you have a problem with dependencies or library versions. Here is my build.sbt:
name := "scala-programming-for-data-science"

version := "0.1"

scalaVersion := "2.11.8"

// https://mvnrepository.com/artifact/org.apache.spark/spark-mllib
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.2.0"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.2.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.2.0"
libraryDependencies += "org.apache.spark" %% "spark-hive" % "2.2.0"
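If it helps, you can also factor the Spark version into a single val so all four modules stay in lock-step when you upgrade. A small sketch of that variant (the val name is my own choice):

val sparkVersion = "2.2.0" // single knob for all Spark modules

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion,
  "org.apache.spark" %% "spark-hive" % sparkVersion
)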
Hope this gives you some hints. Regards.