Spark Scala MLlib исключение: java.lang.IllegalArgumentException - PullRequest
0 голосов
/ 29 апреля 2020

Я новичок в Spark MLlib и пытаюсь выполнить приведённый ниже код Spark

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

// Build a one-row DataFrame: an id, two numeric features, a dense
// feature vector, and a label column.
val dataset = spark.createDataFrame(
  Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
).toDF("id", "hour", "mobile", "userFeatures", "clicked")

// Combine the numeric columns and the vector column into a single
// "features" vector column, as expected by Spark ML estimators.
val assembler = new VectorAssembler()
  .setInputCols(Array("hour", "mobile", "userFeatures"))
  .setOutputCol("features")

// NOTE(review): the reported IllegalArgumentException mentions a
// struct<type:tinyint,...> column — presumably the running session saw an
// org.apache.spark.mllib.linalg vector (old API) instead of the
// org.apache.spark.ml.linalg.Vectors imported above; verify that the
// Spark version on the cluster matches the compile-time dependency.
val output = assembler.transform(dataset)

Но я получаю следующее исключение

java.lang.IllegalArgumentException: Data type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> of column userFeatures is not supported.
  at org.apache.spark.ml.feature.VectorAssembler.transformSchema(VectorAssembler.scala:169)
  at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
  at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:86)
  ... 51 elided

1 Ответ

0 голосов
/ 30 апреля 2020

Он отлично работает на моей машине.

Ваш код в IntelliJ

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

/**
 * Standalone reproduction of the question's VectorAssembler example,
 * runnable locally without a cluster.
 */
object MllibError {
  // Session shared by the whole object; local master keeps it self-contained.
  val spark = SparkSession
    .builder()
    .appName("MllibError")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions","4") // Change to a more reasonable default number of partitions for our data
    .config("spark.app.id","MllibError") // To silence Metrics warning
    .getOrCreate()

  def main(args: Array[String]): Unit = {
    // Single row: id, hour, mobile, a dense feature vector, and a label.
    val df = spark
      .createDataFrame(Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)))
      .toDF("id", "hour", "mobile", "userFeatures", "clicked")

    // Fold the three input columns into one "features" vector column.
    val assembled = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")
      .transform(df)

    assembled.show(truncate = false)
  }
}

вывод

+---+----+------+--------------+-------+-----------------------+
|id |hour|mobile|userFeatures  |clicked|features               |
+---+----+------+--------------+-------+-----------------------+
|0  |18  |1.0   |[0.0,10.0,0.5]|1.0    |[18.0,1.0,0.0,10.0,0.5]|
+---+----+------+--------------+-------+-----------------------+

Возможно, у вас проблемы с зависимостями или версиями библиотек. Мой build.sbt:

name := "scala-programming-for-data-science"

version := "0.1"

scalaVersion := "2.11.10"

// Single source of truth for the Spark version: mixing Spark module versions
// is a classic cause of runtime errors like the one in the question.
val sparkVersion = "2.2.0"

// https://mvnrepository.com/artifact/org.apache.spark/spark-mllib
// %% appends the Scala binary suffix (_2.11) automatically, so every module
// is declared the same way (the original spelled spark-sql_2.11 by hand).
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % sparkVersion,
  "org.apache.spark" %% "spark-sql"   % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion,
  "org.apache.spark" %% "spark-hive"  % sparkVersion
)

Надеюсь, это даст вам некоторые подсказки. С уважением.

...