как я могу изменить код ниже, я почти без ума от spark sql
Seq [Any] (meta.size, sortedIndex) это покажет ошибку ниже
val dummyHotSparse = (line: String) => {
val splits = line.split(",")
val index = ArrayBuffer[Int]()
val value = ArrayBuffer[Double]()
for(i <- 0 until splits.length){
if(meta.contains(splits(i))){
index += meta(splits(i))
value += 1
}
}
// pay attention!!!, we can only sort the index, without sort the value, as all the value are 1
val sortedIndex = Sorting.stableSort(index)
(meta.size, sortedIndex, value)
**Seq[Any](meta.size, sortedIndex)**
}
meta.foreach(println)
val upperUDF = functions.udf(dummyHotSparse)
val newFeature = mycol + "_indexed"
var x = df.withColumn(newFeature, upperUDF(functions.col(mycol)))
я получаю ошибку ниже:
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:780)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:715)
at scala.reflect.internal.tpe.TypeConstraints$UndoLog.undo(TypeConstraints.scala:55)
at org.apache.spark.sql.catalyst.ScalaReflection$class.cleanUpReflectionObjects(ScalaReflection.scala:824)
at org.apache.spark.sql.catalyst.ScalaReflection$.cleanUpReflectionObjects(ScalaReflection.scala:39)
at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:714)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:736)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:715)
at scala.reflect.internal.tpe.TypeConstraints$UndoLog.undo(TypeConstraints.scala:55)
at org.apache.spark.sql.catalyst.ScalaReflection$class.cleanUpReflectionObjects(ScalaReflection.scala:824)
at org.apache.spark.sql.catalyst.ScalaReflection$.cleanUpReflectionObjects(ScalaReflection.scala:39)
at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:714)
at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:711)
at org.apache.spark.sql.functions$.udf(functions.scala:3398)
at org.FeaturesExtraction.Util$.dummyHotFeature(Util.scala:81)
at org.FeaturesExtraction.Util$.run(Util.scala:131)
at org.FeaturesExtraction.Util$.main(Util.scala:140)
at org.FeaturesExtraction.Util.main(Util.scala)
Любые советы приветствуются, спасибо ~