Я хочу найти повторяющуюся статью с моделью MinHash, предоставленной Spark MLlib, затем я обнаружил исключение: «Должен быть хотя бы 1 ненулевой элемент».
Я полагаю, что это вызвано val featurizedData = mh.transform(tmp).cache()
.
Но я не могу понять, что не так.
код:
val data = spark.read.format("jdbc").
option("url", "jdbc:mysql://localhost/test").
option("dbtable", "article").
option("user", "root").
option("password", "123456").load().
select("id", "content").
map(r =>
(r.getString(0), segment(r.getString(1))
).toDF("id", "content").where("content is not null").cache()
val cvModel: CountVectorizerModel = new CountVectorizer().
setInputCol("content").
setOutputCol("features").
setMinDF(2).
fit(data)
val tmp = cvModel.transform(data).cache()
val mh = new MinHashLSH().
setNumHashTables(5).
setInputCol("features").
setOutputCol("hashes").
fit(tmp)
val featurizedData = mh.transform(tmp).cache()
val res = mh.approxSimilarityJoin(featurizedData, featurizedData, 0.1, "sim").
select(col("datasetA.id").alias("idA"),
col("datasetB.id").alias("idB"),
col("sim"))
import org.apache.spark.sql.SaveMode
res.toDF().write.format("jdbc").
mode(SaveMode.Overwrite).
option("url", "jdbc:mysql://localhost/test).
option("dbtable", "article_duplicated_913").
option("user", "root").
option("password", "123456").
save()
информация об исключении:
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (vector) => array<vector>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply2_1$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:102)
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:92)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: Must have at least 1 non zero entry.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.feature.MinHashLSHModel$$anonfun$1.apply(MinHashLSH.scala:57)
at org.apache.spark.ml.feature.MinHashLSHModel$$anonfun$1.apply(MinHashLSH.scala:56)
... 36 more .
код ссылки