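Я думаю, что комбинация `Tokenizer` и `explode` должна сработать: `Tokenizer` разбивает текст столбца `Description` на массив слов, а `explode` разворачивает этот массив в отдельные строки — по одному токену на строку. Решение приведено ниже: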
scala> val data = spark.read.format("csv").option("delimiter", "\t").schema(schema).load("plot_summaries.txt")
data: org.apache.spark.sql.DataFrame = [DocumentID: bigint, Description: string]
scala> data.show(1)
+----------+--------------------+
|DocumentID| Description|
+----------+--------------------+
| 23890098|Shlykov, a hard-w...|
+----------+--------------------+
only showing top 1 row
scala> import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.functions.explode
scala> import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.Tokenizer
scala> val tokenizer = new Tokenizer().setInputCol("Description").setOutputCol("Words")
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_80d1c6e72cbc
scala> val wordsData = tokenizer.transform(data)
wordsData: org.apache.spark.sql.DataFrame = [DocumentID: bigint, Description: string ... 1 more field]
scala> wordsData.show(1)
+----------+--------------------+--------------------+
|DocumentID| Description| Words|
+----------+--------------------+--------------------+
| 23890098|Shlykov, a hard-w...|[shlykov,, a, har...|
+----------+--------------------+--------------------+
only showing top 1 row
scala> val newWordsData = wordsData.drop("Description")
newWordsData: org.apache.spark.sql.DataFrame = [DocumentID: bigint, Words: array<string>]
scala> newWordsData.show(1)
+----------+--------------------+
|DocumentID| Words|
+----------+--------------------+
| 23890098|[shlykov,, a, har...|
+----------+--------------------+
only showing top 1 row
scala> val flattened = newWordsData.withColumn("token",explode($"Words"))
flattened: org.apache.spark.sql.DataFrame = [DocumentID: bigint, Words: array<string> ... 1 more field]
scala> flattened.show
+----------+--------------------+-------------+
|DocumentID| Words| token|
+----------+--------------------+-------------+
| 23890098|[shlykov,, a, har...| shlykov,|
| 23890098|[shlykov,, a, har...| a|
| 23890098|[shlykov,, a, har...| hard-working|
| 23890098|[shlykov,, a, har...| taxi|
| 23890098|[shlykov,, a, har...| driver|
| 23890098|[shlykov,, a, har...| and|
| 23890098|[shlykov,, a, har...| lyosha,|
| 23890098|[shlykov,, a, har...| a|
| 23890098|[shlykov,, a, har...| saxophonist,|
| 23890098|[shlykov,, a, har...| develop|
| 23890098|[shlykov,, a, har...| a|
| 23890098|[shlykov,, a, har...| bizarre|
| 23890098|[shlykov,, a, har...| love-hate|
| 23890098|[shlykov,, a, har...|relationship,|
| 23890098|[shlykov,, a, har...| and|
| 23890098|[shlykov,, a, har...| despite|
| 23890098|[shlykov,, a, har...| their|
| 23890098|[shlykov,, a, har...| prejudices,|
| 23890098|[shlykov,, a, har...| realize|
| 23890098|[shlykov,, a, har...| they|
+----------+--------------------+-------------+
only showing top 20 rows
Дайте мне знать, если это поможет!