Вы можете сгруппировать фрейм данных по id
и объединить соответствующие токенизированные слова в один плоский массив перед вычислением TF-IDF. Ниже приведен фрагмент кода на основе примера TF-IDF из документации Spark:
// Toy corpus: (document id, sentence) rows; several rows share the same id.
val sample = Seq(
  1 -> "A B C D E",
  1 -> "B C D",
  1 -> "B C D E",
  2 -> "B C D F",
  2 -> "A B C",
  2 -> "B C E F G"
).toDF("id", "sentences")
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// Tokenizer lower-cases each sentence and splits it on whitespace into a "words" array.
val tokenizer = new Tokenizer().setInputCol("sentences").setOutputCol("words")
val wordsDF = tokenizer.transform(sample)

// Flatten the list of per-row word arrays (as produced by collect_list) into one array.
// Use `val` instead of `def` so the UDF object is built once rather than on every
// reference, and `.flatten` instead of the equivalent `flatMap(identity)`.
val flattenWords = udf( (s: Seq[Seq[String]]) => s.flatten )

// One row per id, with all of that id's tokens concatenated in "grouped_words".
val groupedDF = wordsDF.groupBy("id").
  agg(flattenWords(collect_list("words")).as("grouped_words"))
// Hash each grouped token array into a 20-bucket term-frequency vector.
val hashingTF = new HashingTF()
  .setInputCol("grouped_words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(20)
val featurizedData = hashingTF.transform(groupedDF)

// Fit IDF weights on the term frequencies, then rescale them into TF-IDF features.
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val rescaledData = idf.fit(featurizedData).transform(featurizedData)
rescaledData.show
// +---+--------------------+--------------------+--------------------+
// | id| grouped_words| rawFeatures| features|
// +---+--------------------+--------------------+--------------------+
// | 1|[a, b, c, d, e, b...|(20,[1,2,10,14,18...|(20,[1,2,10,14,18...|
// | 2|[b, c, d, f, a, b...|(20,[1,2,8,10,14,...|(20,[1,2,8,10,14,...|
// +---+--------------------+--------------------+--------------------+