Вам нужно просто преобразовать данные и показать результат — вот и всё. Ниже быстрый пример; надеюсь, он поможет.
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

# Obtain the active SparkSession (or create one) explicitly, so the snippet
# also runs outside environments like the pyspark shell that predefine `spark`.
spark = SparkSession.builder.getOrCreate()

# Sample data: (id, sentence) rows to be tokenized.
df = spark.createDataFrame([
    (0, 'Hello and good day'),
    (1, 'This is a simple demostration'),
    (2, 'Natural and unnatural language processing')
], ['id', 'sentence'])
df.show(truncate=False)
# +---+-----------------------------------------+
# |id |sentence                                 |
# +---+-----------------------------------------+
# |0  |Hello and good day                       |
# |1  |This is a simple demostration            |
# |2  |Natural and unnatural language processing|
# +---+-----------------------------------------+

# Tokenizer lowercases each sentence and splits it on whitespace,
# producing an array-of-strings column named `words`.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(df)
tokenized.select('words').show(truncate=False)
# +-----------------------------------------------+
# |words                                          |
# +-----------------------------------------------+
# |[hello, and, good, day]                        |
# |[this, is, a, simple, demostration]            |
# |[natural, and, unnatural, language, processing]|
# +-----------------------------------------------+