Вы можете использовать JSON-анализатор:
import json
from itertools import chain
from pyspark.sql.functions import udf, concat_ws
@udf("array<string>")
def parse(s):
    """Extract all values from a JSON array of objects into a flat list.

    Expects *s* to be a JSON string like
    ``[{"k1":"v1"},{"k2":"v2"}]`` and returns ``["v1", "v2"]``.
    Returns None (a NULL array in Spark) when the input is not valid
    JSON of that shape, so bad rows don't fail the job.
    """
    try:
        return list(chain.from_iterable(x.values() for x in json.loads(s)))
    except (json.JSONDecodeError, TypeError, AttributeError):
        # JSONDecodeError: malformed JSON; TypeError: s is None/non-string
        # or the top level is not iterable; AttributeError: elements are
        # not dicts (no .values()). Anything else should surface as a bug.
        return None
# One-row demo DataFrame: `catalogpath` carries a JSON array of
# single-key objects whose values are the human-readable category names.
df = spark.createDataFrame(
    [(1123798, """[{"1123798":"Other, poets"},{"1112194":" Poetry for kids"}]""")],
    ("catalogid", "catalogpath"),
)

# Run the UDF over the JSON column, then keep only the id and the
# extracted array of names.
result = (
    df
    .withColumn("catalog_desc", parse("catalogpath"))
    .select("catalogid", "catalog_desc")
)
result.show(truncate=False)
# +---------+----------------------------------+
# |catalogid|catalog_desc                      |
# +---------+----------------------------------+
# |1123798  |[Other, poets,  Poetry for kids]  |
# +---------+----------------------------------+
Если вы хотите получить одну строку вместо массива, примените `concat_ws`:
# Collapse the array column into one semicolon-separated string.
joined = result.withColumn("catalog_desc", concat_ws(";", "catalog_desc"))
joined.show(truncate=False)
# +---------+-------------------------------+
# |catalogid|catalog_desc                   |
# +---------+-------------------------------+
# |1123798  |Other, poets; Poetry for kids  |
# +---------+-------------------------------+