Вы можете использовать подход с spark.read.json
и df.rdd.map
, например:
json_string = """
{
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
}
"""
df2 = spark.createDataFrame(
[
(1, json_string),
],
['id', 'txt']
)
df2.dtypes
[('id', 'bigint'), ('txt', 'string')]
new_df = spark.read.json(df2.rdd.map(lambda r: r.txt))
new_df.printSchema()
root
|-- glossary: struct (nullable = true)
| |-- GlossDiv: struct (nullable = true)
| | |-- GlossList: struct (nullable = true)
| | | |-- GlossEntry: struct (nullable = true)
| | | | |-- Abbrev: string (nullable = true)
| | | | |-- Acronym: string (nullable = true)
| | | | |-- GlossDef: struct (nullable = true)
| | | | | |-- GlossSeeAlso: array (nullable = true)
| | | | | | |-- element: string (containsNull = true)
| | | | | |-- para: string (nullable = true)
| | | | |-- GlossSee: string (nullable = true)
| | | | |-- GlossTerm: string (nullable = true)
| | | | |-- ID: string (nullable = true)
| | | | |-- SortAs: string (nullable = true)
| | |-- title: string (nullable = true)
| |-- title: string (nullable = true)