Данные можно найти по ссылке ниже: это относительно небольшой JSON-файл, который я нашел на GitHub. Я пытаюсь найти лучший способ преобразовать его во фрейм данных, который затем можно было бы анализировать (1 строка на эпизод).
http://api.tvmaze.com/singlesearch/shows?q=black-mirror&embed=episodes
Ниже приведены код чтения файла и схема фрейма данных, полученного после вызова spark.read.json():
# Input: where the uploaded file lives and which reader format to use
file_location = "/FileStore/tables/blackmirror.json"
file_type = "json"

# Settings taken from the CSV template
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# Gather the reader options in one mapping. Per the original template note,
# these options are meant for CSV files and are ignored for other file types.
csv_options = {
    "inferSchema": infer_schema,
    "header": first_row_is_header,
    "sep": delimiter,
}

# Build the reader step by step, then load the file into a DataFrame
reader = spark.read.format(file_type).options(**csv_options)
df = reader.load(file_location)

# Show the inferred schema of the loaded DataFrame
df.printSchema()
|-- _embedded: struct (nullable = true)
| |-- episodes: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- _links: struct (nullable = true)
| | | | |-- self: struct (nullable = true)
| | | | | |-- href: string (nullable = true)
| | | |-- airdate: string (nullable = true)
| | | |-- airstamp: string (nullable = true)
| | | |-- airtime: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- image: struct (nullable = true)
| | | | |-- medium: string (nullable = true)
| | | | |-- original: string (nullable = true)
| | | |-- name: string (nullable = true)
| | | |-- number: long (nullable = true)
| | | |-- runtime: long (nullable = true)
| | | |-- season: long (nullable = true)
| | | |-- summary: string (nullable = true)
| | | |-- url: string (nullable = true)
|-- _links: struct (nullable = true)
| |-- previousepisode: struct (nullable = true)
| | |-- href: string (nullable = true)
| |-- self: struct (nullable = true)
| | |-- href: string (nullable = true)
|-- externals: struct (nullable = true)
| |-- imdb: string (nullable = true)
| |-- thetvdb: long (nullable = true)
| |-- tvrage: long (nullable = true)
|-- genres: array (nullable = true)
| |-- element: string (containsNull = true)
|-- id: long (nullable = true)
|-- image: struct (nullable = true)
| |-- medium: string (nullable = true)
| |-- original: string (nullable = true)
|-- language: string (nullable = true)
|-- name: string (nullable = true)
|-- network: string (nullable = true)
|-- officialSite: string (nullable = true)
|-- premiered: string (nullable = true)
|-- rating: struct (nullable = true)
| |-- average: double (nullable = true)
|-- runtime: long (nullable = true)
|-- schedule: struct (nullable = true)
| |-- days: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- time: string (nullable = true)
|-- status: string (nullable = true)
|-- summary: string (nullable = true)
|-- type: string (nullable = true)
|-- updated: long (nullable = true)
|-- url: string (nullable = true)
|-- webChannel: struct (nullable = true)
| |-- country: string (nullable = true)
| |-- id: long (nullable = true)
| |-- name: string (nullable = true)
|-- weight: long (nullable = true)
Вызов df.count() возвращает 1, то есть во фрейме данных всего одна строка. Я хотел бы получить по одной строке на каждый элемент массива (эпизод). Судя по некоторым похожим ответам, я могу использовать функцию explode() из sql.functions, чтобы развернуть массив в строки фрейма данных, но хотел бы знать лучший способ сделать это без потери информации.