Я только начинаю изучать Spark (PySpark 2.4.5) и сейчас пытаюсь прочитать JSON-файл и преобразовать его в фрейм данных.
Фрагмент:
# Pass 1: read the file in multi-line mode solely to obtain the schema Spark
# infers for it (nested structs and arrays included).
s = spark.read.option("multiLine", True).json("mt.json").schema
# Pass 2: load the same file again, this time applying the inferred schema
# explicitly (the multiLine option is not set for this read).
df = spark.read.schema(s).json("mt.json")
Я знаю, что могу создать схему вручную, используя StructType, но JSON, который я пытаюсь прочитать, содержит вложенные структуры.
Ниже приведена первая строка из JSON-файла:
{
"venue": {
"venue_name": "Datong High School",
"lon": 0,
"lat": 0,
"venue_id": 23779799
},
"visibility": "public",
"response": "no",
"guests": 0,
"member": {
"member_id": 120119272,
"photo": "http:\/\/photos3.meetupstatic.com\/photos\/member\/b\/2\/b\/c\/thumb_262125756.jpeg",
"member_name": "Allen Wang"
},
"rsvp_id": 1658733801,
"mtime": 1489925470960,
"event": {
"event_name": "Play Intermediate Volleyball",
"event_id": "jkpwmlywgbmb",
"time": 1491613200000,
"event_url": "https:\/\/www.meetup.com\/Taipei-Sports-and-Social-Club\/events\/236786445\/"
},
"group": {
"group_topics": [{
"urlkey": "fitness",
"topic_name": "Fitness"
}, {
"urlkey": "mountain-biking",
"topic_name": "Mountain Biking"
}, {
"urlkey": "sports",
"topic_name": "Sports and Recreation"
}, {
"urlkey": "outdoors",
"topic_name": "Outdoors"
}, {
"urlkey": "fun-times",
"topic_name": "Fun Times"
}, {
"urlkey": "winter-and-summer-sports",
"topic_name": "Winter and Summer Sports"
}, {
"urlkey": "adventure",
"topic_name": "Adventure"
}, {
"urlkey": "water-sports",
"topic_name": "Water Sports"
}, {
"urlkey": "sports-and-socials",
"topic_name": "Sports and Socials"
}, {
"urlkey": "hiking",
"topic_name": "Hiking"
}, {
"urlkey": "excercise",
"topic_name": "Exercise"
}, {
"urlkey": "recreational-sports",
"topic_name": "Recreational Sports"
}],
"group_city": "Taipei",
"group_country": "tw",
"group_id": 16585312,
"group_name": "Taipei Sports and Social Club",
"group_lon": 121.45,
"group_urlname": "Taipei-Sports-and-Social-Club",
"group_lat": 25.02
}
}
Вызов df.first() возвращает:
Row(event=Row(event_id='jkpwmlywgbmb', event_name='Play Intermediate Volleyball', event_url='https://www.meetup.com/Taipei-Sports-and-Social-Club/events/236786445/', time=1491613200000), group=Row(group_city='Taipei', group_country='tw', group_id=16585312, group_lat=25.02, group_lon=121.45, group_name='Taipei Sports and Social Club', group_topics=[Row(topic_name='Fitness', urlkey='fitness'), Row(topic_name='Mountain Biking', urlkey='mountain-biking'), Row(topic_name='Sports and Recreation', urlkey='sports'), Row(topic_name='Outdoors', urlkey='outdoors'), Row(topic_name='Fun Times', urlkey='fun-times'), Row(topic_name='Winter and Summer Sports', urlkey='winter-and-summer-sports'), Row(topic_name='Adventure', urlkey='adventure'), Row(topic_name='Water Sports', urlkey='water-sports'), Row(topic_name='Sports and Socials', urlkey='sports-and-socials'), Row(topic_name='Hiking', urlkey='hiking'), Row(topic_name='Exercise', urlkey='excercise'), Row(topic_name='Recreational Sports', urlkey='recreational-sports')], group_urlname='Taipei-Sports-and-Social-Club'), guests=0, member=Row(member_id=120119272, member_name='Allen Wang', photo='http://photos3.meetupstatic.com/photos/member/b/2/b/c/thumb_262125756.jpeg'), mtime=1489925470960, response='no', rsvp_id=1658733801, venue=Row(lat=0, lon=0, venue_id=23779799, venue_name='Datong High School'), visibility='public')
Но мне кажется, что этот подход не самый удачный. Есть ли другой способ прочитать JSON в фрейм данных?