Unable to create a DataFrame in Apache Spark from JSON with an empty key field, given the following JSON syntax

I am unable to create an Apache Spark DataFrame from JSON that has an empty key, structured as shown below.

I tried replacing "" with "main", but then the JSON is imported as null. What is the right way to import JSON with this syntax into a DataFrame?

{ "": {"url": "https://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2018/3/27/brand-new-luxurious-1-bedroom-apartment-fo-2/"," offer ": {" price ": 43000,"@type ":" Предложение "," priceCurrency ":" AED "}," @type ":" Product "," name ":" Совершенно новый роскошный номер с 1 спальней в аренду - Al Qusais (теперь бесплатно на один месяц!) "}, "numberOfRooms": 1, "floorSize": 980, "name": "Совершенно новый роскошный номер с 1 спальней в аренду - Al Qusais (теперь один месяц бесплатно!)", "Url": "https://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2018/3/27/brand-new-luxurious-1-bedroom-apartment-fo-2/", "image": "https://dbzlpvfeeds -a.akamaihd.net / images / user_images / 2018/09/03 / 84282920_CP_photo.jpeg ", "address": {"addressLocality": "Dubai","addressRegion": "Dubai", "@type": "PostalAddress"}, "@context": "http://schema.org"," geo ": {" latitude ": 55.3889," @type ":" GeoCoordinates ","долгота": 25.2827}, "@type": "SingleFamilyResidence"}

The code, which works partially, is below:

from pyspark.sql.session import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("DFTest") \
    .config("spark.sql.warehouse.dir", "file:///C://spark-2.1.0-bin-hadoop2.7//bin") \
    .getOrCreate()

sc = spark.sparkContext

from urllib2 import urlopen as uReq  # Python 2; use urllib.request on Python 3
import re
import json
import unicodedata
from bs4 import BeautifulSoup

my_url = 'https://uae.dubizzle.com/en/property-for-rent/residential/apartmentflat/?filters=(neighborhoods.ids=123)&page=1'


uClient = uReq(my_url)
page_html = uClient.read()
page_soup = BeautifulSoup(page_html, 'lxml')

# grab the ld+json <script> blocks that describe the listings
scripts = page_soup.find_all("script", type="application/ld+json",
                             string=re.compile("SingleFamilyResidence"))
json_text = BeautifulSoup(str(scripts), 'lxml').get_text()

# normalise the text to plain ASCII before parsing
json_text_clean = unicodedata.normalize('NFKD', json_text).encode('ascii', 'ignore')
json_data = json.loads(json_text_clean)  # parse the cleaned string, not the raw one
print type(json_data)  # a list of dicts, one per listing

test=[{u'': {u'url': u'https://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2017/10/9/fully-furnished-brand-new-2-bed-room-flat--2/', u'offers': {u'price': 70000, u'@type': u'Offer', u'priceCurrency': u'AED'}, u'@type': u'Product', u'name': u'Fully Furnished  2 Bed Room Flat -Al Qusais'}, u'numberOfRooms': 2, u'floorSize': 1400, u'name': u'Fully Furnished  2 Bed Room Flat -Al Qusais', u'url': u'https://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2017/10/9/fully-furnished-brand-new-2-bed-room-flat--2/', u'image': u'https://dbzlpvfeeds-a.akamaihd.net/images/user_images/2018/09/05/84371522_CP_photo.jpeg', u'address': {u'addressLocality': u'Dubai', u'addressRegion': u'Dubai', u'@type': u'PostalAddress'}, u'@context': u'http://schema.org', u'geo': {u'latitude': 55.3959, u'@type': u'GeoCoordinates', u'longitude': 25.2959}, u'@type': u'SingleFamilyResidence'}]
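# (Sketch, not part of the original attempt: in Spark 2.x, spark.read.json also
# accepts an RDD of JSON strings, so the temp file could be skipped entirely
# once the empty key has been renamed.)
# rdd = sc.parallelize([json.dumps(d) for d in test])
# df_test = spark.read.json(rdd)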

# write a new file with one JSON object per line (the layout Spark's reader expects)
with open("flattened.json", 'w') as outfile:  # 'w' rather than 'a', so re-runs don't duplicate rows
    for d in json_data:
        # d["main"] = d.pop("")  # the rename attempt described above; it imports as null
        json.dump(d, outfile)
        outfile.write('\n')

# the file is line-delimited, so the multiLine option is not needed;
# sqlContext was never defined above, so use the SparkSession reader instead
df = spark.read.json("flattened.json")

df.printSchema()
df.show()  # display the content of the DataFrame on stdout

# df.createOrReplaceTempView("dubizzle")
# spark.sql("select DISTINCT(floorSize) from dubizzle").show()
uClient.close()
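For reference, this is roughly what I hope to run once the load works. It is only a sketch: the main.* paths assume the empty key was renamed to "main" as above, and the dotted paths are standard Spark struct access.

df = spark.read.json("flattened.json")
df.select("main.name", "main.offers.price", "floorSize").show(truncate=False)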
...