Учебник по написанию JSON схемы для Spark - PullRequest
0 голосов
/ 07 апреля 2020

В столбце таблицы у меня хранится JSON в виде строки (String); мне нужно разобрать (распарсить) и проанализировать его с помощью Spark.

Правка: чтобы вопрос было проще понять — мои данные представляют собой массив JSON. Как написать схему для таких данных? Ниже приведён пример такого JSON:

[{"CategoryName":"cat5","CategoryTitle":"cat_title","CategoryLevels":"3"}]

Вот как JSON заключен в []:

[{"ProductName":"MS Quattro plan US QA","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":0,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PLK1O3JDCGVJFM","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":140.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":140.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV4RSKT6C0TAVU","VendorServiceKey":"a044b16a-1861-4308-8086-a3a3b506fac2","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E5","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":120.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":60.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVZEOGY5DEUV29","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":20.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":10.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null},{"ProductName":"Office 365 Enterprise E1","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":null,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PL40SXO0YBS8LW","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":84.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":84.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV097S8NKP3PLO","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":44.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":22.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVJZL3RZL4LLZ7","VendorServiceKey":"91fd106f-4b2c-4938-95ac-f54f74e9a239","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E1","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":40.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":20.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null}]

Я написал эту схему вручную:

# Hand-written schema for ONE cart-product JSON object; every leaf value is
# declared as StringType.
# NOTE: the outermost type here is a struct (a single object), while the sample
# value shown in the question is a JSON array of such objects ("[ {...}, {...} ]").
# The question reports that from_json yields null with this schema on the
# bracketed input and works once the surrounding [] are removed.
StructType([
            # --- scalar attributes of the product ---
            StructField("ProductName", StringType()),
            StructField("CartProductPromotion", StringType()),
            StructField("ProductConfigPromotions", StringType()),
            StructField("ProductKey", StringType()),
            StructField("IsConfigurable", StringType()),
            StructField("IsConfigured", StringType()),
            StructField("ProductProvisionType", StringType()),
            StructField("VendorKey", StringType()),
            StructField("ProductConfigSettingTemplateKey", StringType()),
            StructField("ProductConfigKey", StringType()),
            StructField("ParentConfigKey", StringType()),
            StructField("VendorConfigKey", StringType()),
            StructField("Quantity", StringType()),
            StructField("PromoCodes", StringType()),
            StructField("Price", StringType()),
            StructField("CustomerKey", StringType()),
            StructField("CustomerDomainPrefix", StringType()),
            StructField("CustomerContactId", StringType()),
            StructField("ParentOrderLineId", StringType()),
            StructField("ParentVendorSubscriptionId", StringType()),
            StructField("BillingFrequency", StringType()),
            StructField("BillingType", StringType()),
            StructField("DueToday", StringType()),
            StructField("originalPrice", StringType()),
            StructField("RemainingVendorSettingsName", StringType()),
            StructField("Trial", StringType()),
            # --- nested array: one struct per service of the product ---
            StructField("Services", ArrayType(StructType([
                StructField("ServiceKey", StringType()),
                StructField("VendorServiceKey", StringType()),
                StructField("VendorSubscriptionKey", StringType()),
                StructField("Name", StringType()),
                StructField("VendorProvisionResponse", StringType()),
                StructField("ProvisionStatus", StringType()),
                StructField("SubscriptionStatus", StringType()),
                StructField("BillingFrequency", StringType()),              
                StructField("CreatedOnUtc", StringType()),
                StructField("UpdatedOnUtc", StringType()),
                StructField("Quantity", StringType()),
                StructField("Price", StringType()),

                StructField("RateCardId", StringType()),
                StructField("RateCardVersion", StringType()),
                StructField("Margin", StringType()),
                StructField("DefaultQuantity", StringType()),               
                StructField("Cost", StringType()),
                StructField("ProvisionDate", StringType()),
                StructField("ParentServiceKey", StringType()),
                # Nested object (not an array) inside each service.
                StructField("ServiceConfiguration", StructType([
                    StructField("QuestionText", StringType()),
                    StructField("QuestionNumber", StringType()),
                    StructField("IsQuestionRequired", StringType()),
                    StructField("OptionText", StringType()),
                    StructField("OptionNumber", StringType()),
                    StructField("MaxAllowedServices", StringType()),
                    StructField("NextAction", StringType()),
                    StructField("NextActionQuestion", StringType())
                ])),
                StructField("IsDummy", StringType()),
                StructField("VendorKey", StringType()),
                StructField("EULADateTime", StringType())               
            ]))),
            # --- remaining scalar attributes of the product ---
            StructField("ProfileId", StringType()),
            StructField("FloorPlanId", StringType()),
            StructField("Currency", StringType()),
            StructField("ProductSettings", StringType()),               
            StructField("CustomerSettings", StringType()),
            StructField("ResellerSettings", StringType())   ])

Эта схема не разбирает моё поле с массивом: в новом столбце я получаю null (код ниже).

+---+--------------------+--------------+
|key|               value|value_w_schema|
+---+--------------------+--------------+
| 10|[{"ProductName":"...|          null|
+---+--------------------+--------------+

Но если убрать [] из строкового поля и оставить один JSON-объект, заключённый в {}, схема срабатывает. Нужно ли мне обернуть написанную схему в ещё одну структуру или массив? Может ли кто-нибудь подсказать хороший учебник по написанию таких схем?

Воспроизводимый код:

# Schema for a single cart-product JSON object. Every leaf value is read as a
# string, and the field order follows the JSON payload.
# Presumably StructType, StructField, StringType and ArrayType come from
# pyspark.sql.types; the import is not part of this snippet.
# NOTE: the outermost type is a struct, i.e. it describes one object rather
# than the enclosing JSON array; the question reports a null result when this
# schema is applied to the bracketed input.
cart_CartProducts_schema = StructType(
    # Scalar product attributes that precede the nested "Services" array.
    [
        StructField(field_name, StringType())
        for field_name in (
            "ProductName", "CartProductPromotion", "ProductConfigPromotions",
            "ProductKey", "IsConfigurable", "IsConfigured",
            "ProductProvisionType", "VendorKey",
            "ProductConfigSettingTemplateKey", "ProductConfigKey",
            "ParentConfigKey", "VendorConfigKey", "Quantity", "PromoCodes",
            "Price", "CustomerKey", "CustomerDomainPrefix",
            "CustomerContactId", "ParentOrderLineId",
            "ParentVendorSubscriptionId", "BillingFrequency", "BillingType",
            "DueToday", "originalPrice", "RemainingVendorSettingsName",
            "Trial",
        )
    ]
    # "Services": an array holding one struct per service.
    + [
        StructField(
            "Services",
            ArrayType(
                StructType(
                    [
                        StructField(field_name, StringType())
                        for field_name in (
                            "ServiceKey", "VendorServiceKey",
                            "VendorSubscriptionKey", "Name",
                            "VendorProvisionResponse", "ProvisionStatus",
                            "SubscriptionStatus", "BillingFrequency",
                            "CreatedOnUtc", "UpdatedOnUtc", "Quantity",
                            "Price", "RateCardId", "RateCardVersion",
                            "Margin", "DefaultQuantity", "Cost",
                            "ProvisionDate", "ParentServiceKey",
                        )
                    ]
                    # Nested object (not an array) inside each service.
                    + [
                        StructField(
                            "ServiceConfiguration",
                            StructType(
                                [
                                    StructField(field_name, StringType())
                                    for field_name in (
                                        "QuestionText", "QuestionNumber",
                                        "IsQuestionRequired", "OptionText",
                                        "OptionNumber", "MaxAllowedServices",
                                        "NextAction", "NextActionQuestion",
                                    )
                                ]
                            ),
                        )
                    ]
                    + [
                        StructField(field_name, StringType())
                        for field_name in ("IsDummy", "VendorKey", "EULADateTime")
                    ]
                )
            ),
        )
    ]
    # Scalar product attributes that follow the "Services" array.
    + [
        StructField(field_name, StringType())
        for field_name in (
            "ProfileId", "FloorPlanId", "Currency",
            "ProductSettings", "CustomerSettings", "ResellerSettings",
        )
    ]
)

# Sample rows for the DataFrame: (key, JSON array serialized as a string).
# The JSON text is deliberately kept on ONE physical line: inside a
# triple-quoted literal a line break becomes a raw newline character, and a raw
# newline inside a JSON string value (here "Quattro Support Service") makes the
# document invalid JSON, so strict parsers reject it whatever the schema is.
data = [(10,'''[{"ProductName":"MS Quattro plan US QA","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":null,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PLK1O3JDCGVJFM","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":140.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":140.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV4RSKT6C0TAVU","VendorServiceKey":"a044b16a-1861-4308-8086-a3a3b506fac2","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E5","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":120.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":60.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello1","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVZEOGY5DEUV29","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":20.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":10.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello2","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null},{"ProductName":"Office 365 Enterprise E1","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":null,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PL40SXO0YBS8LW","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":84.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":84.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV097S8NKP3PLO","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":44.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":22.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello3","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVJZL3RZL4LLZ7","VendorServiceKey":"91fd106f-4b2c-4938-95ac-f54f74e9a239","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E1","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":40.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":20.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null}]''')]
# Build a two-column DataFrame (key, raw JSON string) from the sample rows.
# Presumably `spark` is an existing SparkSession and `psf` is
# pyspark.sql.functions; neither is defined in this snippet.
df1 = spark.createDataFrame(data, schema=("key", "value"))
df1.show(truncate=True)

# Parse the raw JSON string with the hand-written schema and keep the parsed
# result next to the original column.
parsed_value = psf.from_json(df1["value"], cart_CartProducts_schema)
df2 = df1.withColumn("value_w_schema", parsed_value)
# df2.printSchema()  # uncomment to inspect the resulting column types
df2.show()

1 Ответ

0 голосов
/ 08 апреля 2020

Мне удалось решить эту задачу с помощью ответа, приведённого здесь (второго по счёту): { ссылка }

Мой вопрос лучше сформулировать так: Как написать схему для массива JSON?

Проблемный JSON, точнее массив JSON (теперь, думаю, я смогу написать схему и для своего сложного JSON):

[{"CategoryName":"cat5","CategoryTitle":"cat_title","CategoryLevels":"5"}]

Решение:

# Schema for a JSON *array* of category objects: the outermost type is
# ArrayType, and each element is a struct of three string fields.
# Presumably ArrayType, StructType, StructField and StringType come from
# pyspark.sql.types; the import is not part of this snippet.
trial_sch = ArrayType(
    StructType(
        [
            StructField(field_name, StringType())
            for field_name in ("CategoryName", "CategoryTitle", "CategoryLevels")
        ]
    )
)

# One sample row: (key, JSON array serialized as a string).
data = [
    (10, '[{"CategoryName":"cat5","CategoryTitle":"cat_title","CategoryLevels":"5"}]'),
]
# Build a two-column DataFrame (key, raw JSON string) from the sample row.
# Presumably `spark` is an existing SparkSession and `psf` is
# pyspark.sql.functions; neither is defined in this snippet.
df1 = spark.createDataFrame(data, schema=("key", "value"))
df1.show(truncate=True)

# Parse the JSON string with the array schema; swap in another schema here.
parsed_value = psf.from_json(df1["value"], trial_sch)
df2 = df1.withColumn("value_w_schema", parsed_value)
# df2.printSchema()  # uncomment to inspect the resulting column types
parsed_only = df2.select("value_w_schema")
parsed_only.show(truncate=False)
...