У меня есть поле JSON в столбце таблицы как String, которое мне нужно анализировать и анализировать с помощью Spark.
Редактировать: чтобы облегчить понимание вопроса: мои данные находятся в массиве JSON , Как мне написать схему для таких данных? Ниже приведен пример такого JSON:
[{"CategoryName":"cat5","CategoryTitle":"cat_title","CategoryLevels":"3"}]
Вот как JSON заключен в []:
[{"ProductName": " MS Quattro plan US QA "," CartProductPromotion ": null," ProductConfigPromotions ": null," ProductKey ":" PDUKVWL8XSBJUX "," IsConfigurable ": false," IsConfigured ": false," ProductProvisionType ": 0, VendorKey: 0," VendorKey ": 0 "ProductConfigSettingTemplateKey": нулевой, "ProductConfigKey": "PLK1O3JDCGVJFM", "ParentConfigKey": нулевой, "VendorConfigKey": нулевой, "Количество": 1, "PromoCodes": нулевой, "Цена": 140,0, "CustomerKey": нулевая "CustomerDomainPrefix": нулевой, "CustomerContactId": "6550d015-5ac1-464a-95b8-42ae4cfea05e", "ParentOrderLineId": нулевой, "ParentVendorSubscriptionId": нулевой, "BillingFrequency": 1, "BillingType": 1, "DueToday" : 140,0, "originalPrice": нулевой, "RemainingVendorSettingsName": нулевой, "Trial": нулевой, "Услуги": [{ "ServiceKey": "SV4RSKT6C0TAVU", "VendorServiceKey": "a044b16a-1861-4308-8086-a3a3b506fac2" , "VendorSubscriptionKey": null, "Name": "Office 365 Enterprise E5", "VendorProvisionResponse": null, "ProvisionStatus": 1, "SubscriptionStatus": 0, "BillingFrequency": 1, "CreatedOnUt c": нулевой, "UpdatedOnUt c": нулевой, "Количество": 1,0, "Цена": 120,0, "RateCardId": 1011294, "RateCardVersion": нулевой, "маржа": 100,0, "DefaultQuantity": нулевой, "Стоимость": 60,0 "ProvisionDate": нулевой, "ParentServiceKey": нулевой, "ServiceConfiguration": { "QuestionText": нулевой, "QuestionNumber" : нулевой, "IsQuestionRequired": нулевой, "OptionText": нулевой, "OptionNumber": нулевой, "MaxAllowedServices": нулевая, "NextAction": нулевой, "NextActionQuestion": нулевой}, "IsDummy": нулевой, "VendorKey": null, "EULADateTime": null}, {"ServiceKey": "SVZEOGY5DEUV29", "VendorServiceKey": "Служба поддержки Quattro", "VendorSubscriptionKey": null, "Имя": "Служба поддержки Quattro", "VendorProvisionResponse": "ProvisionStatus": 1, "SubscriptionStatus": 0, "BillingFrequency": 1, "CreatedOnUt c": нулевой, "UpdatedOnUt c": нулевой, "Количество": 1,0, "Цена": 20,0 "RateCardId ": 1011294," RateCardVersion ": нулевой," маржа ": 100,0," DefaultQuantity ": нулевой," Стоимость ": 10,0," ProvisionDate ": нулевой," ParentServiceKey ": нулевой," ServiceConfiguration ": {" Questio NTEXT ": нулевой," QuestionNumber ": нулевой," IsQuestionRequired ": нулевой," OptionText ": нулевой," OptionNumber ": нулевой," MaxAllowedServices ": нулевая," NextAction ": нулевой," NextActionQuestion ": нулевой}," IsDummy ": нулевой," VendorKey ": нулевой," EULADateTime ": нулевой}]," ProfileID ": 521," FloorPlanId ": нулевой," Валюта ":" USD», "ProductSettings": нулевой, "CustomerSettings": нулевой, «ResellerSettings»: null}, {«ProductName»: «Office 365 Enterprise E1», «CartProductPromotion»: null, «ProductConfigPromotions»: null, «ProductKey»: «PDUKVWL8XSBJUX», «IsConfigurable»: false, «IsConfigured»: false "ProductProvisionType": 0, "VendorKey": нулевой, "ProductConfigSettingTemplateKey": нулевой, "ProductConfigKey": "PL40SXO0YBS8LW", "ParentConfigKey": нулевой, "VendorConfigKey": нулевой, "Количество": 1, "PromoCodes": нулевая , "Цена": 84,0 "CustomerKey": нулевой, "CustomerDomainPrefix": нулевой, "CustomerContactId": "6550d015-5ac1-464a-95b8-42ae4cfea05e", "ParentOrderLineId": нулевой, "ParentVendorSubscriptionId": нулевой, "BillingFrequency" : 1, "BillingType": 1, "DueToday": 84,0 "originalPrice": нулевой, "RemainingVendorSettingsName": ню ll, "Trial": null, "Services": [{"ServiceKey": "SV097S8NKP3PLO", "VendorServiceKey": "Служба поддержки Quattro", "VendorSubscriptionKey": null, "Имя": "Служба поддержки Quattro", "VendorProvisionResponse" ": нулевой," ProvisionStatus ": 1," SubscriptionStatus ": 0," BillingFrequency ": 1," CreatedOnUt c ": нулевой," UpdatedOnUt c ": нулевой," Количество ": 1,0," Цена ": 44,0 "RateCardId": 1011294, "RateCardVersion": нулевой, "маржа": 100,0, "DefaultQuantity": нулевой, "Стоимость": 22. 0, "ProvisionDate": нулевой, "ParentServiceKey": нулевой, "ServiceConfiguration": { "QuestionText": нулевой, "QuestionNumber": нулевой, "IsQuestionRequired": нулевой, "OptionText": нулевой, "OptionNumber": нулевой,» MaxAllowedServices ": нулевая," NextAction ": нулевой," NextActionQuestion ": нулевой}," IsDummy ": нулевой," VendorKey ": нулевой," EULADateTime ": нулевой}, {" ServiceKey ":" SVJZL3RZL4LLZ7" , "VendorServiceKey": «91fd106f-4b2 c -4938-95a c -f54f74e9a239», «VendorSubscriptionKey»: ноль, «Имя»: «Office 365 Enterprise E1», «VendorProvisionResponse»: ноль, «ProvisionStatus»: 1, «SubscriptionSt» : 0, "BillingFrequency": 1, "CreatedOnUt c": нулевой, "UpdatedOnUt c": нулевой, "Количество": 1,0, "Цена": 40,0 "RateCardId": 1011294, "RateCardVersion": нулевая "маржа": 100,0, "DefaultQuantity": нулевой, "Стоимость": 20,0 "ProvisionDate": нулевой, "ParentServiceKey": нулевой, "ServiceConfiguration": { "QuestionText": нулевой, "QuestionNumber": нулевой, "IsQuestionRequired ": нулевой," OptionText ": нулевой," OptionNumber ": нулевой," MaxAllowedServices ": нулевая," NextAction ": нулевой," NextActionQuestion ": нулевой}," IsDummy ": нулевой," VendorKe у ": нулевой," EULADateTime ": нулевой}]," ProfileID ": 521," FloorPlanId ": нулевой," Валюта ":" USD», "ProductSettings": нулевой, "CustomerSettings": нулевой, "ResellerSettings": нулевая }]
Я написал эту схему вручную:
StructType([
StructField("ProductName", StringType()),
StructField("CartProductPromotion", StringType()),
StructField("ProductConfigPromotions", StringType()),
StructField("ProductKey", StringType()),
StructField("IsConfigurable", StringType()),
StructField("IsConfigured", StringType()),
StructField("ProductProvisionType", StringType()),
StructField("VendorKey", StringType()),
StructField("ProductConfigSettingTemplateKey", StringType()),
StructField("ProductConfigKey", StringType()),
StructField("ParentConfigKey", StringType()),
StructField("VendorConfigKey", StringType()),
StructField("Quantity", StringType()),
StructField("PromoCodes", StringType()),
StructField("Price", StringType()),
StructField("CustomerKey", StringType()),
StructField("CustomerDomainPrefix", StringType()),
StructField("CustomerContactId", StringType()),
StructField("ParentOrderLineId", StringType()),
StructField("ParentVendorSubscriptionId", StringType()),
StructField("BillingFrequency", StringType()),
StructField("BillingType", StringType()),
StructField("DueToday", StringType()),
StructField("originalPrice", StringType()),
StructField("RemainingVendorSettingsName", StringType()),
StructField("Trial", StringType()),
StructField("Services", ArrayType(StructType([
StructField("ServiceKey", StringType()),
StructField("VendorServiceKey", StringType()),
StructField("VendorSubscriptionKey", StringType()),
StructField("Name", StringType()),
StructField("VendorProvisionResponse", StringType()),
StructField("ProvisionStatus", StringType()),
StructField("SubscriptionStatus", StringType()),
StructField("BillingFrequency", StringType()),
StructField("CreatedOnUtc", StringType()),
StructField("UpdatedOnUtc", StringType()),
StructField("Quantity", StringType()),
StructField("Price", StringType()),
StructField("RateCardId", StringType()),
StructField("RateCardVersion", StringType()),
StructField("Margin", StringType()),
StructField("DefaultQuantity", StringType()),
StructField("Cost", StringType()),
StructField("ProvisionDate", StringType()),
StructField("ParentServiceKey", StringType()),
StructField("ServiceConfiguration", StructType([
StructField("QuestionText", StringType()),
StructField("QuestionNumber", StringType()),
StructField("IsQuestionRequired", StringType()),
StructField("OptionText", StringType()),
StructField("OptionNumber", StringType()),
StructField("MaxAllowedServices", StringType()),
StructField("NextAction", StringType()),
StructField("NextActionQuestion", StringType())
])),
StructField("IsDummy", StringType()),
StructField("VendorKey", StringType()),
StructField("EULADateTime", StringType())
]))),
StructField("ProfileId", StringType()),
StructField("FloorPlanId", StringType()),
StructField("Currency", StringType()),
StructField("ProductSettings", StringType()),
StructField("CustomerSettings", StringType()),
StructField("ResellerSettings", StringType()) ])
Она не анализирует мое поле в массиве, в новом поле я получаю значение Null (код внизу).
+---+--------------------+--------------+
|key| value|value_w_schema|
+---+--------------------+--------------+
| 10|[{"ProductName":"...| null|
+---+--------------------+--------------+
Но если я удалю [] из строкового поля и сохраню 1 json, завернутый в {}, тогда Схема сработает. Должен ли я обернуть написанную схему в другую структуру или массив? Может кто-нибудь указать мне хороший учебник по написанию этих схем?
Воспроизводимый код:
cart_CartProducts_schema = StructType([
StructField("ProductName", StringType()),
StructField("CartProductPromotion", StringType()),
StructField("ProductConfigPromotions", StringType()),
StructField("ProductKey", StringType()),
StructField("IsConfigurable", StringType()),
StructField("IsConfigured", StringType()),
StructField("ProductProvisionType", StringType()),
StructField("VendorKey", StringType()),
StructField("ProductConfigSettingTemplateKey", StringType()),
StructField("ProductConfigKey", StringType()),
StructField("ParentConfigKey", StringType()),
StructField("VendorConfigKey", StringType()),
StructField("Quantity", StringType()),
StructField("PromoCodes", StringType()),
StructField("Price", StringType()),
StructField("CustomerKey", StringType()),
StructField("CustomerDomainPrefix", StringType()),
StructField("CustomerContactId", StringType()),
StructField("ParentOrderLineId", StringType()),
StructField("ParentVendorSubscriptionId", StringType()),
StructField("BillingFrequency", StringType()),
StructField("BillingType", StringType()),
StructField("DueToday", StringType()),
StructField("originalPrice", StringType()),
StructField("RemainingVendorSettingsName", StringType()),
StructField("Trial", StringType()),
StructField("Services", ArrayType(StructType([
StructField("ServiceKey", StringType()),
StructField("VendorServiceKey", StringType()),
StructField("VendorSubscriptionKey", StringType()),
StructField("Name", StringType()),
StructField("VendorProvisionResponse", StringType()),
StructField("ProvisionStatus", StringType()),
StructField("SubscriptionStatus", StringType()),
StructField("BillingFrequency", StringType()),
StructField("CreatedOnUtc", StringType()),
StructField("UpdatedOnUtc", StringType()),
StructField("Quantity", StringType()),
StructField("Price", StringType()),
StructField("RateCardId", StringType()),
StructField("RateCardVersion", StringType()),
StructField("Margin", StringType()),
StructField("DefaultQuantity", StringType()),
StructField("Cost", StringType()),
StructField("ProvisionDate", StringType()),
StructField("ParentServiceKey", StringType()),
StructField("ServiceConfiguration", StructType([
StructField("QuestionText", StringType()),
StructField("QuestionNumber", StringType()),
StructField("IsQuestionRequired", StringType()),
StructField("OptionText", StringType()),
StructField("OptionNumber", StringType()),
StructField("MaxAllowedServices", StringType()),
StructField("NextAction", StringType()),
StructField("NextActionQuestion", StringType())
])),
StructField("IsDummy", StringType()),
StructField("VendorKey", StringType()),
StructField("EULADateTime", StringType())
]))),
StructField("ProfileId", StringType()),
StructField("FloorPlanId", StringType()),
StructField("Currency", StringType()),
StructField("ProductSettings", StringType()),
StructField("CustomerSettings", StringType()),
StructField("ResellerSettings", StringType())
])
data = [(10,'''[{"ProductName":"MS Quattro plan US QA","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":null,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PLK1O3JDCGVJFM","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":140.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":140.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV4RSKT6C0TAVU","VendorServiceKey":"a044b16a-1861-4308-8086-a3a3b506fac2","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E5","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":120.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":60.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello1","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVZEOGY5DEUV29","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":20.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":10.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello2","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null},{"ProductName":"Office 365 Enterprise E1","CartProductPromotion":null,"ProductConfigPromotions":null,"ProductKey":"PDUKVWL8XSBJUX","IsConfigurable":false,"IsConfigured":false,"ProductProvisionType":0,"VendorKey":null,"ProductConfigSettingTemplateKey":null,"ProductConfigKey":"PL40SXO0YBS8LW","ParentConfigKey":null,"VendorConfigKey":null,"Quantity":1,"PromoCodes":null,"Price":84.0,"CustomerKey":null,"CustomerDomainPrefix":null,"CustomerContactId":"6550d015-5ac1-464a-95b8-42ae4cfea05e","ParentOrderLineId":null,"ParentVendorSubscriptionId":null,"BillingFrequency":1,"BillingType":1,"DueToday":84.0,"originalPrice":null,"RemainingVendorSettingsName":null,"Trial":null,"Services":[{"ServiceKey":"SV097S8NKP3PLO","VendorServiceKey":"Quattro Support Service","VendorSubscriptionKey":null,"Name":"Quattro Support Service","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":44.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":22.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":"Hello3","QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null},{"ServiceKey":"SVJZL3RZL4LLZ7","VendorServiceKey":"91fd106f-4b2c-4938-95ac-f54f74e9a239","VendorSubscriptionKey":null,"Name":"Office 365 Enterprise E1","VendorProvisionResponse":null,"ProvisionStatus":1,"SubscriptionStatus":0,"BillingFrequency":1,"CreatedOnUtc":null,"UpdatedOnUtc":null,"Quantity":1.0,"Price":40.0,"RateCardId":1011294,"RateCardVersion":null,"Margin":100.0,"DefaultQuantity":null,"Cost":20.0,"ProvisionDate":null,"ParentServiceKey":null,"ServiceConfiguration":{"QuestionText":null,"QuestionNumber":null,"IsQuestionRequired":null,"OptionText":null,"OptionNumber":null,"MaxAllowedServices":null,"NextAction":null,"NextActionQuestion":null},"IsDummy":null,"VendorKey":null,"EULADateTime":null}],"ProfileId":521,"FloorPlanId":null,"Currency":"USD","ProductSettings":null,"CustomerSettings":null,"ResellerSettings":null}]''')]
df1 = spark.createDataFrame(data, ("key", "value"))
df1.show(truncate=True)
#Apply the schema to the JSON string
df2 = df1.withColumn("value_w_schema", psf.from_json(df1.value, cart_CartProducts_schema))
#df2.printSchema()
df2.show()