Так как в вашем JSON есть символы, отличные от ASCII, вы должны сначала удалить их, чтобы использовать это решение:
def _decode_list(data):
rv = []
for item in data:
if isinstance(item, unicode):
item = item.encode('ascii', 'ignore')
elif isinstance(item, list):
item = _decode_list(item)
elif isinstance(item, dict):
item = _decode_dict(item)
rv.append(item)
return rv
def _decode_dict(data):
rv = {}
for key, value in data.iteritems():
if isinstance(key, unicode):
key = key.encode('ascii', 'ignore')
if isinstance(value, unicode):
value = value.encode('ascii', 'ignore')
elif isinstance(value, list):
value = _decode_list(value)
elif isinstance(value, dict):
value = _decode_dict(value)
rv[key] = value
return rv
# Parse the JSON file.  json.load passes every decoded JSON object to
# _decode_dict through object_hook, so the strings in the result have
# already had their non-ASCII characters removed.
with open('my_json.json', 'r') as json_file:
    json_dict = json.load(json_file, object_hook=_decode_dict)
Теперь, когда в данных остались только символы ASCII, вы можете извлечь StructType следующим образом:
# spark.read.json expects an RDD whose elements are JSON strings, so the
# dictionary is serialised with json.dumps before it is parallelised.
# Passing the raw dict would leave Spark to parse the dict's Python text
# form, which is not valid JSON for values such as True, False or None.
rdd_JSON = sc.parallelize([json.dumps(json_dict)])
df_JSON = spark.read.json(rdd_JSON)
# Keep the inferred StructType for later reuse, then print it as a tree.
schema = df_JSON.schema
df_JSON.printSchema()
Полученная схема:
StructType(List(StructField($metadata,StructType(List(StructField($dataVector,StringType,true),StructField($dataset,StringType,true),StructField($datasource,StringType,true),StructField($fileFormat,StringType,true),StructField($ingestionMode,StringType,true),StructField($nameFormat,StringType,true))),true),StructField($schema,StringType,true),StructField(description,StringType,true),StructField(id,StringType,true),StructField(properties,StructType(List(StructField(content,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(resource,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(accountEtat,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(anonymizationDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(anonymized,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ccuId,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(comptePrepaye,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(creationDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(currentSolde,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(solde,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(soldeDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringTyp
e,true))),true),StructField(id,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(rechargeCPPEncours,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(dateCGV,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateCreation,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateDerniereModification,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(defaultAddresses,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(payment,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressDetail,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressL4ExtVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4LibVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4MotVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4NumVoie,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(addressName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(appartment,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField
(building,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ceaL4,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeInseeCommune,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(country,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(isoCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(doorCode1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(doorCode2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(mascadiaError,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(poBox,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(postalCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(quartierLettre,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(service,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(soColissimoDeliveryMode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetNumber,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(t
ypeVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(contact,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(cellPhone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(company,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(email,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(gender,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(title,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(shipping,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressDetail,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressL4ExtVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4LibVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4MotVoi
e,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4NumVoie,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(addressName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(appartment,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(building,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ceaL4,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeInseeCommune,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(country,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(isoCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(doorCode1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(doorCode2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(mascadiaError,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(poBox,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(postalCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(quartierLettre,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))
),true),StructField(service,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(soColissimoDeliveryMode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetNumber,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(typeVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(contact,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(cellPhone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(company,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(email,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(gender,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(title,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true)))
,true),StructField(type,StringType,true))),true),StructField(etat,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(fraude,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(dateFraudeNiv1,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateFraudeNiv2,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateStatutFraude,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(statusFraude,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(guestFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(id,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(idGuest,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(identity,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(civility,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(dateOfBirth,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),tr
ue),StructField(mail,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(middleName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(isComptePrepaye,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(langage,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(listOfInterests,StructType(List(StructField(description,StringType,true),StructField(items,StructType(List(StructField(properties,StructType(List(StructField(description,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(uid,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(type,StringType,true))),true),StructField(marketing,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(codePromoParrain,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(isFilleul,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(nbConso,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(optins,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(infosGroupe,StructType(List(StructField(de
scription,StringType,true),StructField(type,StringType,true))),true),StructField(infosPartners,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosPhilaposte,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosSmsGroupe,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosSmsPartners,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(organization,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(codeCoclico,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeTypePorteFeuille,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(dateEcheanceDocCertif,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(enseigne,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(function,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(nomSociete,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(numCartePro,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(secteurActivite,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(siret,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(tvaIntraCommunautaire,StructType(List(StructField(description,StringType,true),StructFiel
d(type,StringType,true))),true),StructField(typeEntreprise,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(philatelist,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(abonnementCataloguePhilatelique,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(abonnementCatalogue,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(abonnementSortie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(appetencePhilatelie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(emailingPhilatelistFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(philateListFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(termsOfUse,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(testSondeFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(type,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(schema,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(context,StructType(List(StructField(properties,StructType(List(StructField(dateSentEvent,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructFie
ld($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(required,ArrayType(StringType,true),true),StructField(type,StringType,true))),true),StructField(messageId,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(operation,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(actionDate,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(operationType,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(enum,ArrayType(StringType,true),true),StructField(type,StringType,true))),true),StructField(patch,StructType(List(StructField(description,StringType,true),StructField(items,StructType(List(StructField($metadata,StructType(List(StructField(PATCH_RESOURCE_ID,StringType,true),StructField(PATCH_TARGET_SCHEMA,StringType,true))),true),StructField(properties,StructType(List(StructField(_corrupt_recordvalue,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(op,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(enum,ArrayType(StringT
ype,true),true),StructField(type,StringType,true))),true),StructField(path,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(resourceId,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true),StructField($tags,ArrayType(StructType(List(StructField(rdf:type,StringType,true),StructField(rdfs:domain,StringType,true))),true),true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(resourceType,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(self,StructType(List(StructField(commentaireVersion,StringType,true),StructField(format,StringType,true),StructField(name,StringType,true),StructField(vendor,StringType,true),StructField(version,StringType,true))),true),StructField(title,StringType,true),StructField(type,StringType,true)))