Считайте структуру, вложенную в файл JSON, в Spark Dataframe в Python, используя pyspark - PullRequest
0 голосов
/ 22 января 2020

У меня есть следующее JSON, из которого мне нужно извлечь структуры full_address_bucket__ (представленные в ['hits']['hits']['_source']) в информационный фрейм Spark со схемой full_address_bucket__, сохраненной с использованием библиотеки pyspark

{
  "took" : 436,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 1194393,
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUKOL1964690000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
          "establishment_id" : "PUKOL1964690000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
            "address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
            "city" : "kolhapur",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : null,
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
            "pin" : "416109",
            "untagged" : "gat no 329, tardal, hatkanangle",
            "street" : null,
            "district" : "kolhapur",
            "complex" : null,
            "house_unsure" : "329",
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      },
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUPUN0002003000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
          "establishment_id" : "PUPUN0002003000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
            "address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
            "city" : "lonavala",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : null,
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
            "pin" : "410405",
            "untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
            "street" : null,
            "district" : "pune",
            "complex" : null,
            "house_unsure" : "228 228",
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      },
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUPUN0124020000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
          "establishment_id" : "PUPUN0124020000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
            "address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
            "city" : "pune",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
            "pin" : "411013",
            "untagged" : null,
            "street" : null,
            "district" : "pune",
            "complex" : null,
            "house_unsure" : null,
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      }
    ]
  }
}

Ответы [ 2 ]

1 голос
/ 27 января 2020

Если вы не хотите использовать json в качестве строки и извлекать ее из местоположения, используйте приведенный ниже код. Он непосредственно создаст Dataframe для full_address_bucket __

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("Test").getOrCreate()

df = spark.read.format("json").option("inferSchema", "true").load("test14.json").select("hits.*")\
    .select(F.explode("hits").alias('hits')).select('hits.*').select("_source.*").select("full_address_bucket__.*")
df.show()

Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
|             address|           adrs_long|   adrs_long_soundex|building|building_soundex|care_of|    city|complex|complex_soundex|district|floor|               house|house_unsure|is_adrs_good|landmark|locality|   pin|      state|street|            untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...|    null|            null|   null|kolhapur|   null|           null|kolhapur| null|                null|         329|        true|    null|    null|416109|maharashtra|  null|gat no 329, tarda...|
|TAL-LONAVALA,228,...|tal - lonavala, 2...|TL LNFL KRM RFFRM...|    null|            null|   null|lonavala|   null|           null|    pune| null|                null|     228 228|        true|    null|    null|410405|maharashtra|  null|tal - |, 228, gra...|
|PLOT NO.9 S.NO. 1...|plot no . 9 surve...|PLT N SRF N PLT N...|    null|            null|   null|    pune|   null|           null|    pune| null|plot no . 9| surv...|        null|        true|    null|    null|411013|maharashtra|  null|                null|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+

1 голос
/ 22 января 2020
import json
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Test").getOrCreate()

json_data = """{
    "took" : 436,
    "timed_out": false,
    "_shards" : {
        "total" : 3,
        "successful" : 3,
        "skipped" : 0,
        "failed" : 0
    },
    "hits" : {
        "total" : 1194393,
        "max_score" : 1.0,
        "hits" : [
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUKOL1964690000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
                    "establishment_id" : "PUKOL1964690000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
                        "address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
                        "city" : "kolhapur",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : null,
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
                        "pin" : "416109",
                        "untagged" : "gat no 329, tardal, hatkanangle",
                        "street" : null,
                        "district" : "kolhapur",
                        "complex" : null,
                        "house_unsure" : "329",
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            },
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUPUN0002003000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
                    "establishment_id" : "PUPUN0002003000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
                        "address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
                        "city" : "lonavala",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : null,
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
                        "pin" : "410405",
                        "untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
                        "street" : null,
                        "district" : "pune",
                        "complex" : null,
                        "house_unsure" : "228 228",
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            },
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUPUN0124020000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
                    "establishment_id" : "PUPUN0124020000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
                        "address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
                        "city" : "pune",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
                        "pin" : "411013",
                        "untagged" : null,
                        "street" : null,
                        "district" : "pune",
                        "complex" : null,
                        "house_unsure" : null,
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            }
        ]
    }
}"""

json_obj = json.loads(json_data)

json_data1 = json_obj['hits']['hits'][0]['_source']['full_address_bucket__']

json_new = [json.dumps(json_data1)]
df = spark.read.json(spark.sparkContext.parallelize(json_new))
df.show()

Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
|             address|           adrs_long|   adrs_long_soundex|building|building_soundex|care_of|    city|complex|complex_soundex|district|floor|house|house_unsure|is_adrs_good|landmark|locality|   pin|      state|street|            untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...|    null|            null|   null|kolhapur|   null|           null|kolhapur| null| null|         329|        true|    null|    null|416109|maharashtra|  null|gat no 329, tarda...|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+


...