Очистка данных Python и формат вывода в данные JSON - PullRequest
0 голосов
/ 13 мая 2018

Мне удалось собрать некоторые данные с динамического веб-сайта, и мои выходные данные представлены в формате json с единственным значением. Как изменить этот код, чтобы получить формат ключа и значения json и записать в файл с помощью python

import requests
import json

URL='http://tfda.go.tz/portal/en/trader_module/trader_module/getRegisteredDrugs_products'
payload = "draw=1&columns%5B0%5D%5Bdata%5D=no&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=True&columns%5B0%5D%5Borderable%5D=True&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B1%5D%5Bdata%5D=certificate_no&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=True&columns%5B1%5D%5Borderable%5D=True&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B2%5D%5Bdata%5D=brand_name&columns%5B2%5D%5Bname%5D=&columns%5B2%5D%5Bsearchable%5D=True&columns%5B2%5D%5Borderable%5D=True&columns%5B2%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B3%5D%5Bdata%5D=classification_name&columns%5B3%5D%5Bname%5D=&columns%5B3%5D%5Bsearchable%5D=True&columns%5B3%5D%5Borderable%5D=True&columns%5B3%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B4%5D%5Bdata%5D=common_name&columns%5B4%5D%5Bname%5D=&columns%5B4%5D%5Bsearchable%5D=True&columns%5B4%5D%5Borderable%5D=True&columns%5B4%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B5%5D%5Bdata%5D=dosage_form&columns%5B5%5D%5Bname%5D=&columns%5B5%5D%5Bsearchable%5D=True&columns%5B5%5D%5Borderable%5D=True&columns%5B5%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B5%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B6%5D%5Bdata%5D=product_strength&columns%5B6%5D%5Bname%5D=&columns%5B6%5D%5Bsearchable%5D=True&columns%5B6%5D%5Borderable%5D=True&columns%5B6%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B6%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B7%5D%5Bdata%5D=registrant&columns%5B7%5D%5Bname%5D=&columns%5B7%5D%5Bsearchable%5D=True&columns%5B7%5D%5Borderable%5D=True&columns%5B7%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B7%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B8%5D%5Bdata%5D=registrant_country&columns%5B8%5D%5Bname%5D=&columns%5B8%5D%5Bsearchable%5D=True&columns%5B8%5D%5Borderable%5D=True&columns%5B8%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B8%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B9%5D%5Bdata%5D=manufacturer&columns%5B9%5D%5Bname%5D=&columns%5B9%5D%5Bsearchable%5D=True&columns%5B9%5D%5Borderable%5D=True&columns%5B9%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B9%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B10%5D%5Bdata%5D=manufacturer_country&columns%5B10%5D%5Bname%5D=&columns%5B10%5D%5Bsearchable%5D=True&columns%5B10%5D%5Borderable%5D=True&columns%5B10%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B10%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B11%5D%5Bdata%5D=expiry_date&columns%5B11%5D%5Bname%5D=&columns%5B11%5D%5Bsearchable%5D=True&columns%5B11%5D%5Borderable%5D=True&columns%5B11%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B11%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B12%5D%5Bdata%5D=id&columns%5B12%5D%5Bname%5D=&columns%5B12%5D%5Bsearchable%5D=True&columns%5B12%5D%5Borderable%5D=True&columns%5B12%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B12%5D%5Bsearch%5D%5Bregex%5D=False&order%5B0%5D%5Bcolumn%5D=0&order%5B0%5D%5Bdir%5D=asc&start=0&length=3911&search%5Bvalue%5D=&search%5Bregex%5D=False"

with requests.Session() as s:
s.headers={"User-Agent":"Mozilla/5.0"}
s.headers.update({'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
res = s.post(URL, data = payload)

for data in res.json()['data']:
    serial = data['no']
    certno = data['certificate_no']
    brndname = data['brand_name']
    clssification = data['classification_name']
    common_name = data['common_name']
    dosage_form = data['dosage_form']
    expiry_date = data['expiry_date']
    manufacturer = data['manufacturer']
    manufacturer_country = data['manufacturer_country']
    product_strength = data['product_strength']
    registrant = data['registrant']
    registrant_country = data['registrant_country']
    output = (dataserial,certno,brndname,clssification,
    common_name,dosage_form,expiry_date,m anufacturer, manufacturer_country,
    product_strength,registrant,registrant_country)

   data = {'brandname':brndname, 'cerficate_number':certno,'expiry_date':expiry_date,'product_strength':product_strength}
    output = json.dumps(data, ensure_ascii=True, sort_keys=True)


    with open('drugs.json', 'w') as file:
        json.dump(output, file)
        file.write('file')
        file.close()

Вот вывод, который мне удалось получить снимок экрана , но пример того, что мне нужно, в следующем формате

{
 "brand_name":"Supirocin"
 "certificate_no":"TAN 00,1820 D01A GLE"
 "classification_name":"Human Medicinal Products"
 "common_name":"Mupirocin"
 "dosage_form":"Ointment"
 "expiry_date":"22-06-2018"
 "id":"18345"
 "manufacturer":"Glenmark Pharmaceuticals Limited"
 "manufacturer_country":"INDIA"
 "no":"6"
 "product_strength":"2 %w/w"
 "registrant":"Glenmark Pharmaceuticals Limited"
 "registrant_country":"INDIA"
 }

Ответы [ 2 ]

0 голосов
/ 13 мая 2018

Попробуйте, чтобы получить точный результат, который вы упомянули в своем посте:

import requests
import json

URL='http://tfda.go.tz/portal/en/trader_module/trader_module/getRegisteredDrugs_products'
payload = "draw=1&columns%5B0%5D%5Bdata%5D=no&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=True&columns%5B0%5D%5Borderable%5D=True&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B1%5D%5Bdata%5D=certificate_no&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=True&columns%5B1%5D%5Borderable%5D=True&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B2%5D%5Bdata%5D=brand_name&columns%5B2%5D%5Bname%5D=&columns%5B2%5D%5Bsearchable%5D=True&columns%5B2%5D%5Borderable%5D=True&columns%5B2%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B3%5D%5Bdata%5D=classification_name&columns%5B3%5D%5Bname%5D=&columns%5B3%5D%5Bsearchable%5D=True&columns%5B3%5D%5Borderable%5D=True&columns%5B3%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B4%5D%5Bdata%5D=common_name&columns%5B4%5D%5Bname%5D=&columns%5B4%5D%5Bsearchable%5D=True&columns%5B4%5D%5Borderable%5D=True&columns%5B4%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B5%5D%5Bdata%5D=dosage_form&columns%5B5%5D%5Bname%5D=&columns%5B5%5D%5Bsearchable%5D=True&columns%5B5%5D%5Borderable%5D=True&columns%5B5%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B5%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B6%5D%5Bdata%5D=product_strength&columns%5B6%5D%5Bname%5D=&columns%5B6%5D%5Bsearchable%5D=True&columns%5B6%5D%5Borderable%5D=True&columns%5B6%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B6%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B7%5D%5Bdata%5D=registrant&columns%5B7%5D%5Bname%5D=&columns%5B7%5D%5Bsearchable%5D=True&columns%5B7%5D%5Borderable%5D=True&columns%5B7%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B7%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B8%5D%5Bdata%5D=registrant_country&columns%5B8%5D%5Bname%5D=&columns%5B8%5D%5Bsearchable%5D=True&columns%5B8%5D%5Borderable%5D=True&columns%5B8%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B8%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B9%5D%5Bdata%5D=manufacturer&columns%5B9%5D%5Bname%5D=&columns%5B9%5D%5Bsearchable%5D=True&columns%5B9%5D%5Borderable%5D=True&columns%5B9%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B9%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B10%5D%5Bdata%5D=manufacturer_country&columns%5B10%5D%5Bname%5D=&columns%5B10%5D%5Bsearchable%5D=True&columns%5B10%5D%5Borderable%5D=True&columns%5B10%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B10%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B11%5D%5Bdata%5D=expiry_date&columns%5B11%5D%5Bname%5D=&columns%5B11%5D%5Bsearchable%5D=True&columns%5B11%5D%5Borderable%5D=True&columns%5B11%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B11%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B12%5D%5Bdata%5D=id&columns%5B12%5D%5Bname%5D=&columns%5B12%5D%5Bsearchable%5D=True&columns%5B12%5D%5Borderable%5D=True&columns%5B12%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B12%5D%5Bsearch%5D%5Bregex%5D=False&order%5B0%5D%5Bcolumn%5D=0&order%5B0%5D%5Bdir%5D=asc&start=0&length=3911&search%5Bvalue%5D=&search%5Bregex%5D=False"

with requests.Session() as s:
    s.headers={"User-Agent":"Mozilla/5.0"}
    s.headers.update({'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
    res = s.post(URL, data = payload)
    itemlist = []
    for data in res.json()['data']:
        item = {}
        item['serial'] = data['no']
        item['certno'] = data['certificate_no']
        item['brndname'] = data['brand_name']
        item['clssification'] = data['classification_name']
        item['common_name'] = data['common_name']
        item['dosage_form'] = data['dosage_form']
        item['expiry_date'] = data['expiry_date']
        item['manufacturer'] = data['manufacturer']
        item['manufacturer_country'] = data['manufacturer_country']
        item['product_strength'] = data['product_strength']
        item['registrant'] = data['registrant']
        item['registrant_country'] = data['registrant_country']
        itemlist.append(item)

    print(itemlist)
0 голосов
/ 13 мая 2018

Сначала вы создаете словарь, а затем конвертируете его в JSON. Например:

name ='ali'
family='shahabi'
output={'name' :name ,'family': family}
json.dumps(output)

print:

'{"name": "ali", "family": "shahabi"}'

Я предлагаю преобразовать:

output = (dataserial,certno,brndname,clssification, common_name,dosage_form,expiry_date,m anufacturer, manufacturer_country, product_strength,registrant,registrant_country)

до

output = {'dataserial':dataserial,'certno':certno,'brndname':brndname , ....}
...