Python: скрапинг данных (Data Scraping) возвращает пустые значения из таблицы - PullRequest
0 голосов
/ 10 мая 2018

Я пытаюсь извлечь (спарсить) данные с веб-сайта, но вместо данных, содержащихся в таблице, на печать выводится пустая строка. Данные я пытаюсь получить с сайта http://tfda.go.tz/portal/registered-products/registered-drug-products-1

Вот код, который я использовал для парсинга:

from bs4 import BeautifulSoup
import requests
import sys, io
# Download the product listing page and parse the HTML returned by the server.
# NOTE(review): requests does not run JavaScript; according to the answers
# below, the table rows are loaded by a separate XHR request, so they are
# presumably absent from page.content -- confirm in the browser's network tab.
page = requests.get("http://tfda.go.tz/portal/registered-products/registered-drug-products-1")
soup = BeautifulSoup(page.content, 'html.parser')
# Re-wrap stdout so that printed text is encoded as cp437 and characters that
# cannot be encoded are written as backslash escapes instead of raising.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,'cp437','backslashreplace')

#print(soup.prettify())

# Direct children of the parsed document; not used further in this snippet.
simple = list(soup.children)
#print(simple)

# Placeholders for the table columns; none of them is filled in by this
# snippet. The commented-out names are not valid Python identifiers.
#S/n = ""
Certificate = ""
Brandname = ""
#Classfication Name = ""
#Dosage Form = ""
#Product Strength = ""
Registrant = ""
#Registrant Country = ""
Manufacturer = ""
#Manufacturer Country = ""
#Expiry Date = ""

# Look up the <table> element by its class attribute and print what was found.
table = soup.find("table", { "class" :"table table-stripped table_productDrugs" })
print (table)

А вот скриншот моего вывода: пример вывода

Я ценю вашу помощь

Ответы [ 3 ]

0 голосов
/ 10 мая 2018

Сайт является динамическим, поэтому вам необходимо использовать инструмент управления браузером, такой как selenium:

from selenium import webdriver
from collections import namedtuple
from bs4 import BeautifulSoup as soup
# Record type for one table row; the fields are listed in the order in which
# the cell texts are unpacked into it below.
results = namedtuple(
    'results',
    [
        u'sn', u'certificate_no', u'brand_name', u'classification_name',
        u'common_name', u'dosage_form', u'product_strength', u'registrant',
        u'registrant_country', u'manufacturer', u'manufacturer_country',
        u'expiry_date',
    ],
)

# Open the page in Chrome through selenium (replace the chromedriver path).
d = webdriver.Chrome('/path/to/chromedriver')
d.get('http://tfda.go.tz/portal/registered-products/registered-drug-products-1')

# Text of every <td> cell found in the browser's current page source.
rendered_page = soup(d.page_source, 'lxml')
table_results = [cell.text for cell in rendered_page.find_all('td')]

# The first 12 cells are skipped (presumably they are not product data --
# confirm against the page); every following run of 12 cells is one record.
new_results = table_results[12:]
final_results = []
for offset in range(0, len(new_results), 12):
    final_results.append(results(*new_results[offset:offset + 12]))

Выход:

[results(sn=u'1', certificate_no=u'TAN 09,176 N02B MEP', brand_name=u'Trabilin', classification_name=u'Human Medicinal Products', common_name=u'Tramadol', dosage_form=u'Capsules', product_strength=u'50mg', registrant=u'Acino Pharma AG', registrant_country=u'SWITZERLAND', manufacturer=u'Merckle GmbH', manufacturer_country=u'GERMANY', expiry_date=u'13-10-2019'), results(sn=u'2', certificate_no=u'TZ13H188', brand_name=u'Melorem 7.5', classification_name=u'Human Medicinal Products', common_name=u'Meloxicam', dosage_form=u'Tablets', product_strength=u'7.5mg', registrant=u'Remedica Limited', registrant_country=u'CYPRUS', manufacturer=u'Remedica Limited', manufacturer_country=u'CYPRUS', expiry_date=u'05-01-2019'), results(sn=u'3', certificate_no=u'TAN 00,2246 P01X FAR', brand_name=u'Diminazen', classification_name=u'Veterinary Pharmaceutical', common_name=u'Diminazene + Phenazone', dosage_form=u'Powder for solution for injection ', product_strength=u'555 mg/g + 445 mg/g', registrant=u'Farvet Laboratories B.V', registrant_country=u'THE NETHERLANDS', manufacturer=u'Farvet Laboratories B.V', manufacturer_country=u'THE NETHERLANDS', expiry_date=u'27-09-2020'), results(sn=u'4', certificate_no=u'TAN 00,2075 J01D FAR', brand_name=u'Penstrep 20/25', classification_name=u'Veterinary Pharmaceutical', common_name=u'Procaine BenzylPenicillin + Dihydrostreptomycin', dosage_form=u'Suspension for injection ', product_strength=u'250 mg/ml + 200 ml', registrant=u'Farvet Laboratories B.V', registrant_country=u'THE NETHERLANDS', manufacturer=u'Farvet Laboratories B.V', manufacturer_country=u'THE NETHERLANDS', expiry_date=u'27-09-2020'), results(sn=u'5', certificate_no=u'TAN 00,2076 J01A FAR', brand_name=u'Tridox 20', classification_name=u'Veterinary Pharmaceutical', common_name=u'Oxytetracycline', dosage_form=u'Injection', product_strength=u'216 mg/ml', registrant=u'Farvet Laboratories B.V', registrant_country=u'THE NETHERLANDS', manufacturer=u'Farvet Laboratories B.V', 
manufacturer_country=u'THE NETHERLANDS', expiry_date=u'27-09-2020'), results(sn=u'6', certificate_no=u'TAN 00,1820 D01A GLE', brand_name=u'Supirocin', classification_name=u'Human Medicinal Products', common_name=u'Mupirocin', dosage_form=u'Ointment', product_strength=u'2 %w/w', registrant=u'Glenmark Pharmaceuticals Limited', registrant_country=u'INDIA', manufacturer=u'Glenmark Pharmaceuticals Limited', manufacturer_country=u'INDIA', expiry_date=u'22-06-2018'), results(sn=u'7', certificate_no=u'TAN 00,4282 J03B GLA', brand_name=u'Septrin', classification_name=u'Human Medicinal Products', common_name=u'Sulfamethoxazole+Trimethoprim', dosage_form=u'Tablets', product_strength=u'400 mg + 80 mg', registrant=u'Pharmacare Limited', registrant_country=u'SOUTH AFRICA', manufacturer=u'Eva Cosmetics', manufacturer_country=u'EGYPT', expiry_date=u'12-01-2019'), results(sn=u'8', certificate_no=u'TAN 00,4470 J02A HOE', brand_name=u'Candazole', classification_name=u'Human Medicinal Products', common_name=u'Clotrimazole', dosage_form=u'Cream', product_strength=u'1 % w/w', registrant=u'Hoe Pharmaceuticals SDN BHD', registrant_country=u'MALAYSIA', manufacturer=u'Hoe Pharmaceuticals Sdn Bhd', manufacturer_country=u'MALAYSIA', expiry_date=u'01-06-2019'), results(sn=u'9', certificate_no=u'TZ12H146', brand_name=u'Mazit', classification_name=u'Human Medicinal Products', common_name=u'AZITHROMYCIN', dosage_form=u'Capsules', product_strength=u' 250mg', registrant=u'Neopharma', registrant_country=u'UNITED ARAB EMIRATES', manufacturer=u'Neopharma', manufacturer_country=u'UNITED ARAB EMIRATES', expiry_date=u'22-04-2022'), results(sn=u'10', certificate_no=u'TAN 05,068 N02A CLA', brand_name=u'PROVIVE', classification_name=u'Human Medicinal Products', common_name=u'Propofol', dosage_form=u'Injectable Solution', product_strength=u'', registrant=u'ELDA International DMCC', registrant_country=u'U.A.E', manufacturer=u'', manufacturer_country=u'', expiry_date=u'31-12-2019'), results(sn=u'11', 
certificate_no=u'TAN 05,691 A03B SHA', brand_name=u'Spasmo', classification_name=u'Human Medicinal Products', common_name=u'Homatropine methyl Bromide', dosage_form=u'Syrup ', product_strength=u'2 mg/ml', registrant=u'Shaphaco Pharmaceuticals Industries', registrant_country=u'Republic of Yemen', manufacturer=u'Shaphaco Pharmaceuticals Industries', manufacturer_country=u'Republic of Yemen', expiry_date=u'02-10-2020'), results(sn=u'12', certificate_no=u'TAN 05,153 N04B CLA', brand_name=u'Sedoz', classification_name=u'Human Medicinal Products', common_name=u'Midazolam', dosage_form=u'Solution for injection ', product_strength=u'1 mg/ml', registrant=u'ELDA International DMCC', registrant_country=u'U.A.E', manufacturer=u'Claris Injectables Limited', manufacturer_country=u'INDIA', expiry_date=u'05-06-2020'), results(sn=u'13', certificate_no=u'TZ 15 V 0012', brand_name=u'Hepaturyl', classification_name=u'Veterinary Pharmaceutical', common_name=u'Magnesium + Sodium', dosage_form=u'Powder', product_strength=u'44.5%+25%+25%+5.0%', registrant=u'LAPROVET ', registrant_country=u'FRANCE', manufacturer=u'SOGEVAL', manufacturer_country=u'FRANCE', expiry_date=u'18-09-2019'), results(sn=u'14', certificate_no=u'TZ12H015', brand_name=u'Brunes-200', classification_name=u'Human Medicinal Products', common_name=u'Ibuprofen', dosage_form=u'Tablet, Film-coated ', product_strength=u'200 mg', registrant=u'Nestor Pharmaceuticals Limited', registrant_country=u'INDIA', manufacturer=u'NESTOR PHARMACEUTICALS LIMITED', manufacturer_country=u'INDIA', expiry_date=u'13-12-2021'), results(sn=u'15', certificate_no=u'TAN 06,190 A10B MIC', brand_name=u'Diapride', classification_name=u'Human Medicinal Products', common_name=u'Glimepiride', dosage_form=u'Tablets', product_strength=u'2mg', registrant=u'Micro Labs Limited', registrant_country=u'INDIA', manufacturer=u'Micro Labs Limited', manufacturer_country=u'INDIA', expiry_date=u'21-03-2021'), results(sn=u'16', certificate_no=u'TZ14H010', 
brand_name=u'Biodroxil 500mg Capsules', classification_name=u'Human Medicinal Products', common_name=u'Cefadroxil', dosage_form=u'Capsules', product_strength=u'500mg', registrant=u'Sandoz GmbH', registrant_country=u'AUSTRALIA', manufacturer=u'Sandoz GmbH', manufacturer_country=u'AUSTRIA', expiry_date=u'06-01-2019'), results(sn=u'17', certificate_no=u'TZ 17 H 0235', brand_name=u'Repoitin 2000', classification_name=u'Human Medicinal Products', common_name=u'Erythropoietin', dosage_form=u'Solution for injection ', product_strength=u'2000 IU/0.5ml ', registrant=u'Serum Institute of India Private Limited', registrant_country=u'INDIA', manufacturer=u'Serum Institute of India Private Limited', manufacturer_country=u'INDIA', expiry_date=u'10-09-2022'), results(sn=u'18', certificate_no=u'TAN 05,486 A10A ELI', brand_name=u'Humulin 70/30', classification_name=u'Human Medicinal Products', common_name=u'Insulin', dosage_form=u'Suspension for injection ', product_strength=u'100 IU/ml', registrant=u'Eli Lilly Export SA', registrant_country=u'SWITZERLAND', manufacturer=u'Lilly France S.A.S', manufacturer_country=u'FRANCE', expiry_date=u'02-08-2020'), results(sn=u'19', certificate_no=u'TAN 00,4452 C03C COS', brand_name=u'Frusemide', classification_name=u'Human Medicinal Products', common_name=u'Frusemide', dosage_form=u'Tablets', product_strength=u'40 mg', registrant=u'Cosmos Limited', registrant_country=u'KENYA', manufacturer=u'Cosmos Limited', manufacturer_country=u'KENYA', expiry_date=u'25-06-2024'), results(sn=u'20', certificate_no=u'TZ13H170', brand_name=u'NovoMix 30 FlexPen', classification_name=u'Human Medicinal Products', common_name=u'Insulin aspart', dosage_form=u'Solution for injection ', product_strength=u'100 U/ml', registrant=u'Novo Nordisk Pharma Gulf FZ - LLC', registrant_country=u'UNITED ARAB EMIRATES', manufacturer=u'Novo Nordisk', manufacturer_country=u'DENMARK', expiry_date=u'06-06-2018')]

Теперь к каждой строке можно получить доступ по атрибуту:

# Gather the brand_name field of every parsed row, then print the whole list.
brand_names = [row.brand_name for row in final_results]
print(brand_names)

Выход:

[u'Trabilin', u'Melorem 7.5', u'Diminazen', u'Penstrep 20/25', u'Tridox 20', u'Supirocin', u'Septrin', u'Candazole', u'Mazit', u'PROVIVE', u'Spasmo', u'Sedoz', u'Hepaturyl', u'Brunes-200', u'Diapride', u'Biodroxil 500mg Capsules', u'Repoitin 2000', u'Humulin 70/30', u'Frusemide', u'NovoMix 30 FlexPen']

Чтобы собрать данные со всех страниц таблицы, можно получить selenium-объекты ссылок пагинации (теги a), а затем пройти по каждой из них, кликнуть и извлечь данные:

from collections import deque

# One list of `results` rows per pagination button that was clicked and parsed.
pages = []
# Work queue of pagination buttons, seeded with those on the current page.
full_links = deque(d.find_elements_by_class_name('paginate_button'))
# NOTE(review): every successful click re-queues all buttons found afterwards,
# so the queue only shrinks through failed clicks and `pages` may contain
# duplicates -- consider tracking visited pages or an explicit stop condition.
while full_links:
    link = full_links.popleft()
    try:
        link.click()
        # Same parsing as for the first page: drop the first 12 <td> cells,
        # then group the remaining cell texts into 12-field records.
        table_results = [i.text for i in soup(d.page_source, 'lxml').find_all('td')]
        new_results = table_results[12:]
        final_results = [results(*new_results[i:i+12]) for i in range(0, len(new_results), 12)]
        pages.append(final_results)
        full_links.extend(d.find_elements_by_class_name('paginate_button'))
    except Exception:
        # Best effort: skip a button that cannot be clicked or a page that
        # cannot be parsed. Catching Exception instead of using a bare
        # `except:` keeps Ctrl-C (KeyboardInterrupt) and SystemExit able to
        # stop the loop.
        continue
0 голосов
/ 10 мая 2018

Как вы уже могли заметить, данные доступны в формате JSON — это видно и на изображении, предоставленном @Wu Wenter. Чтобы получить их, нужно отправить POST-запрос на этот URL, передав необходимые параметры в data. При обычном подходе можно получить только 20 записей. Однако я немного подправил запрос, чтобы получить оттуда все записи — их 3 911. Приведённый ниже скрипт извлекает данные из первых пяти столбцов.

Поскольку он дает вам все 3,911 записей из этой таблицы, загрузка может занять некоторое время.

import requests

URL = 'http://tfda.go.tz/portal/en/trader_module/trader_module/getRegisteredDrugs_products'

# "data" name of every column requested from the endpoint, in request order.
COLUMNS = (
    'no', 'certificate_no', 'brand_name', 'classification_name',
    'common_name', 'dosage_form', 'product_strength', 'registrant',
    'registrant_country', 'manufacturer', 'manufacturer_country',
    'expiry_date', 'id',
)


def build_payload(length=3911, start=0, columns=COLUMNS):
    """Return the URL-encoded form body for the product listing request.

    The parameter layout (draw / columns[i][...] / order / start / length /
    search) looks like the DataTables server-side processing convention.

    Args:
        length: value sent as ``length``; the default 3911 is the record
            count quoted in the accompanying answer.
        start: value sent as ``start``.
        columns: column "data" names, one ``columns[i]`` group per entry.

    Returns:
        The form body as a ``str``, e.g. ``draw=1&columns%5B0%5D%5Bdata%5D=no&...``.
    """
    # Local import so the snippet stays self-contained when copied alone.
    from urllib.parse import urlencode

    fields = [('draw', 1)]
    for index, name in enumerate(columns):
        prefix = 'columns[{}]'.format(index)
        fields += [
            (prefix + '[data]', name),
            (prefix + '[name]', ''),
            (prefix + '[searchable]', 'True'),
            (prefix + '[orderable]', 'True'),
            (prefix + '[search][value]', ''),
            (prefix + '[search][regex]', 'False'),
        ]
    fields += [
        ('order[0][column]', 0),
        ('order[0][dir]', 'asc'),
        ('start', start),
        ('length', length),
        ('search[value]', ''),
        ('search[regex]', 'False'),
    ]
    # A list of pairs (not a dict) keeps the parameter order fixed.
    return urlencode(fields)


payload = build_payload()


def main():
    """POST the listing request and print the first five columns of each record."""
    with requests.Session() as s:
        # Assigning (rather than updating) replaces the session's default
        # headers, so only these two are configured on the session.
        s.headers = {
            "User-Agent": "Mozilla/5.0",
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        res = s.post(URL, data=payload)
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        # when the server answers with an error page.
        res.raise_for_status()

        for data in res.json()['data']:
            serial = data['no']
            certno = data['certificate_no']
            brndname = data['brand_name']
            clssification = data['classification_name']
            common_name = data['common_name']
            print(serial, certno, brndname, clssification, common_name)


if __name__ == "__main__":
    main()

Выход:

1 TAN 09,176 N02B MEP Trabilin Human Medicinal Products Tramadol
2 TZ13H188 Melorem 7.5 Human Medicinal Products Meloxicam
3 TAN 00,2246 P01X FAR Diminazen Veterinary Pharmaceutical Diminazene + Phenazone
4 TAN 00,2075 J01D FAR Penstrep 20/25 Veterinary Pharmaceutical Procaine BenzylPenicillin + Dihydrostreptomycin
5 TAN 00,2076 J01A FAR Tridox 20 Veterinary Pharmaceutical Oxytetracycline
6 TAN 00,1820 D01A GLE Supirocin Human Medicinal Products Mupirocin

и т. д.

0 голосов
/ 10 мая 2018

Данные в таблицу загружаются динамически, отдельным HTTP-запросом (XHR). Откройте в Chrome инструменты разработчика (Inspect element) → вкладка Network → фильтр XHR: там вы найдёте этот запрос.

xhr get

...