Почему я не получаю все значения, экспортированные в мой CSV-файл? - PullRequest
0 голосов
/ 15 апреля 2020

Я получаю только 308 строк в моём CSV-файле, хотя должен получить более 900. Я написал приведённый ниже код. Я пытался изменить перебор ссылок, но результат тот же: каждый раз получаю одинаковый объём данных. Проблема в том, как я объявляю фрейм данных, или в чём-то ещё?

from bs4 import BeautifulSoup
import requests
import pandas as ps

# Column accumulators for the final DataFrame; parse() appends to these
# module-level lists as it walks the supplier cards.
(
    suppliers_name,
    suppliers_location,
    suppliers_type,
    suppliers_content,
    suppliers_est,
    suppliers_income,
) = ([] for _ in range(6))

def parse(url):
    """Scrape one search-results page into the module-level column lists.

    Fetches ``url`` with ``requests``, parses the HTML with BeautifulSoup and,
    for every matched result card, appends values to ``suppliers_name``,
    ``suppliers_location``, ``suppliers_type``, ``suppliers_content``,
    ``suppliers_est`` and ``suppliers_income``. Returns ``None``.
    """
    web = requests.get(url)
    soup = BeautifulSoup(web.content, "html.parser")

    # Cards are selected by this one literal class string.
    # NOTE(review): cards rendered with any other class string are presumably
    # not matched here -- confirm against the BeautifulSoup class_ semantics.
    container = soup.find_all(class_ = "supplier-search-results__card profile-card profile-card profile-card--secondary supplier-tier-1")

    for cont in container:
        # Name: text of the card's first <h2>.

        name = cont.find("h2").text
        suppliers_name.append(name)

        # Location: text of the first <a> inside the supplier-data element,
        # with its first 8 characters dropped.

        location =cont.find(class_ = "profile-card__supplier-data").find("a").text[8:]
        if "  " in location:
            suppliers_location.append(location.replace("  ",""))
        elif "Locations" in location:
            suppliers_location.append(location.replace("Locations", "None"))
        # NOTE: there is no else branch -- when neither test matches, nothing
        # is appended and suppliers_location falls out of step with
        # suppliers_name.

        # Supplier type: text of the second <span> in the supplier-data
        # element, minus its first 2 characters and any "*".

        types = cont.find(class_ = "profile-card__supplier-data").find_all("span")[1].text[2:]
        suppliers_type.append(types.replace("*", ""))

        # Description: text of the first <p> in the card body.

        content = cont.find(class_ = "profile-card__body-text").find("p").text
        suppliers_content.append(content)

        # Establishment year: chosen by how many popover <span>s the card has.
        # NOTE: counts other than 1-4 append nothing to suppliers_est.

        years = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})
        if len(years) == 4:
            year = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})[2].text
            suppliers_est.append(year[5:])

        elif len(years) == 3:
            year = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})[1].text
            word =year[5:]
            # Only a remainder of exactly four characters is kept as a year.
            if len(word) != 4:
                suppliers_est.append("None")
            else:
                suppliers_est.append(word) 

        elif len(years) == 2:
            year = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})[1].text
            suppliers_est.append(year[5:])

        elif len(years)==1:
            suppliers_est.append("None")

        # Income: same popover <span>s as above, re-queried.
        # NOTE: counts other than 1-4 append nothing to suppliers_income.

        incomes =  cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})
        if len(incomes) == 4:
            income = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})[1].text
            suppliers_income.append(income[4:])

        elif len(incomes) == 3:
            income = cont.find(class_ = "profile-card__supplier-data").find_all("span", {"data-toggle":"popover"})[1].text
            word = income[4:]
            # A remainder of exactly five characters is rejected as income.
            if len(word) != 5:
                suppliers_income.append(word)
            else:
                suppliers_income.append("None")
        elif len(incomes) == 2:
            suppliers_income.append("None")

        elif len(incomes) == 1:
            suppliers_income.append("None")

# Walk result pages 1..43. The loop variable serves both as the page number
# in the URL and as the counter in the progress message.
for page in range(1, 44):
    urls = f'https://www.thomasnet.com/nsearch.html?_ga=2.53813992.1582589371.1586649402-45317423.1586649402&cov=NA&heading=97010359&pg={page}'
    parse(urls)
    print(f"\n{page} - done")

# Assemble the collected columns into one table and export it.
# NOTE: suppliers_income is filled by parse() but is not exported here.
covid = ps.DataFrame({
    "Name of the Suppliers": suppliers_name,
    "Location": suppliers_location,
    "Type of the suppliers": suppliers_type,
    "Establishment of the supplies": suppliers_est,
    "Motive": suppliers_content,
})

covid.to_csv("E:/New folder/covid.csv", index=False)
print("File Creation Done")

код работает без ошибок, но я не получаю все данные.

Ответы [ 2 ]

0 голосов
/ 15 апреля 2020
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Fixed query-string parameters passed along with each search request.
params = dict(
    _ga="2.53813992.1582589371.1586649402-45317423.1586649402",
    cov="NA",
    heading="97010359",
)


def main(url):
    """Scrape supplier cards from result pages 1-43 of ``url`` into ``data.csv``.

    For each page the query string is the module-level ``params`` plus the
    page number under the key ``pg``. Every ``<div>`` whose ``id`` starts
    with ``pc`` is treated as one supplier card and contributes one row
    (name, location, type, year, description). The resulting DataFrame is
    printed and then written to ``data.csv`` in the current directory.

    Args:
        url: Base URL of the search page, without a query string.
    """
    # Compile once; both patterns are reused for every page and card.
    card_id = re.compile(r"^pc")
    year_pattern = re.compile(r"g> (\d{4})<")
    goal = []

    with requests.Session() as req:
        for item in range(1, 44):
            print(f"Extracting Page # {item}")

            # Build a per-request copy so the shared module-level dict is
            # never mutated.
            r = req.get(url, params={**params, "pg": item})
            soup = BeautifulSoup(r.content, 'html.parser')

            for tar in soup.find_all("div", id=card_id):
                names = tar.select_one("h2.profile-card__title").text
                location = tar.find(
                    "span", {"data-template": True}).previous_element.strip()
                paths = tar.select("path")
                try:
                    types = paths[5].next_element.strip()
                except (IndexError, AttributeError, TypeError):
                    # Fall back to the third <path> when the sixth is missing
                    # or is not followed by a plain text node.
                    types = paths[2].next_element.strip()
                years = "".join(
                    yr.group(1) for yr in year_pattern.finditer(str(tar)))
                descs = tar.select_one(
                    "div.profile-card__body-text").next_element.text
                goal.append([names, location, types, years, descs])

    df = pd.DataFrame(
        goal, columns=['Name', 'Location', 'Type', 'Year', 'Description'])
    print(df)
    df.to_csv("data.csv", index=False)


# Run the scraper against the ThomasNet search page.
main("https://www.thomasnet.com/nsearch.html")

Вывод: Просмотр онлайн

[1065 rows x 5 columns]

Каждая страница содержит 25 элементов, но последняя страница содержит только 15 элементов.

Итак, 25 * 42 + 15 = 1050 + 15 = 1065 (всего 43 страницы).

enter image description here

0 голосов
/ 15 апреля 2020

Атрибут class меняется через несколько страниц: например, с "supplier-search-results__card profile-card profile-card profile-card--secondary supplier-tier-1" на "supplier-search-results__card profile-card profile-card profile-card--tertiary " (и это только один вариант, который я заметил; похоже, их больше).

В качестве альтернативы: похоже, у всех карточек есть атрибут id, который начинается с pc. Можно попробовать искать по нему (найти все элементы, у которых атрибут id начинается с pc).

В вашей логике также есть ошибки в операторах if: вы учли не все сценарии. Например, в какой-то момент местоположение — "", а вы проверяете только наличие двойного пробела перед «город, штат». Слова "Locations" в нём тоже нет, поэтому значение пропускается, и в результате позиции в списках перестают совпадать. Лучше всего просто убрать пробелы с помощью .strip(): тогда там даже не нужен if и/или replace, поскольку .strip() сам позаботится о пробелах по краям строки.

Наконец, это не так уж важно, но большинство пользователей pandas импортируют его как pd, а не ps.

Так что я думаю, что этот код получает то, что вы хотите, и на самом деле он дает мне 1065 строк:

import re

import pandas as ps
import requests
from bs4 import BeautifulSoup

# Column accumulators for the final DataFrame; parse() appends to these
# module-level lists as it walks the supplier cards.
(
    suppliers_name,
    suppliers_location,
    suppliers_type,
    suppliers_content,
    suppliers_est,
    suppliers_income,
) = ([] for _ in range(6))

def parse(urls):
    """Scrape one search-results page into the module-level column lists.

    Fetches ``urls`` with ``requests`` and parses the HTML with
    BeautifulSoup. Every element whose ``id`` starts with ``pc`` is treated
    as one supplier card. For each card exactly one entry is appended to
    each of ``suppliers_name``, ``suppliers_location``, ``suppliers_type``,
    ``suppliers_content``, ``suppliers_est`` and ``suppliers_income``, so
    the lists always stay the same length.

    Args:
        urls: Full URL of a single results page.

    Raises:
        AttributeError, IndexError: If a card lacks one of the elements the
            scraper expects (heading, supplier-data block, body text).
    """
    web = requests.get(urls)
    soup = BeautifulSoup(web.content, "html.parser")

    for cont in soup.find_all(id=re.compile(r"^pc")):
        # Look these up once; every field below is derived from them.
        data = cont.find(class_="profile-card__supplier-data")
        popovers = data.find_all("span", {"data-toggle": "popover"})

        name = cont.find("h2").text

        # Link text minus its first 8 characters; a "Locations" label is
        # replaced with "None", anything else just loses outer whitespace.
        location = data.find("a").text[8:]
        if "Locations" in location:
            location = location.replace("Locations", "None")
        else:
            location = location.strip()

        types = data.find_all("span")[1].text[2:].replace("*", "")
        content = cont.find(class_="profile-card__body-text").find("p").text

        # Append all columns together so each list grows by one per card.
        suppliers_name.append(name)
        suppliers_location.append(location)
        suppliers_type.append(types)
        suppliers_content.append(content)
        suppliers_est.append(_established_year(popovers))
        suppliers_income.append(_income(popovers))


def _established_year(popovers):
    """Return the establishment-year text from a card's popover spans, or "None"."""
    count = len(popovers)
    if count == 4:
        return popovers[2].text[5:]
    if count == 3:
        # With three popovers the second one is only accepted as a year when
        # exactly four characters remain after the 5-character prefix.
        word = popovers[1].text[5:]
        return word if len(word) == 4 else "None"
    if count == 2:
        return popovers[1].text[5:]
    # One popover, or an unexpected count: no year available.
    return "None"


def _income(popovers):
    """Return the income text from a card's popover spans, or "None"."""
    count = len(popovers)
    if count == 4:
        return popovers[1].text[4:]
    if count == 3:
        # A five-character remainder is rejected as income.
        # NOTE(review): presumably that shape is the year -- confirm.
        word = popovers[1].text[4:]
        return word if len(word) != 5 else "None"
    # Fewer than three popovers, or an unexpected count: no income available.
    return "None"

# Walk result pages 1..43. The loop variable serves both as the page number
# in the URL and as the counter in the progress message.
for page in range(1, 44):
    urls = f'https://www.thomasnet.com/nsearch.html?_ga=2.53813992.1582589371.1586649402-45317423.1586649402&cov=NA&heading=97010359&pg={page}'
    parse(urls)
    print(f"\n{page} - done")

# Assemble the collected columns into one table.
# NOTE: suppliers_income is filled by parse() but is not included here.
covid = ps.DataFrame({
    "Name of the Suppliers": suppliers_name,
    "Location": suppliers_location,
    "Type of the suppliers": suppliers_type,
    "Establishment of the supplies": suppliers_est,
    "Motive": suppliers_content,
})

Выход:

print (covid)
                                  Name of the Suppliers  ...                                             Motive
0                                 All Metal Sales, Inc.  ...  Supplier providing services and products for C...
1                                   Ellsworth Adhesives  ...  We are a large distributor with the ability to...
2                           Monroe Engineering Products  ...  Proud member of the Defense Industrial Base an...
3                       New Process Fibre Company, Inc.  ...  We supply parts to help produce the following ...
4                               Vanguard Products Corp.  ...  Custom manufacturing available for component p...
5                                      Techmetals, Inc.  ...  We are a certified metal plating facility, hol...
6                                       The Rodon Group  ...  We can design, mold and manufacturer plastic i...
7                                           Mardek, LLC  ...  Mardek LLC's core business is sourcing manufac...
8                         Allstates Rubber & Tool Corp.  ...  Materials or component parts are available tha...
9                    Estes Design & Manufacturing, Inc.  ...  We are a sheet metal fabricator still in opera...
10                         Nadco Tapes and Labels, Inc.  ...  We are manufacturer of tapes and labels capabl...
11                              NewAge Industries, Inc.  ...  We are a manufacturer and fabricator of plasti...
12                                       Associated Bag  ...  We are a nationwide supplier of packaging, shi...
13                                              3D Hubs  ...  Surgical masks - we have started a GoFundMe ra...
14                        Tailored Label Products, Inc.  ...  In response to the COVID-19 crisis, we're prov...
15                         Compressed Air Systems, Inc.  ...  We are specialists in compressed air, we can d...
16                               A & S Mold & Die Corp.  ...  Custom manufacturing available for component p...
17                                 Vibromatic Co., Inc.  ...  We manufacture custom part handling systems. I...
18                           Wyandotte Industries, Inc.  ...  Wyandotte Industries is a manufacturer of Spec...
19                                            MOCAP LLC  ...  We manufacture a full line of protective caps,...
20                       Emco Industrial Plastics, Inc.  ...  Custom manufacturer of guards, divider and fac...
21                   Bracalente Manufacturing Co., Inc.  ...  We specialize in complex and high volume turne...
22                             Liberty Industries, Inc.  ...  Engineers, designs and builds cleanrooms, modu...
23                                 Waples Manufacturing  ...  We are using our expertise in CNC precision ma...
24                                   Griff Paper & Film  ...  In response to the COVID-19 crisis, we are cur...
25                              The Hollaender Mfg. Co.  ...  Hollaender is a manufacturer supplying key inf...
26                                    IFM Efector, Inc.  ...  We offer durably tested and highly reliable se...
27                           Precision Associates, Inc.  ...  We are ramping up our production of several es...
28                                            LBU, Inc.  ...  For the production of in demand COVID response...
29                              EMC Precision Machining  ...  Available to custom manufacture component part...
                                                ...  ...                                                ...
1035                   Chembio Diagnostic Systems, Inc.  ...  Manufacturer of rapid PCR test kits for aid in...
1036                                        Innuscience  ...  Manufacturer of biotechnology based, environme...
1037                                Baumgartner Machine  ...  In response to the COVID-19 crisis, we can off...
1038                            Resitech Industries LLC  ...  We can provide face masks and face shields for...
1039                                   Bean, L.L., Inc.  ...  We can supply face masks to assist during the ...
1040                       Trinity Medical Devices Inc.  ...  Supplies and materials for respirators - Certa...
1041                                        Prent Corp.  ...  We can supply face shields that can be used du...
1042                                          GDC, Inc.  ...  Extensive range of machinery available to supp...
1043                      Honeywell International, Inc.  ...  We can supply face masks that can be used duri...
1044                                    Scan Group, The  ...  We are adapting part of our production to manu...
1045                    Advanced Sterilization Products  ...  We can supply face masks that can be used duri...
1046        International Wire Dies div. of DS Hai, LLC  ...  We have access to 3D printers so we can make s...
1047                         Prima Strategic Group Inc.  ...  We are a Houston, TX based organization and wi...
1048                               R+L Global Logistics  ...  We can ship anything, anytime, anywhere. We ar...
1049                          Interiors by Maite Granda  ...  If you need materials or component parts, we h...
1050                                        Pioneer IWS  ...  Prototyping services and mass production offer...
1051                                American Belleville  ...  Custom component part manufacturing available ...
1052                                     PSP Seals, LLC  ...  We have access to supplies of Chinese made KN9...
1053                    Gulf States International, Inc.  ...  We manufacture hand sanitizer for COVID-19 res...
1054                   Prochem Specialty Products, Inc.  ...  We manufacturer biodegradable and environmenta...
1055  Business & Industry Resource Solutions LLC, (B...  ...  During the COVID-19 crisis, we are supplying M...
1056                           Boardman Molded Products  ...  Custom thermoplastic injection molding company...
1057                          Machine Safety Management  ...  We are a manufacturing company with engineers ...
1058                                           JN White  ...  Manufacturing a proprietary all-in-one-piece f...
1059                                        New Balance  ...  We can supply face masks that can be used duri...
1060                                       Rhino Health  ...  One stop nitrile exam glove manufacturer. We h...
1061                               Orchid International  ...  Able to custom manufacture rapid PCR test kits...
1062                            MegaPlast United States  ...  We manufacture mobile hospital shelters, corps...
1063                                       Lion Apparel  ...  We can supply protective clothing that can be ...
1064                                   NuMa Group, Inc.  ...  Manufacturer of supplies and materials capable...

[1065 rows x 5 columns]
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...