Python / BeautifulSoup: список URL → parse → извлечь данные в CSV. ОШИБКА - PullRequest
0 голосов
/ 03 июля 2019

У меня есть список URL-адресов в CSV-файле (я могу разместить указанный файл на локальном компьютере или в сети). Мне нужно вытащить имя, адрес и номер телефона компании из веб-страниц в списке. У меня есть все правильные имена классов. Я хочу извлечь эти данные в CSV с вышеупомянутыми столбцами.

Из CSV:

https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu

Что насчёт этого:


from bs4 import BeautifulSoup
import requests
import json
import csv
from urllib.request import urlopen
import requests



# Read the list of URLs, scrape each pizzeria's name/address/phone,
# and write one CSV row per successfully scraped page.
#
# NOTE: the input CSV has no header row (it is one URL per line),
# so csv.reader is used instead of csv.DictReader, which would
# silently consume the first URL as a header.
with open('aliveSlice.csv', 'r', newline='') as csvUrls_list, \
        open('scrapedBiz.csv', 'w', newline='') as new_file:
    # BUG FIX: the output file was originally opened with 'w' *inside*
    # the URL loop, truncating it on every iteration; open it once here.
    fieldnames = ['url', 'Raw Data', 'Business Name', 'Address', 'Phone']
    csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames, delimiter=',')
    csv_writer.writeheader()

    csv_reader = csv.reader(csvUrls_list)
    for row in csv_reader:
        if not row:
            # Skip blank lines in the input file.
            continue

        # BUG FIX: the original did `url = (print(row))`, which always
        # binds None (print returns None), so requests.get(url) failed.
        url = row[0]

        # Fetch the page once (the original issued the same GET twice).
        page = requests.get(url)

        # Create a BeautifulSoup object from the response body.
        soup = BeautifulSoup(page.text, 'html.parser')

        # The location <div> that contains all the fields we need.
        pizzaPage = soup.find(class_='f19xeu2d')
        if pizzaPage is None:
            print(f"Missing data - {url}")
            continue

        # BUG FIX: find_all('f13p7rsj') searches for a *tag* named
        # 'f13p7rsj'; these identifiers are CSS classes, so search with
        # class_ instead. The trailing commas on the original lines also
        # wrapped every value in a 1-tuple.
        restaurantName = pizzaPage.find(class_='f13p7rsj')
        address = pizzaPage.find(class_='f1lfckhr')
        phoneNumber = pizzaPage.find(class_='f12gt8lx')

        if restaurantName and address and phoneNumber:
            # BUG FIX: the original built a *set* and passed it to
            # DictWriter.writerow; write a proper dict keyed by the
            # declared fieldnames instead.
            csv_writer.writerow({
                'url': url,
                'Raw Data': pizzaPage.text,
                'Business Name': restaurantName.text,
                'Address': address.text,
                'Phone': phoneNumber.text,
            })
        else:
            # Elements missing on this page; report and move on.
            print(f"Missing data - {url}")





# # TrattoriArray = []
# # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
    # # urls = csv.reader(csvf)
    # # for url in urls:
        # # TrattoriArray.append(url) # Add each url to list contents

# # for url in TrattoriArray:  # Parse through each url in the list.
    # # page = urlopen(url[0]).read()
    # # content = BeautifulSoup(page.content, "html.parser")

# # pizzaArray = []
# # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
    # # pizzeriaObject = {
        # # "pizzeriaName": pizzeria.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
        # # "address": pizzeria.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
        # # "phoneNumber": pizzeria.find('rc-c2d-number', attrs={"span": "rc-c2d-number"}).text.encode('utf-8'),

    # # }
    # # pizzaArray.append(pizzeriaObject)
# # with open('pizzeriaData.json', 'w') as outfile:
    # # json.dump(pizzaArray, outfile)





# # # from bs4 import BeautifulSoup
# # # import requests
# # # import json
# # # import csv
# # # from urllib.request import urlopen
# # # import urllib2
# # # import re

# # # urls = csv.reader(open('aliveSlice.csv'))
# # # for url in urls:
    # # # response = urllib2.urlopen(url[0])
    # # # html = response.read()
    # # # # print re.findall('f19xeu2d',html)
    # # # content = BeautifulSoup(f19xeu2d.content, "html.parser")

# # # # url = 'https://slicelife.com/restaurants/fl/west-palm-bea/33406/albertos-pizzeria/menu'
# # # # response = requests.get(url, timeout=5)


# # # # TrattoriArray = []
# # # # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
    # # # # urls = csv.reader(csvf)
    # # # # for url in urls:
        # # # # TrattoriArray.append(url) # Add each url to list contents

# # # # for url in TrattoriArray:  # Parse through each url in the list.
    # # # # page = urlopen(url[0]).read()
    # # # # content = BeautifulSoup(page.content, "html.parser")

# # # # pizzaArray = []
# # # # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
    # # # # pizzaArray.append(pizzeriaObject)
# # # # with open('pizzeriaData.json', 'w') as outfile:
    # # # # json.dump(pizzaArray, outfile)


# # # htmlContent = response.content
# # # soup = BeautifulSoup(htmlContent, features="html.parser")


# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'), 
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),


# # # #print(soup.prettify())


# # # pizzeriaObject = {
# # # (
# # # pizzeriaName
# # # +phoneNumber
# # # +address
# # # )
# # # }


# # # print(pizzeriaObject)
# # # print(pizzeriaName)
# # # print(phoneNumber)
# # # print(address)

# # # # import requests
# # # # from bs4 import BeautifulSoup
# # # # import csv

# # # # with open('aliveSlice.csv', newline='') as f_urls, open('output.csv', 'w', newline='') as f_output:
    # # # # csv_urls = csv.reader(f_urls)
    # # # # csv_output = csv.writer(f_output)
    # # # # csv_output.writerow(['locationRawData' , 'pizzeriaName' , 'address', 'Phone'])
    # # # # csv_output.writerow(['Ngoname', 'CEO', 'City', 'Address', 'Phone', 'Mobile', 'E-mail'])

    # # # # for line in csv_urls:
        # # # # r = requests.get(line[0]).text
        # # # # soup = BeautifulSoup(r, "html.parser")

# # # # locationRawData = soup.find('h1')
# # # # print('RAW :', locationRawData.text)
# # # f13p7rsj
# # # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # print('pizzeriaName:', pizzeriaName[1].text)

# # # ###########
        # # # ngoname = soup.find('h1')
        # # # print('NGO Name :', ngoname.text)

        # # # pizzeriaName = soup.find('h1', class_='').text
        # # # ceo_name = ceo.split(':')
        # # # print('CeoName:', ceo_name[1])

        # # # city = soup.find_all('span')
        # # # print('City :', city[5].text)

        # # # addressBiz = soup.find_all('address')
        # # # print('Address :', addressBiz[6].text)

        # # # phoneNumber = soup.find_all('button')
        # # # print('Phone :', phoneNumber[7].text)

        # # # mobile = soup.find_all('span')
        # # # print('Mobile :', mobile[8].text)

        # # # email = soup.find_all('span')
        # # # print('Email_id :', email[9].text)

        # # # csv_output.writerow([ngoname.text, ceo_name[1], city[5].text, address[6].text, phone[7].text, mobile[8].text, email[9].text])


# # # locationRawData = soup.find('h1')
# # # print('RAW :', locationRawData.text)

# # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # pizzeria_name = pizzeriaName.split(':')
# # # print('pizzeriaName:', pizzeriaName[0])

# # # address = soup.find('address', class_='f1lfckhr')
# # # print('Address :', address[1].text)

# # # phoneNumber = soup.find('button', class_='f12gt8lx')
# # # print('Phone :', phoneNumber[2].text)

# # # locationRawData = soup.find('div', class_='f19xeu2d')
# # # print('RAW :', locationRawData[3].text)
# # # #############

# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'), 
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),

# # # # address = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('Address :', address[2].text)

# # # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8')
# # # # print('Phone :', phoneNumber[3].text)

# # # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('RAW :', locationRawData[4].text)



# # # # csv_output.writerow([locationRawData.text, pizzeria_name[1], address[2].text, phoneNumber[3].text, locationRawData[4].text])


1 Ответ

0 голосов
/ 03 июля 2019

Ваш код для получения номера телефона вызывал проблему: вы должны проверять, что возвращённые объекты действительны, прежде чем пытаться получить от них текст. Вы можете использовать csv.writer(), если хотите получить вывод в формате CSV:

from bs4 import BeautifulSoup
import requests
import csv


# Walk every URL in the input CSV, scrape the pizzeria details from each
# page, and emit one output row per location that has all three fields.
with open('aliveSlice.csv', newline='') as f_input, open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    reader = csv.reader(f_input)
    writer = csv.writer(f_output)
    writer.writerow(["url", "name", "address", "phone"])

    for record in reader:
        page_url = record[0]
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Each location block on the page carries the f19xeu2d class.
        for location in soup.findAll('div', attrs={"class": "f19xeu2d"}):
            # Gather the three target elements; any of them may be None.
            fields = [
                location.find(tag, attrs={"class": css_class})
                for tag, css_class in (
                    ('h1', 'f13p7rsj'),
                    ('address', 'f1lfckhr'),
                    ('button', 'f12gt8lx'),
                )
            ]

            # Only write the row when every element was actually found.
            if all(fields):
                writer.writerow([page_url] + [element.text for element in fields])
            else:
                print(f"Missing data - {page_url}")

Это даст вам следующий вывод CSV:

url,name,address,phone
https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu,Pizza Man Milwaukee,"2597 N Downer Ave, Milwaukee, WI 53211",414-622-1034
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu,Three Boys From Italy,"238 Livingston St, Northvale, NJ 7647",201-879-0152
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu,Jake's Stadium Pizza,"330 Stadium Rd, Mankato, MN 56001",507-225-7978
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu,Bakers Buck Hut,"1103 Route 68, New Brighton, PA 15066",724-521-4028

Также обратите внимание: если вы используете split(':'), а текст не содержит «:», то вы получите список только с одним элементом, поэтому обращение по индексу [1] не сработает. Вместо этого вы могли бы использовать [-1], чтобы получить последний элемент.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...