Веб-скрапинг в CSV — ValueError: не удалось транслировать (broadcast) входной массив из shape (2) в shape (1) - PullRequest
0 голосов
/ 23 сентября 2019

Последние несколько дней я экспериментирую с Python и BeautifulSoup. Я пытаюсь собрать данные (выполнить скрапинг) с местной площадки объявлений о продаже оружия.

Мне уже помогали, и это было здорово. После каждого такого шага я переходил к следующему вопросу: «Так, а как теперь сделать вот это…»

Вот где я сейчас нахожусь:

Я пытаюсь (безуспешно) экспортировать свои данные в CSV.

Я получаю сообщение об ошибке: «ValueError: could not broadcast input array from shape (2) into shape (1)» (не удалось транслировать входной массив из shape (2) в shape (1)).

Судя по тому, что я прочитал, такая ошибка означает, что я пытаюсь поместить двумерный массив в одномерный? Но, глядя на свой код, я не вижу, где я мог бы это делать.

Не мог бы кто-нибудь просмотреть код и подсказать, где я ошибаюсь?

Спасибо!

Полный скрипт:

from bs4 import BeautifulSoup
import requests
import urllib.request
import csv
import pandas
from pandas import DataFrame
import re

#csv creation: open (and truncate) the output file and write the header row
# NOTE(review): as pasted, the three lines below are not indented under the
# `with` statement, so this fragment does not parse; they presumably formed
# its body in the author's real script.
with open('Guntrader_Dealer.csv', mode='w') as csv_file:
fieldnames = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()

#all links list: absolute URLs of the individual listings, filled in below
all_links=[]

#grab all links which contain the href specified
# NOTE(review): `url` is requested with its literal "{}" placeholder -- `page`
# is never substituted into it, so both loop iterations request the same address.
url="https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1,3):
 res=requests.get(url).text
 soup=BeautifulSoup(res,'html.parser')
 # every anchor whose href contains the dealer's guns path is kept, prefixed with the site root
 for link in soup.select('a[href*="dealers/minsterley/minsterley-ranges/guns/"]'):
  all_links.append("https://www.guntrader.uk" + link['href'])

# For every collected listing URL: download the page, pull out the detail
# fields, and write them to the CSV file.
for a_link in all_links:

#Defining the span text in GunDetails lookups
# Each predicate below is passed to soup.find(); it accepts a <span> whose
# parent's first child contains the given label text.
# NOTE(review): these functions do not depend on a_link, yet they are
# redefined on every iteration of the loop.
    def make_span(make):
       return make.name=='span' and 'Make:' in make.parent.contents[0]    
    def model_span(model):
       return model.name=='span' and 'Model:' in model.parent.contents[0]
    def licence_span(licence):
       return licence.name=='span' and 'Licence:' in licence.parent.contents[0]
    def orient_span(orient):
       return orient.name=='span' and 'Orient.:' in orient.parent.contents[0]    
    def barrel_span(barrel):
      return barrel.name=='span' and 'Barrel:' in barrel.parent.contents[0]
    def stock_span(stock):
      return stock.name=='span' and 'Stock:' in stock.parent.contents[0]    
    def choke_span(choke):
      return choke.name=='span' and 'Chokes:' in choke.parent.contents[0]
    def origin_span(origin):
      return origin.name=='span' and 'Origin:' in origin.parent.contents[0]
    def trigger_span(trigger):
      return trigger.name=='span' and 'Trigger:' in trigger.parent.contents[0]
    def ejection_span(ejection):
      return ejection.name=='span' and 'Ejection:' in ejection.parent.contents[0]
    def serial_span(serial):
      return serial.name=='span' and 'Serial #:' in serial.parent.contents[0]
    def stockno_span(stockno):
      return stockno.name=='span' and 'Stock #:' in stockno.parent.contents[0]
    def condition_span(condition):
      return condition.name=='span' and 'Condition:' in condition.parent.contents[0]
    def scope_span(scope):
      return scope.name=='span' and 'Scope:' in scope.parent.contents[0]

    # download and parse the listing page
    res = urllib.request.urlopen(a_link)
    soup = BeautifulSoup(res, 'html.parser')

#soup searches using the define criteria
# Each field falls back to the string 'none' when no matching span is found.

    makes = soup.find(make_span)
    # NOTE(review): `.content` (singular) differs from the `.contents` used for
    # every other field -- looks like a typo; on a bs4 Tag an unknown attribute
    # is presumably treated as a child-tag lookup and yields None. Confirm.
    gun_makes = makes.content if makes else 'none'
    models = soup.find(model_span)
    # NOTE(review): `.contents` is a list of child nodes, not a string. When it
    # holds two nodes it cannot fit the single-row frame built below (index=[0]),
    # which matches the reported "shape (2) into shape (1)" ValueError.
    gun_models = models.contents if models else 'none'
    licences = soup.find(licence_span)
    gun_licences = licences.contents if licences else 'none'
    orients = soup.find(orient_span)
    gun_orients = orients.contents if orients else 'none'    
    barrels = soup.find(barrel_span)
    gun_barrels = barrels.contents if barrels else 'none'
    stocks = soup.find(stock_span)
    gun_stocks = stocks.contents if stocks else 'none'
    chokes = soup.find(choke_span)
    gun_chokes = chokes.contents if chokes else 'none'
    origins = soup.find(origin_span)
    gun_origins = origins.contents if origins else 'none'
    triggers = soup.find(trigger_span)
    gun_triggers = triggers.contents if triggers else 'none'
    ejections = soup.find(ejection_span)
    gun_ejections = ejections.contents if ejections else 'none'
    scopes = soup.find(scope_span)
    gun_scopes = scopes.contents if scopes else 'none'
    serials = soup.find(serial_span)
    gun_serials = serials.contents if serials else 'none'
    stocknos = soup.find(stockno_span)
    gun_stocknos = stocknos.contents if stocknos else 'none'
    conditions = soup.find(condition_span)
    gun_conditions = conditions.contents if conditions else 'none'

#title price and description
# These three are located with CSS selectors and read as plain text.
    title = soup.select_one('h1[itemprop="name"]')
    gun_title = title.text if title else 'none'
    price = soup.select_one('p.price')
    gun_price = price.text if price else 'none'
    description = soup.select_one('p[itemprop="description"]')
    gun_description = description.text if description else 'none'


    # one record per listing, keyed by CSV column name
    data = { 'Title': gun_title, 'Make': gun_makes, 'Model': gun_models, 'Licence': gun_licences, 'Orientation': gun_orients, 'Barrel Length': gun_barrels, 'Stock Length': gun_stocks, 'Chokes': gun_chokes, 'Origin': gun_origins, 'Trigger': gun_triggers, 'Ejection': gun_ejections, 'Scope': gun_scopes, 'Serial No': gun_serials, 'Stock No': gun_stocknos, 'Condition': gun_conditions, 'Description': gun_description, 'Price': gun_price}

    # single-row frame (index=[0]) for this listing
    df = DataFrame(data, columns = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price'], index=[0])

    # NOTE(review): to_csv is called inside the loop on the same path, so each
    # listing overwrites the file written for the previous one.
    df.to_csv(r'Guntrader_Dealer.csv')

1 Ответ

1 голос
/ 23 сентября 2019

Я переписал скрипт для вас. Вместо того чтобы на каждой итерации перезаписывать один и тот же файл очередным df, я создал основной df, в который в цикле for добавляются данные всех остальных df.

Вот окончательный код:

from bs4 import BeautifulSoup
import requests
import csv
import pandas
from pandas import DataFrame
import re
import os
import locale
# NOTE(review): PYTHONIOENCODING is normally read when the interpreter starts,
# so assigning it here presumably only affects child processes -- confirm
# whether this line is still needed.
os.environ["PYTHONIOENCODING"] = "utf-8"


#csv creation
# Create (or truncate) the output file and write the header row up front.
# newline='' is what the csv module asks for on files it writes to (without it
# rows can end in "\r\r\n" on Windows); the explicit encoding keeps the file
# independent of the platform's default locale encoding.
with open('Guntrader_Dealer.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

# Absolute URLs of the individual gun listings, in the order they are found.
all_links = []

#grab all links which contain the href specified
# Listing-page URL template; "{}" is filled with the page number in the loop.
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1, 3):
    # Substitute the page number into the template -- requesting the bare
    # template would fetch the same "?page={}" address on every iteration.
    # The timeout stops a stalled connection from hanging the script forever.
    res = requests.get(url.format(page), timeout=30).text
    soup = BeautifulSoup(res, 'html.parser')
    # Keep every anchor whose href contains the dealer's guns path, turned
    # into an absolute URL by prefixing the site root.
    all_links.extend(
        "https://www.guntrader.uk" + link['href']
        for link in soup.select('a[href*="dealers/minsterley/minsterley-ranges/guns/"]')
    )

# Output columns, in the order they appear in the CSV file.
COLUMNS = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price']

# CSV column -> label text that identifies the matching <span> on a listing page.
SPAN_LABELS = {
    'Make': 'Make:',
    'Model': 'Model:',
    'Licence': 'Licence:',
    'Orientation': 'Orient.:',
    'Barrel Length': 'Barrel:',
    'Stock Length': 'Stock:',
    'Chokes': 'Chokes:',
    'Origin': 'Origin:',
    'Trigger': 'Trigger:',
    'Ejection': 'Ejection:',
    'Scope': 'Scope:',
    'Serial No': 'Serial #:',
    'Stock No': 'Stock #:',
    'Condition': 'Condition:',
}


def _labelled_span(label):
    """Return a soup.find() filter for a <span> labelled with *label*.

    The filter accepts a tag when it is a <span> and the first child of its
    parent contains *label*.
    """
    def matches(tag):
        return tag.name == 'span' and label in tag.parent.contents[0]
    return matches


def _scrape_gun(page_soup):
    """Build one CSV row from a parsed listing page.

    Args:
        page_soup: BeautifulSoup object for a single gun listing.

    Returns:
        dict mapping every name in COLUMNS to a string; fields that are not
        present on the page are reported as 'none'.
    """
    row = {}
    for column, label in SPAN_LABELS.items():
        span = page_soup.find(_labelled_span(label))
        # get_text() always yields ONE string per cell. The raw `.contents`
        # list can hold several nodes, which cannot be placed into a
        # single-row column ("could not broadcast ... shape (2) into shape (1)").
        row[column] = span.get_text(' ', strip=True) if span else 'none'

    title = page_soup.select_one('h1[itemprop="name"]')
    row['Title'] = title.text if title else 'none'
    price = page_soup.select_one('p.price')
    row['Price'] = price.text if price else 'none'
    description = page_soup.select_one('p[itemprop="description"]')
    row['Description'] = description.text if description else 'none'
    return row


# Collect plain dict rows and build the frame once at the end:
# DataFrame.append() was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
rows = []
for a_link in all_links:
    # The timeout stops a stalled connection from hanging the script forever.
    res = requests.get(a_link, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')
    rows.append(_scrape_gun(soup))

# `columns=` fixes the column order and still yields the right (empty) frame
# when no links were found.
df_main = DataFrame(rows, columns=COLUMNS)
df_main.to_csv('Guntrader_Dealer.csv', encoding='UTF-8')
...