Извлечь эти данные довольно сложно. Я попробовал подключить ещё один пакет, spaCy, чтобы попытаться выделить сущности, относящиеся к организациям / автомобильным компаниям. Результат не идеален, но для начала подойдёт:
Код:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy
# Load the small English spaCy pipeline once; it is reused for every post title.
nlp = spacy.load("en_core_web_sm")

# Craigslist Chicago search endpoint queried by this script.
req_url = 'https://chicago.craigslist.org/search/cta'

# Send a mobile Chrome User-Agent string with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'}

# Step between successive 's' values, i.e. the number of results assumed per
# result page.
RESULTS_PER_PAGE = 120

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely.
REQUEST_TIMEOUT = 30

# First request: 's' starts at 0 (it is later stepped by RESULTS_PER_PAGE, so
# it is presumably the result offset).
payload = {
    's': '0',
    'query': 'automotive',
    'sort': 'rel',
}
response = requests.get(
    req_url, headers=headers, params=payload, timeout=REQUEST_TIMEOUT
)
# Fail with the HTTP status instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# The total number of matching posts is read from <span class="totalcount">.
total_count_tag = soup.find('span', {'class': 'totalcount'})
if total_count_tag is None:
    # Report the real problem rather than "'NoneType' object has no attribute 'text'".
    raise RuntimeError(
        "Could not find the 'totalcount' element in the response from " + req_url
    )
total_posts = int(total_count_tag.text)

# One start offset per result page: 0, 120, 240, ...
pages = list(range(0, total_posts, RESULTS_PER_PAGE))
def _normalize_title(title):
    """Return *title* with '*' replaced by spaces and whitespace runs collapsed."""
    return ' '.join(title.replace('*', ' ').split())


def _extract_make_model(title):
    """Guess the vehicle make and model named in a post title.

    Uses the module-level spaCy pipeline ``nlp``. The model is the first
    entity labelled PRODUCT in the normalized title. The make is the first
    entity labelled ORG found when only the proper-noun (NNP) tokens of the
    title are re-parsed.

    Returns a ``(make, model)`` tuple of strings; a value that cannot be
    found is returned as ``''``.
    """
    doc = nlp(_normalize_title(title))
    models = [ent.text for ent in doc.ents if ent.label_ == 'PRODUCT']

    # Re-run NER on the proper nouns only and look for an organisation there.
    proper_nouns = [token.text for token in doc if token.tag_ == 'NNP']
    makes = [
        ent.text
        for ent in nlp(' '.join(proper_nouns)).ents
        if ent.label_ == 'ORG'
    ]

    return (makes[0] if makes else '', models[0] if models else '')


# Number of result pages processed so far (used for progress output only).
iterations = 0

# Parallel lists: index i of every list describes the same post.
post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []
post_makes = []
post_models = []

for page in pages:
    # Same search as the first request, starting at this page's 's' value.
    payload = {
        's': page,
        'query': 'automotive',
        'sort': 'rel',
    }
    # The timeout keeps a stalled connection from blocking the run forever.
    response = requests.get(req_url, headers=headers, params=payload, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all('li', class_='result-row')

    # Extract the data post by post.
    for post in posts:
        # Only posts that carry a neighborhood span are recorded.
        hood_tag = post.find('span', class_='result-hood')
        if hood_tag is None:
            continue

        # Posting date/time, taken from the 'datetime' attribute.
        post_datetime = post.find('time', class_='result-date')['datetime']

        # Title text and link come from the same anchor.
        post_title = post.find('a', class_='result-title hdrlnk')
        post_title_text = post_title.text
        post_link = post_title['href']

        # Text of the first anchor in the row, recorded as the price.
        post_price = post.a.text.strip()

        # Resolve make and model BEFORE appending anything, so that every post
        # adds exactly one entry to each list and the columns stay aligned
        # even when only one of the two values is found.
        post_make, post_model = _extract_make_model(post_title_text)

        post_timing.append(post_datetime)
        post_hoods.append(hood_tag.text)
        post_title_texts.append(post_title_text)
        post_links.append(post_link)
        post_prices.append(post_price)
        post_makes.append(post_make)
        post_models.append(post_model)

    iterations += 1
    print("Finished iteration: " + str(iterations))
# Combine the parallel per-post lists into one row tuple per post. zip() stops
# at the shortest list, so any surplus entries in longer lists are dropped.
data = list(zip(
    post_timing,
    post_hoods,
    post_title_texts,
    post_links,
    post_prices,
    post_makes,
    post_models,
))

# Column names, in the same order as the lists zipped above.
column_names = ['time', 'hood', 'title', 'link', 'price', 'make', 'model']

# Tabulate the rows: one DataFrame row per post, one column per scraped field.
df = pd.DataFrame(data, columns=column_names)
Вывод:
print (df.head(20).to_string())
time hood title link price make model
0 2019-10-03 07:12 (TEXT 855-976-4304 FOR CUSTOM PAYMENT) 2015 Ford Focus SE Sedan 4D sedan Dk. Gray - F... https://chicago.craigslist.org/chc/ctd/d/chica... $11500 Ford Focus SE
1 2019-10-03 06:03 (EVERYBODY DRIVES IN SOUTH ELGIN) $174/mo [][][] 2013 Hyundai Sonata BAD CREDIT OK https://chicago.craigslist.org/nwc/ctd/d/south... $174 Sonata BAD
2 2019-10-03 00:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $658/mo [][][] 2016 Jeep Grand Cherokee BAD CR... https://chicago.craigslist.org/nwc/ctd/d/south... $658 Hyundai
3 2019-10-02 21:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $203/mo [][][] 2010 Chevrolet Traverse BAD CRE... https://chicago.craigslist.org/nwc/ctd/d/south... $203 Jeep Grand Cherokee BAD Traverse BAD
4 2019-10-02 20:24 (DENVER) 2017 Jeep Cherokee Latitude 4x4 4dr SUV SKU:60... https://chicago.craigslist.org/chc/ctd/d/denve... $8995 Cherokee
5 2019-10-02 20:03 ( Buy Here Pay Here!) Good Credit, Bad Credit, NO Credit = NO Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0 Chevrolet
6 2019-10-02 20:03 ( Buy Here Pay Here!) Aceptamos Matricula!!! Te pagan en efectivo?? ... https://chicago.craigslist.org/wcl/ctd/d/chica... $0 Jeep
7 2019-10-02 20:02 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit = No Problem https://chicago.craigslist.org/chc/ctd/d/vista... $0 Credit Bad Credit
8 2019-10-02 20:00 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit= No Problem https://chicago.craigslist.org/sox/ctd/d/chica... $0
9 2019-10-02 19:15 (* CHRYSLER * TOWN AND COUNTRY * WWW.YOURCHOI... 2013*CHRYSLER*TOWN & COUNTRY*TOURING LEATHER K... https://chicago.craigslist.org/nwc/ctd/d/2013c... $9499
10 2019-10-02 19:09 (*CADILLAC* *DTS* WWW.YOURCHOICEAUTOS.COM) 2008*CADILLAC*DTS*1OWNER LEATHER SUNROOF NAVI ... https://chicago.craigslist.org/sox/ctd/d/2008c... $5999 Credit Bad Credit
11 2019-10-02 18:59 (WAUKEGANAUTOAUCTION.COM OPEN TO PUBLIC OVER ... 2001 *GMC**YUKON* XL DENALI AWD 6.0L V8 1OWNER... https://chicago.craigslist.org/nch/ctd/d/2001-... $1200
12 2019-10-02 18:47 (*GMC *SAVANA *CARGO* WWW.YOURCHOICEAUTOS.COM) 1999 *GMC *SAVANA *CARGO*G2500 SHELVES CABINET... https://chicago.craigslist.org/sox/ctd/d/1999-... $2999 Credit Bad Credit
13 2019-10-02 18:04 ( Buy Here Pay Here!) GoodCredit, Bad Credit, No credit = No Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0
14 2019-10-02 18:05 ( Buy Here Pay Here!) Rebuild your credit today!!! https://chicago.craigslist.org/sox/ctd/d/chica... $0 CHRYSLER
15 2019-10-02 18:03 ( Buy Here Pay Here!) Rebuild your credit today!!! Repo? No Problem!... https://chicago.craigslist.org/chc/ctd/d/vista... $0
16 2019-10-02 17:59 (* ACURA * TL * WWW.YOURCHOICEAUTOS.COM) 2006 *ACURA**TL* LEATHER SUNROOF CD KEYLES ALL... https://chicago.craigslist.org/sox/ctd/d/2006-... $4499
17 2019-10-02 18:00 ( Buy Here Pay Here!) Buy Here Pay Here!!! We Make it Happen!! Bad C... https://chicago.craigslist.org/wcl/ctd/d/chica... $0
18 2019-10-02 17:35 (ST JOHN) 2009 NISSAN VERSA https://chicago.craigslist.org/nwi/ctd/d/saint... $4995
19 2019-10-02 17:33 (DENVER) 2013 Scion tC Base 2dr Coupe 6M SKU:065744 Sci... https://chicago.craigslist.org/chc/ctd/d/denve... $5995 GoodCredit Bad Credit