This script will go through the result pages and collect the information for each job listing. If a result is missing a field, such as the salary, it stores '-' for that value (you can change it to None if you prefer):
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.indeed.com/jobs?q=Data+Scientist&start={}'

data = []

# Indeed paginates via the `start` parameter in steps of 10, so this walks the first 10 result pages:
for p in range(0, 100, 10):
    print('Scraping results {}...'.format(p))
    soup = BeautifulSoup(requests.get(base_url.format(p)).content, 'html.parser')

    for result in soup.select('.result'):
        title = result.select_one('.title').get_text(strip=True)
        job_url = result.select_one('.title a')['href']
        # optional fields fall back to '-' when the element is missing:
        company = result.select_one('.company').get_text(strip=True) if result.select_one('.company') else '-'
        rating = result.select_one('.ratingsDisplay').get_text(strip=True) if result.select_one('.ratingsDisplay') else '-'
        location = result.select_one('.location').get_text(strip=True) if result.select_one('.location') else '-'
        salary = result.select_one('.salary').get_text(strip=True) if result.select_one('.salary') else '-'
        data.append((title, company, rating, location, salary, job_url))

# just print the data for now:
print('{:<65} {:<50} {:<10} {:<65} {:<10}'.format(*'Title Company Rating Location Salary'.split()))
for row in data:
    print('{:<65} {:<50} {:<10} {:<65} {:<10}'.format(*row[:-1]))
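If you want the rows in a file rather than just on screen, a minimal sketch using the standard csv module might look like the following; the filename indeed_jobs.csv and the header names are my own choices here, not part of the original script:

import csv

# Hypothetical follow-up: write the collected tuples to a CSV file
# instead of printing them. Filename and header row are illustrative.
with open('indeed_jobs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Company', 'Rating', 'Location', 'Salary', 'URL'])
    writer.writerows(data)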
Prints:
Scraping results 0...
Scraping results 10...
Scraping results 20...
...
Title                                          Company             Rating   Location                           Salary
Data Scientist – Pricing Optimization          Delta               4.2      Atlanta, GA                        -
Data Scientist - Entry Level                   Numerdox            -        Sacramento, CA                     -
Data Scientist                                 RTI International   3.7      Durham, NC 27709                   -
Entry Level Data Scientist                     IBM                 3.9      United States                      -
Data Scientist - Economic Data                 Zillow              3.8      Seattle, WA 98101(Downtown area)   -
Data Scientist                                 FCA                 4.0      Detroit, MI 48201                  -
Data Scientist, Analytics (University Grad)    Facebook            4.2      New York, NY 10017                 -
Data Scientist                                 Oath Inc            3.8      Champaign, IL                      -
...and so on.
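If you would rather store None for missing fields, or just avoid repeating the select_one(...) if/else pattern, a small helper along these lines could replace the inline conditionals; get_field is my own name for it, not something from the original answer:

def get_field(result, selector, default=None):
    # Hypothetical helper: return the stripped text of the first match,
    # or `default` (None by default) when the selector finds nothing.
    tag = result.select_one(selector)
    return tag.get_text(strip=True) if tag else default

# usage inside the inner loop, e.g.:
# company = get_field(result, '.company')
# salary = get_field(result, '.salary', default='-')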