I wrote a program that scrapes funding information from Crunchbase for a list of companies, using Python and BeautifulSoup. I originally wrote the code without rotating IP addresses, and it seems my IP got blocked by Crunchbase, because now I can't make even a single request. So I integrated IP rotation into the program, but for some reason Crunchbase still blocks me - can someone point me in the right direction as to what I'm missing? I'm not very familiar with Python; I only started programming a couple of months ago, so I still have a lot to learn. Thank you very much!
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import csv
import time
import random
def scraper(url):
    return_list = []
    ua = UserAgent()  # From here we generate a random user agent
    proxies = []  # Will contain proxies [ip, port]
    proxies_req = Request('https://www.sslproxies.org/')
    proxies_req.add_header('User-Agent', ua.random)
    proxies_doc = urlopen(proxies_req).read().decode('utf8')
    soup = BeautifulSoup(proxies_doc, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')
    # Save proxies in the array
    for row in proxies_table.tbody.find_all('tr'):
        proxies.append({
            'ip': row.find_all('td')[0].string,
            'port': row.find_all('td')[1].string
        })
    # Try proxies until one responds, then keep it for the Crunchbase request
    full_ip = None  # avoids a NameError below if no proxy works at all
    for proxy in proxies:
        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
        try:
            success = urlopen(req).read().decode('utf8')
            full_ip = proxy['ip'] + ':' + proxy['port']
            break
        except:  # If error, skip this proxy and try the next one
            continue
    try:
        reques = Request(url)
        reques.add_header('User-Agent', ua.random)
        reques.set_proxy(full_ip, 'http')
        response = urlopen(reques)
    except:
        return_list.append("No Crunchbase Page Found")  # funding status column
        return_list.append("No Crunchbase Page Found")  # last funding type column
        print("Not found")
        print(full_ip)
    else:
        data = response.read()
        soup = BeautifulSoup(data, "html.parser")  # parse the bytes already read, not the exhausted response object
        try:
            funding_status = soup.find_all("span", class_="component--field-formatter field-type-enum ng-star-inserted")[1].text
            return_list.append(funding_status)
        except:
            return_list.append("N/A")
        try:
            last_funding_type = soup.find("a", class_="cb-link component--field-formatter field-type-enum ng-star-inserted").text
            if not last_funding_type.startswith(("Series", "Venture", "Seed", "Pre", "Angel", "Private", "Debt",
                                                 "Convertible", "Grant", "Corporate", "Equity", "Product",
                                                 "Secondary", "Post", "Non", "Initial", "Funding")):
                return_list.append("N/A")
            else:
                return_list.append(last_funding_type)
        except:
            return_list.append("N/A")
    return return_list
user_input = input("CSV File Name (e.g: myfile.csv): ")
user_input2 = input("New CSV file name (e.g: newfile.csv): ")
print()
scrape_file = open(user_input, "r", newline='', encoding="utf-8")
row_count = sum(1 for row in csv.reader(scrape_file))
scrape_file.seek(0)  # rewind instead of opening the same file a second time
new_file = open(user_input2, "w", newline='', encoding="utf-8")
writer = csv.writer(new_file)
writer.writerow(["Company Name", "Description", "Website", "Founded", "Product Name", "Country", "Funding Status", "Last Funding Type"])
count = 0
for row in csv.reader(scrape_file):
    company_name = row[0]
    if company_name == "Company Name":  # skip the header row
        continue
    count += 1
    print("Scraping company {} of {}".format(count, row_count))
    # Normalize the company name into a Crunchbase-style slug
    company_name = company_name.replace(",", "")
    company_name = company_name.replace("'", "")
    company_name = company_name.replace("-", " ")
    company_name = company_name.replace(".", " ")
    join_name = "-".join(company_name.lower().split())
    company_url = "https://www.crunchbase.com/organization/" + join_name
    # Scrape once per company and reuse the result, instead of hitting Crunchbase twice per row
    scraped = scraper(company_url)
    writer.writerow([row[0], row[1], row[2], row[3], row[4], row[5], scraped[0], scraped[1]])
    time.sleep(random.randint(30, 40))  # wait 30-40 seconds between companies
scrape_file.close()
new_file.close()
print("Done! You can now open your file %s." % user_input2)