Below is the complete script for scraping the web pages and saving the results into an SQL database.
The problem is that I cannot get the page requests into a loop: the data ends up being saved for only one page, so the script effectively works on a single page. How can I loop over the pages and save all of them together? (A rough sketch of the flow I am aiming for is at the end of the post.)
# -*- coding: utf-8 -*-
import requests, re, time, csv, pandas, sqlite3, Rotate_User_Agents
from random import choice
from bs4 import BeautifulSoup
# 1st things 1st : Extract all links you will scrape.
class SpiderFirstHand:
    def __init__(self):
        self.HideMyAss()
        self.GetUserAgent()
        self.Links = []
        self.Names = []
        self.Address = []
        self.Phone = []
        self.Category = []
        self.Directions = []
    def HideMyAss(self):
        # Pick a random https proxy (ip:port) from the free-proxy-list.net table:
        # every 8th <td> is the IP column and the one after it is the port column.
        response = requests.get("https://free-proxy-list.net/")
        soup = BeautifulSoup(response.content, 'html.parser')
        self.IP = \
            {'https': choice(list(map(lambda x: x[0] + ':' + x[1], list(
                zip(map(lambda x: x.text, soup.findAll('td')[::8]),
                    map(lambda x: x.text, soup.findAll('td')[1::8]))))))}
        return self.IP
    def GetUserAgent(self):
        self.headers = {
            "Content-type": "application/x-www-form-urlencoded",
            # RandomUA() already returns a plain string, so no extra quotes around it.
            "User-Agent": Rotate_User_Agents.RandomUA()
        }
        return self.headers
    def Source(self):
        STime = time.time()
        R_Number = 1
        PageStart = 1
        PageEnd = 3
        while PageStart < PageEnd:
            url = "https://www.yellowpages.com.eg/ar/condensed-category/%D8%B5%D8%AD%D9%81-%D9%88%D9%85%D8%AC%D9%84%D8%A7%D8%AA/p" + str(PageStart)
            # url = "https://www.yellowpages.com.eg/ar/category/%D8%A7%D9%84%D9%82%D8%A7%D9%87%D8%B1%D8%A9-%D8%B9%D9%82%D8%A7%D8%B1%D8%A7%D8%AA/213/p" + str(PageStart) + "/sortByrates"
            # Passing proxies=self.IP here would rotate the IP on every request,
            # but each request then takes much longer to load.
            # proxies = {'https': '182.53.197.87:61603'}
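            # (Illustration only, not part of the original call) with the proxy
            # enabled the request would look roughly like:
            #     PAGE = requests.get(url=url, headers=self.headers,
            #                         proxies=self.IP, timeout=30)
            # the timeout is my addition; requests has no default timeout.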
            PAGE = requests.get(url=url, headers=self.headers)
            CODE = PAGE.status_code
            SourceCode = PAGE.content
            R_Number += 1
            PageStart += 1
            # print(PageStart)
            # print("Finish Request", CODE)
        FTime = time.time()
        RTime = FTime - STime
        # NOTE: everything below runs after the while loop, so SourceCode only
        # holds the last page that was requested; this is what I want to change.
        bsObj = BeautifulSoup(SourceCode, "html.parser")
        ParentAllLinks = bsObj.find_all("a", attrs={"class": "companyTitle"})
        for link in ParentAllLinks:
            # print(len(ParentAllLinks))
            if len(link['href']) == 0:
                PureLink = "N-A"
                self.Links.append(PureLink)
            else:
                PureLink = "https:" + link['href']
                self.Links.append(PureLink)
        print("Request Ref ", R_Number - 1)
        print("Total number of company links collected:", len(self.Links), "link(s)")
        Companies = bsObj.find_all("div", attrs={"class": "searchResultsDiv"})
        for Details in Companies:
            try:
                C_Name = Details.find("p", attrs={"class": "companyTitle"}).find("strong").get_text()
                C_Address = Details.find("a", attrs={"class": "col-md-9 company_address"}).get_text()
                C_Category = Details.find("span", attrs={"class": "category"}).get_text()
                C_Phone = Details.find_all("a", attrs={"class": "search-call-mob"})
                C_Direction = Details.find_all("a", href=re.compile("^(https://maps.google)"))
                if not C_Direction:
                    self.Directions.append("N/A")
                else:
                    for Dir in C_Direction:
                        self.Directions.append(Dir['href'])
                if not C_Phone:
                    self.Phone.append("N/A")
                else:
                    for Phone in C_Phone:
                        self.Phone.append(Phone['href'])
                if not C_Name:
                    self.Names.append(" ")
                else:
                    self.Names.append(C_Name)
                if not C_Address:
                    self.Address.append(" ")
                else:
                    self.Address.append(C_Address)
                if not C_Category:
                    self.Category.append(" ")
                else:
                    self.Category.append(C_Category)
            except AttributeError as AE:
                # Print the actual exception instead of the literal string "AE".
                print("AttributeError:", AE)
        ETime = time.time()
        DTime = ETime - STime
        print("The length of Links is      :", len(self.Links))
        print("The length of Names is      :", len(self.Names))
        print("The length of Address is    :", len(self.Address))
        print("The length of Categories is :", len(self.Category))
        print("The length of Directions is :", len(self.Directions))
        print("The length of Phone is      :", len(self.Phone))
        ZIPPED = list(zip(self.Links, self.Names, self.Address, self.Phone, self.Category, self.Directions))
        connection = sqlite3.connect("SQLTestDB.db")
        cursor = connection.cursor()
        cursor.execute("""create table if not exists YellowCompanies(CompanyLink text, CompanyName text, CompanyAddress text, CompanyPhone text, CompanyCategory text, CompanyDirection text)""")
        connection.commit()
        for ZData in ZIPPED:
            # ZData is one (link, name, address, phone, category, direction) tuple.
            cursor.execute("INSERT INTO YellowCompanies (CompanyLink, CompanyName, CompanyAddress, CompanyPhone, CompanyCategory, CompanyDirection) VALUES (?,?,?,?,?,?)", ZData)
        connection.commit()
print ("\n##########################################################\n" \
"\n######### This mission is done through ##########\n" \
"\n######### {} seconds of time ##########\n" \
"\n############################################# Heisenberg #\n" \
.format(DTime))
# else:
# print"Connection Error 502: The server couldn`t build a successful connection."
x = SpiderFirstHand()
x.Source()
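
For clarity, this is a rough sketch of the flow I am trying to get to: request and parse each page inside the same loop, collect one tuple per company, then insert everything into SQLite in one go. It reuses the URL and selectors from the script above and assumes they are the same on every results page; the name is taken with get_text() directly instead of through the inner <strong> tag, and the timeout and page range are just examples. I have not verified it end to end:

# Rough sketch only: same selectors as above, details unverified.
import re
import sqlite3

import requests
from bs4 import BeautifulSoup

BASE = ("https://www.yellowpages.com.eg/ar/condensed-category/"
        "%D8%B5%D8%AD%D9%81-%D9%88%D9%85%D8%AC%D9%84%D8%A7%D8%AA/p")

rows = []
for page in range(1, 4):  # pages 1..3; adjust the range as needed
    response = requests.get(BASE + str(page), timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    # One searchResultsDiv per company on the results page.
    for card in soup.find_all("div", attrs={"class": "searchResultsDiv"}):
        link = card.find("a", attrs={"class": "companyTitle"})
        name = card.find("p", attrs={"class": "companyTitle"})
        address = card.find("a", attrs={"class": "col-md-9 company_address"})
        category = card.find("span", attrs={"class": "category"})
        phone = card.find("a", attrs={"class": "search-call-mob"})
        direction = card.find("a", href=re.compile(r"^https://maps\.google"))
        rows.append((
            "https:" + link["href"] if link else "N-A",
            name.get_text(strip=True) if name else " ",
            address.get_text(strip=True) if address else " ",
            phone["href"] if phone else "N/A",
            category.get_text(strip=True) if category else " ",
            direction["href"] if direction else "N/A",
        ))

# One connection and a single executemany for all pages together.
connection = sqlite3.connect("SQLTestDB.db")
cursor = connection.cursor()
cursor.execute("""create table if not exists YellowCompanies(
    CompanyLink text, CompanyName text, CompanyAddress text,
    CompanyPhone text, CompanyCategory text, CompanyDirection text)""")
cursor.executemany("INSERT INTO YellowCompanies VALUES (?,?,?,?,?,?)", rows)
connection.commit()
connection.close()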