Loop over pages with Python requests and Beautiful Soup - PullRequest
0 votes
/ 10 January 2020

Below is a complete script that crawls a website and saves the results to an SQLite DB.

The problem is that I can't get the loop over the pages to work: the data is saved for only one page, and the script effectively runs for a single page. How can I loop over the page requests and save all of the results together?
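For reference, a minimal sketch of the pattern being asked about, using the same /p<N> URL scheme as the full script below: collect rows from every page first, then write them to SQLite in a single batch after the loop. The Links table and its single column here are illustrative only:

import sqlite3
import requests
from bs4 import BeautifulSoup

rows = []
base = "https://www.yellowpages.com.eg/ar/condensed-category/%D8%B5%D8%AD%D9%81-%D9%88%D9%85%D8%AC%D9%84%D8%A7%D8%AA/p"
for page in range(1, 4):  # pages 1..3
    soup = BeautifulSoup(requests.get(base + str(page), timeout=30).content, "html.parser")
    for link in soup.find_all("a", attrs={"class": "companyTitle"}):
        rows.append(("https:" + link["href"],))  # one-column rows, for brevity

# write everything once, after the loop, so earlier pages are not re-inserted
conn = sqlite3.connect("SQLTestDB.db")
conn.execute("create table if not exists Links(CompanyLink text)")
conn.executemany("INSERT INTO Links VALUES (?)", rows)
conn.commit()
conn.close()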

# -*- coding: utf-8 -*-
import requests, re, time, sqlite3, Rotate_User_Agents  # csv/pandas were unused; Rotate_User_Agents is a local helper module
from random import choice
from bs4 import BeautifulSoup


# 1st things 1st : Extract all links you will scrape.
class SpiderFirstHand:
    def __init__(self):
        self.HideMyAss()     # pick a random proxy (stored in self.IP)
        self.GetUserAgent()  # build request headers with a rotated User-Agent
        # result accumulators, filled page by page
        self.Links = []
        self.Names = []
        self.Address = []
        self.Phone = []
        self.Category = []
        self.Directions = []

    def HideMyAss(self):
        # Scrape free-proxy-list.net and pick one random https proxy.
        # The proxy table has 8 columns; cells 0 and 1 of each row are IP and port.
        response = requests.get("https://free-proxy-list.net/")
        soup = BeautifulSoup(response.content, 'html.parser')
        cells = soup.findAll('td')
        proxies = [ip.text + ':' + port.text for ip, port in zip(cells[::8], cells[1::8])]
        self.IP = {'https': choice(proxies)}
        return self.IP

    def GetUserAgent(self):
        self.headers = {
            "Content-type": "application/x-www-form-urlencoded",
            # no extra quotes around the UA string: servers would otherwise see
            # a literal quoted value instead of the user agent itself
            "User-Agent": Rotate_User_Agents.RandomUA()
        }
        return self.headers

    def Source(self):
        STime = time.time()
        R_Number = 1
        PageStart = 1
        PageEnd = 3
        while PageStart < PageEnd:  # note: this crawls pages 1 .. PageEnd-1
            url = "https://www.yellowpages.com.eg/ar/condensed-category/%D8%B5%D8%AD%D9%81-%D9%88%D9%85%D8%AC%D9%84%D8%A7%D8%AA/p"+str(PageStart)
            # url = "https://ww:w.yellowpages.com.eg/ar/category/%D8%A7%D9%84%D9%82%D8%A7%D9%87%D8%B1%D8%A9-%D8%B9%D9%82%D8%A7%D8%B1%D8%A7%D8%AA/213/p" + str(PageStart) + "/sortByrates"
            # Passing proxies=self.IP changes the IP on every request, but it
            # makes each page load much slower, e.g.:
            # proxies = {'https': '182.53.197.87:61603'}
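            # A hedged sketch of per-request proxy rotation (assumes HideMyAss()
            # keeps returning live proxies; add retries/timeouts in practice):
            # PAGE = requests.get(url=url, headers=self.headers,
            #                     proxies=self.HideMyAss(), timeout=30)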
            PAGE = requests.get(url=url, headers=self.headers)
            CODE = PAGE.status_code
            SourceCode = PAGE.content
            R_Number += 1
            PageStart += 1
            # print ("Finish Request", CODE)
            bsObj = BeautifulSoup(SourceCode, "html.parser")
            ParentAllLinks = bsObj.find_all("a", attrs={"class": "companyTitle"})
            for link in ParentAllLinks:
                if not link['href']:
                    self.Links.append("N-A")
                else:
                    self.Links.append("https:" + link['href'])
            print ("Request Ref ", R_Number - 1)
            print ("Total company links collected so far : ", len(self.Links), " link(s)")
            Companies = bsObj.find_all("div", attrs={"class": "searchResultsDiv"})
            for Details in Companies:
                try:
                    C_Name = Details.find("p", attrs={"class": "companyTitle"}).find("strong").get_text()
                    C_Address = Details.find("a", attrs={"class": "col-md-9 company_address"}).get_text()
                    C_Category = Details.find("span", attrs={"class": "category"}).get_text()
                    C_Phone = Details.find_all("a", attrs={"class": "search-call-mob"})
                    C_Direction = Details.find_all("a", href=re.compile("^(https://maps.google)"))
                    if not C_Direction:
                        self.Directions.append("N/A")
                    else:
                        for Dir in C_Direction:
                            self.Directions.append(Dir['href'])
                    if not C_Phone:
                        self.Phone.append("N/A")
                    else:
                        for Phone in C_Phone:
                            self.Phone.append(Phone['href'])

                    if not C_Name:
                        self.Names.append(" ")
                    else:
                        self.Names.append(C_Name)
                    if not C_Address:
                        self.Address.append(" ")
                    else:
                        self.Address.append(C_Address)
                    if not C_Category:
                        self.Category.append(" ")
                    else:
                        self.Category.append(C_Category)

                except AttributeError as AE:
                    print ("AttributeError on this result block:", AE)

            print ("The length of links is : ", len(self.Links))
            print ("The length of Names is : ", len(self.Names))
            print ("The length of Address is : ", len(self.Address))
            print ("The length of Categories is : ", len(self.Category))
            print ("The length of Directions is : ", len(self.Directions))
            print ("The length of Phone is : ", len(self.Phone))

        # Save once, after ALL pages have been crawled. Writing inside the
        # while loop re-inserted every previously collected row on each pass,
        # filling the table with duplicates.
        ZIPPED = list(zip(self.Links, self.Names, self.Address, self.Phone, self.Category, self.Directions))
        connection = sqlite3.connect("SQLTestDB.db")
        cursor = connection.cursor()
        cursor.execute("""create table if not exists YellowCompanies(CompanyLink text,CompanyName text,CompanyAddress text,CompanyPhone text,CompanyCategory text,CompanyDirection text)""")
        # executemany inserts the whole list of row tuples in one call
        cursor.executemany("INSERT INTO YellowCompanies (CompanyLink,CompanyName,CompanyAddress,CompanyPhone,CompanyCategory,CompanyDirection) VALUES (?,?,?,?,?,?)", ZIPPED)
        connection.commit()
        connection.close()
        DTime = time.time() - STime
        print ("\n##########################################################\n"
               "\n#########    This mission is done through       ##########\n"
               "\n#########    {} seconds of time      ##########\n"
               "\n############################################# Heisenberg #\n"
               .format(DTime))


x = SpiderFirstHand()
x.Source()
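
Once the crawl finishes, a quick sanity check (a minimal sketch; assumes the script above has already created SQLTestDB.db) is to count the rows that actually landed in the table:

import sqlite3

conn = sqlite3.connect("SQLTestDB.db")
count = conn.execute("SELECT COUNT(*) FROM YellowCompanies").fetchone()[0]
print("rows saved:", count)  # roughly (results per page) * (pages crawled)
conn.close()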