Question

Я пытаюсь собрать данные с нескольких страниц, используя Scrapy и Selenium. Я успешно вхожу в систему с помощью драйвера Selenium, но когда мой паук начинает сбор данных, он не использует сеанс, авторизованный через Selenium, а собирает только те данные, которые доступны любому пользователю (не прошедшему проверку подлинности).

class Brother(Spider):
    """Spider that logs in through Selenium and walks the shop's category tree.

    Crawl order: top-menu categories (``parse``) -> sub-categories
    (``types``) -> product listing pages with pagination (``products``).
    """

    name = "spiderbrother"
    # NOTE(review): Scrapy's offsite filter reads ``allowed_domains`` (plural),
    # so this attribute presumably has no effect as spelled. It is left
    # untouched because renaming it would switch filtering on for whatever
    # value is configured here - confirm the real domain before renaming.
    allowed_domain = ["mywebsite"]
    start_urls = ['https://mywebsite../']
    custom_settings = {
        'ITEM_PIPELINES': {
            'Equipe.pipelines.Brother': 500
        },
        # Cookie handling must stay on so the login session is sent with
        # every follow-up request.
        'COOKIES_ENABLED': True
    }

    def parse(self, response):
        """Log in with Selenium, then request every top-menu category.

        The cookies of the logged-in browser are attached to the category
        requests so that Scrapy continues the authenticated session instead
        of crawling anonymously.
        """
        driver = webdriver.Firefox()
        try:
            driver.get("https://mywebsite../login")
            username = driver.find_element_by_id("email")
            password = driver.find_element_by_id("passwd")
            username.send_keys("myEmail")
            password.send_keys("MyPWD")
            driver.find_element_by_name("SubmitLogin").click()
            # NOTE(review): assumes the login round-trip has finished when
            # click() returns; add an explicit wait if cookies come back empty.
            session_cookies = {
                cookie['name']: cookie['value']
                for cookie in driver.get_cookies()
            }
        finally:
            # Always close the browser, even when the login steps raise.
            driver.quit()

        for category in response.xpath('//*[@id="leo-top-menu"]/ul/li/a'):
            href = category.xpath('@href').extract_first()
            # Check the raw href: urljoin() of a missing href returns the
            # current URL, which is always truthy.
            if not href:
                continue
            yield scrapy.Request(
                url=response.urljoin(href),
                cookies=session_cookies,
                callback=self.types,
            )

    def types(self, response):
        """Request every sub-category linked from a category page."""
        for sub_category in response.xpath('//*[@id="subcategories"]/div/div/div/h5/a'):
            href = sub_category.xpath('@href').extract_first()
            if not href:
                continue
            yield scrapy.Request(url=response.urljoin(href), callback=self.products)

    def products(self, response):
        """Yield one item per product on a listing page, then follow pagination."""
        for product in response.xpath('//div[@class="product-image-container image"]/a'):
            href = product.xpath('@href').extract_first()
            if not href:
                continue
            # NOTE(review): requests.get() is a separate HTTP client and does
            # not share Scrapy's cookies, so this page is fetched anonymously
            # and login-only fields (e.g. the price) may be missing. It also
            # blocks the crawler while downloading. Fetching the page with a
            # scrapy.Request and a dedicated callback avoids both problems.
            page = requests.get(response.urljoin(href)).text
            soup = BeautifulSoup(page, 'html.parser')
            item = TestItem()
            item["title"] = soup.find("h1").text
            item['image_url'] = soup.find("div", {"id": "image-block"}).img["src"]
            item['price'] = soup.find("span", {"id": "our_price_display"}).text

            # ``.text`` is a property; the former ``.text()`` call always
            # raised and the fallback value was used for every product.
            availability = soup.find("span", {"id": "availability_value"})
            if availability is not None:
                item['availability'] = availability.text
            else:
                item['availability'] = "Available"

            description = soup.find("div", {"itemprop": "description"})
            if description is not None:
                item['description'] = description.text.strip()
            else:
                item['description'] = "no description found"
            yield item

        # Only paginate when a "next" link exists; joining a missing href
        # would yield the current URL and re-request the same page.
        next_href = response.xpath('//li[@class="pagination_next"]/a/@href').extract_first()
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href), callback=self.products)

Я получаю все данные, кроме цены, потому что она доступна только после входа в систему.

Попытка использовать FormRequest для входа в систему вместо Selenium приводила к той же проблеме. Код — ниже.

Я вошел в систему с FormRequest

    def parse(self, response):
        """Submit the login form found in *response*.

        Fills the form matched by ``//*[@id='login_form']`` with the
        credentials and hands the server's reply to ``after_login``.
        """
        return FormRequest.from_response(
            response,
            formxpath="//*[@id='login_form']",
            formdata={'email': 'MyEmail', 'passwd': 'myPWD'},
            callback=self.after_login,
        )

def after_login(self, response):
    """Request every top-menu category found on the post-login page."""
    for category in response.xpath('//*[@id="leo-top-menu"]/ul/li/a'):
        href = category.xpath('@href').extract_first()
        # Check the raw href: urljoin() of a missing href returns the current
        # URL, which is always truthy and would re-request this page.
        if not href:
            continue
        yield Request(url=response.urljoin(href), callback=self.types)

Mohamed · Answer 1 · 22 марта 2019

Оказалось, что requests.get() открывал URL вне авторизованного сеанса, поэтому я стал запрашивать страницу через Request и вызывать новый метод parse_item(), в котором BeautifulSoup разбирает ответ, — и всё заработало.

Обновленный код

def products(self, response):
    """Request every product page on a listing page, then follow pagination.

    Product pages go through Scrapy's own ``Request`` so they are fetched
    with the spider's cookies; ``parse_item`` builds the item from each reply.
    """
    for product in response.xpath('//div[@class="product-image-container image"]/a'):
        href = product.xpath('@href').extract_first()
        if not href:
            continue
        yield Request(url=response.urljoin(href), callback=self.parse_item)

    # Only paginate when a "next" link exists; joining a missing href would
    # yield the current URL and re-request the same page.
    next_href = response.xpath('//li[@class="pagination_next"]/a/@href').extract_first()
    if next_href:
        yield Request(url=response.urljoin(next_href), callback=self.products)

def parse_item(self, response):
    """Build a ``TestItem`` from a product page.

    Title, image URL and price are read unconditionally, so a page that
    lacks one of those elements raises ``AttributeError``. Availability
    falls back to "Available" and the description is left unset when the
    corresponding element is missing.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    item = TestItem()
    item["title"] = soup.find("h1").text
    item['image_url'] = soup.find("div", {"id": "image-block"}).img["src"]
    item['price'] = soup.find("span", {"id": "our_price_display"}).text

    # ``.text`` is a property; the former ``.text()`` call always raised a
    # TypeError that the bare ``except`` swallowed, so every item was
    # reported as "Available" regardless of the page content.
    availability = soup.find("span", {"id": "availability_value"})
    if availability is not None:
        item['availability'] = availability.text
    else:
        item['availability'] = "Available"

    description = soup.find("div", {"itemprop": "description"})
    if description is not None:
        # Non-breaking spaces are normalised to plain spaces.
        item['description'] = description.text.strip().replace(u'\xa0', u' ')
    else:
        print("no description found")
    yield item

BeautifulSoup разбирает страницу без аутентифицированного сеанса

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

BeautifulSoup разбирает страницу без аутентифицированного сеанса

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Нет похожих вопросов