So I'm trying to collect some information about rental listings on Kijiji. My program keeps returning empty values where it shouldn't, and then it crashes. I've put together a simplified version of my program:
#-------------------------------------------------------------------------------
# General set up.
#-------------------------------------------------------------------------------
# Packages.
import os
import scrapy
import requests
import re
import time
from math import ceil
# Change directory.
os.chdir("C:/Users/Owner/Desktop/Python/Projects/Kijiji Rental Prices")
#-------------------------------------------------------------------------------
# Predefined variables.
#-------------------------------------------------------------------------------
# Areas in northern BC.
regions = ['cariboo-area']
# Kijiji Area codes in northern BC.
region_codes = ['c37l1700296']
# Region index, starting at zero.
i = 0
# General posting variables.
url = []
# Empty lists for individual post scraping.
rental_type = []
#-------------------------------------------------------------------------------
# Scraping general posting data.
#-------------------------------------------------------------------------------
# Loop to scrape general posting data.
while i < len(regions):
    # Set current page number to one.
    current_pg_num = 1
    # Web address.
    kijiji_page = "".join(["https://www.kijiji.ca/b-apartments-condos/", regions[i],
                           "/page-", str(current_pg_num), "/", region_codes[i],
                           "?ad=offering"])
    # Download HTML page.
    html = requests.get(kijiji_page).content
    # Create selector to determine number of results.
    pg_num = scrapy.Selector(text = html)
    pg_num = pg_num.xpath("normalize-space(//div[@class='showing'])")
    pg_num = pg_num.extract()[0]
    # Create page count number.
    pg_num_reg = re.compile(r"\d+")
    pg_num = pg_num_reg.findall(pg_num)
    pg_num = ceil(float(pg_num[2]) / float(pg_num[1]))
    # Loop that iterates through page numbers.
    while current_pg_num < pg_num + 1:
        # Download HTML page.
        html = requests.get(kijiji_page).content
        # Create selector object for organic rent ads.
        pg = scrapy.Selector(text = html)
        pg = pg.xpath("//div[@class='info-container']")
        # Select advertisement URL.
        xpath_temp = "normalize-space(.//a/@href)"
        posting_url = pg.xpath(xpath_temp).extract()
        # Create full URLs.
        for index, link in enumerate(posting_url):
            posting_url[index] = "https://www.kijiji.ca" + link
        url = url + posting_url
        # Increment the page number by one.
        current_pg_num += 1
        # Sleep for three seconds.
        time.sleep(3)
    # Increment the regions index by one.
    i += 1
#-------------------------------------------------------------------------------
# Individual posting scraping
#-------------------------------------------------------------------------------
# Loop to scrape individual postings.
for index, link in enumerate(url):
    posting_html = requests.get(url[index]).content
    # Selector for posting.
    posting_sel = scrapy.Selector(text = posting_html)
    # Select rental type from posting.
    xpath_temp = "(//div[@class='unitRow-1281171205']//span/text())[1]"
    posting_rental_type = posting_sel.xpath(xpath_temp).extract()
    rental_type.append(posting_rental_type[0])
    # Sleep for three seconds.
    time.sleep(3)
    # Increment the page number by one.
    current_pg_num += 1
I think it may have something to do with the actual requests to the server, but I'm not sure. Does anyone know what's going on here?
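For what it's worth, this is the kind of quick check I was thinking of adding to see whether the responses themselves are the problem. It's only a sketch: it reuses the first entry of my url list from the loop above, and the User-Agent value is just an arbitrary browser-like string I picked, not something I know Kijiji requires.

# Rough diagnostic: inspect one response and the selector result for a single posting URL.
import requests
import scrapy

test_url = url[0]  # assumes the first scraping loop above has already filled url
response = requests.get(test_url, headers={"User-Agent": "Mozilla/5.0"})  # arbitrary browser-like header
# A status other than 200, or a very small body, would point at the requests themselves.
print(response.status_code)
print(len(response.text))
# An empty list here would mean the request worked but the XPath matched nothing.
sel = scrapy.Selector(text = response.text)
print(sel.xpath("(//div[@class='unitRow-1281171205']//span/text())[1]").extract())

If the status code and body look fine, I'm guessing the problem is more likely in the XPath or that class name, but I haven't confirmed that.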