I'm having trouble figuring out how to get the date of each tennis match into my pipeline. The match date is not shown in the results table itself. The page does carry a date in an odd spot (the XPath '//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span/text()'), which could possibly be used.
Is there a way to pass along the date that is used as the URL parameters? Any help would be greatly appreciated!
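To make the question concrete, here is roughly the direction I have in mind (just a sketch of the idea, not something I have working, and the helper name date_from_url is made up): since the year, month and day are already in the query string of every request URL, something like this could pull the date back out, but I am not sure how, or whether, to hook it into the spider's parse callback.

# Sketch only: recover the date from the URL's query parameters.
# The function name date_from_url is illustrative, not part of my spider.
from urllib.parse import urlparse, parse_qs

def date_from_url(url):
    # e.g. 'https://www.tennisexplorer.com/matches/?type=all&year=2020&month=07&day=27&timezone=-6'
    params = parse_qs(urlparse(url).query)
    return "{}-{}-{}".format(params["year"][0], params["month"][0], params["day"][0])

# date_from_url(response.url) would then give something like '2020-07-27',
# which I could add to each scraped_info dictionary.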
My Spider:
import scrapy
import datetime
from datetime import timedelta


class Tennis_ExplorerSpider(scrapy.Spider):
    name = 'tennis_explorer'
    allowed_domains = ['tennisexplorer.com']

    def daterange(start_date, end_date):
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)

    start_date = datetime.datetime.today() - datetime.timedelta(days=1)
    end_date = datetime.datetime.today() + datetime.timedelta(days=1)

    start_urls = []
    start_url = 'https://www.tennisexplorer.com/matches/?type=all&year='
    for single_date in daterange(start_date, end_date):
        start_urls.append(single_date.strftime(start_url + "%Y&month=%m&day=%d&timezone=-6"))

    def parse(self, response):
        # self.logger.debug('callback "parse": got response %r' % response)
        # Extract the content using XPath
        time = response.xpath('//tr/td[1][@class="first time"]/text()').extract()
        match_id = response.xpath('substring-after(//tr/td[13]/a/@href, "/match-detail/?id=")').extract()
        player1 = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        player2 = response.xpath('//tr[not(contains(@class, "head"))]/td[1][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        player1_sets = response.xpath('//tr/td[3][@class="result"]/text()').extract()
        player2_sets = response.xpath('//tr/td[2][@class="result"]/text()').extract()

        # Emit the extracted content row-wise
        for item in zip(time, match_id, player1, player2, player1_sets, player2_sets):
            # Create a dictionary to store the scraped info
            scraped_info = {
                'time': item[0],
                'match_id': item[1],
                'player1': item[2],
                'player2': item[3],
                'player1_sets': item[4],
                'player2_sets': item[5],
            }
            # Yield the scraped info to Scrapy
            yield scraped_info
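The date the spider needs is already sitting in each start URL's query string, which is what makes me think it could be reused. Running just the URL-building part on its own (outside Scrapy) shows that; the dates in the example output below are hypothetical and depend on when it runs.

# Standalone version of the start_urls loop, just to show the URLs it builds.
import datetime
from datetime import timedelta

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = datetime.datetime.today() - datetime.timedelta(days=1)
end_date = datetime.datetime.today() + datetime.timedelta(days=1)
start_url = 'https://www.tennisexplorer.com/matches/?type=all&year='
for single_date in daterange(start_date, end_date):
    print(single_date.strftime(start_url + "%Y&month=%m&day=%d&timezone=-6"))

# e.g. (if run on 2020-07-28):
# https://www.tennisexplorer.com/matches/?type=all&year=2020&month=07&day=27&timezone=-6
# https://www.tennisexplorer.com/matches/?type=all&year=2020&month=07&day=28&timezone=-6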
For some reason I am getting the date one character at a time. Here is my updated code:
import scrapy
import datetime
import re
from datetime import timedelta


class Tennis_ExplorerSpider(scrapy.Spider):
    name = 'tennis_explorer'
    allowed_domains = ['tennisexplorer.com']
    # start_urls = 'https://www.tennisexplorer.com/results/?type=atp-single&year=2020&month=07&day=27'

    def daterange(start_date, end_date):
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)

    start_date = datetime.datetime.today()  # - datetime.timedelta(days=1)
    end_date = datetime.datetime.today() + datetime.timedelta(days=1)

    start_urls = []
    start_url = 'https://www.tennisexplorer.com/matches/?type=all&year='
    for single_date in daterange(start_date, end_date):
        start_urls.append(single_date.strftime(start_url + "%Y&month=%m&day=%d&timezone=-6"))

    def parse(self, response):
        # Extract the content using XPath
        self.logger.debug('callback "parse": got response %r' % response)
        data = response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]/@href').extract()
        match_id = [re.sub('^.+=', '', el) for el in data]
        data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". ", "-")
        date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime('%Y-%m-%d')
        time = response.xpath('//tr/td[1][@class="first time"]/text()').extract()
        # event_name = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()
        # event_id = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()
        # player1_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        player1 = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        # player1_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        # player2_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        player2 = response.xpath('//tr[not(contains(@class, "head"))]/td[1][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        # player2_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
        player1_sets = response.xpath('//tr/td[3][@class="result"]/text()').get()
        player1_set1 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[4][@class="score"]/text()').get()
        player1_set2 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[5][@class="score"]/text()').get()
        player1_set3 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[6][@class="score"]/text()').get()
        player1_set4 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[7][@class="score"]/text()').get()
        player1_set5 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[8][@class="score"]/text()').get()
        player2_sets = response.xpath('//tr/td[2][@class="result"]/text()').get()
        player2_set1 = response.xpath('//tr[contains(@id, "b")]/td[3][@class="score"]/text()').get()
        player2_set2 = response.xpath('//tr[contains(@id, "b")]/td[4][@class="score"]/text()').get()
        player2_set3 = response.xpath('//tr[contains(@id, "b")]/td[5][@class="score"]/text()').get()
        player2_set4 = response.xpath('//tr[contains(@id, "b")]/td[6][@class="score"]/text()').get()
        player2_set5 = response.xpath('//tr[contains(@id, "b")]/td[7][@class="score"]/text()').get()

        # Emit the extracted content row-wise
        for item in zip(match_id, date, time, player1, player2, player1_sets,
                        player1_set1, player1_set2, player1_set3, player1_set4, player1_set5,
                        player2_sets, player2_set1, player2_set2, player2_set3, player2_set4, player2_set5):
            # Create a dictionary to store the scraped info
            scraped_info = {
                'match_id': item[0],
                'date': item[1],
                'time': item[2],
                # 'event_name': item[0],
                'player1': item[3],
                'player2': item[4],
                'player1_sets': item[5],
                'player1_set1': item[6],
                'player1_set2': item[7],
                'player1_set3': item[8],
                'player1_set4': item[9],
                'player1_set5': item[10],
                'player2_sets': item[11],
                'player2_set1': item[12],
                'player2_set2': item[13],
                'player2_set3': item[14],
                'player2_set4': item[15],
                'player2_set5': item[16],
            }
            # Yield the scraped info to Scrapy
            yield scraped_info
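To show what I mean by "one character at a time", here is a minimal example outside Scrapy that reproduces the behaviour I am seeing: date ends up as a single string, and zip walks a string character by character, so each row only ever gets one character of it. The sample values below are made up for illustration.

# Minimal reproduction of the symptom, outside Scrapy.
date = "2020-07-27"                 # a single string, like the value built from the page header
match_id = ["100", "200", "300"]    # a list, like the values returned by .extract()

for item in zip(match_id, date):
    print(item)
# ('100', '2')
# ('200', '0')
# ('300', '2')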