I am trying to copy data from several pages of the same site into a single csv file ;)
The script works by reading the URLs I put into url.txt and then saving the scraped data to csv files (at the moment it writes one file per URL).
I am trying to figure it out on my own, but I need the help of the bright minds of this community.
As you can see from the code, I am trying to scrape data from kakaku.com (a Japanese website).
'''
import os
import sys
import csv
import codecs
import requests
from bs4 import BeautifulSoup
# scraping function for kakaku.com / old page layout (URLs containing "aspx")
def kakaku_scraper_o(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="ckitemLink")
        prices_temp = soup.find_all(class_="pryen")
        links_temp = soup.find_all(class_="ckitanker")
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        # one CSV per URL, named after the next-to-last path segment
        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                # write plain strings: the file is already opened as UTF-8,
                # encoding to bytes here would put b'...' literals in the CSV
                csvWriter.writerow([links[i], titles[i], prices[i]])
# scraping function for kakaku.com / new page layout
def kakaku_scraper_n(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="p-list_name")
        prices_temp = soup.find_all(class_="p-list_price_data_price_num-1 p-num")
        links_temp = soup.find_all(class_='p-list_name')
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            # the product link sits inside the name element
            links_temp[i] = links_temp[i].find("a")
            links.append("https://kakaku.com" + str(links_temp[i]['href']))
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])
# scraping function for bestgate.net
def bestgate_scraper(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="name")
        prices_temp = soup.find_all(class_="price")
        links_temp = soup.find_all(class_='name')
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        filename = u.split("/")
        filename = filename[-2] + "_bestgate.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])
# main: read the URL list, sort the URLs by target site, then run each scraper
if __name__ == '__main__':
    with open("url.txt", mode='r', newline='') as urlfile:
        url = urlfile.read().splitlines()
    print(url)
    # sort out the links for each website's function
    kko = []  # kakaku.com, old layout (URLs containing "aspx")
    kkn = []  # kakaku.com, new layout
    btg = []  # bestgate.net
    for u in url:
        if "kakaku" in u:
            if "aspx" in u:
                kko.append(u)
            else:
                kkn.append(u)
        else:
            btg.append(u)
    bestgate_scraper(btg)
    kakaku_scraper_o(kko)
    kakaku_scraper_n(kkn)
'''
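What I am trying to end up with is one csv for all of the pages instead of one file per URL. Below is a rough sketch of the direction I have in mind, reusing the new-layout kakaku.com selectors from my code above; the scrape_page helper and the combined_kakaku.csv file name are just placeholders I made up for the example, not anything final:
'''
import csv
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebkit/537.36 (KHTML, like Gecko) "
                         "Chrome/75.0.3770.100 Safari/537.36"}

def scrape_page(url):
    """Return a list of [link, title, price] rows for one kakaku.com page (new layout)."""
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, "html.parser")
    rows = []
    names = soup.find_all(class_="p-list_name")
    prices = soup.find_all(class_="p-list_price_data_price_num-1 p-num")
    for name, price in zip(names, prices):
        a = name.find("a")
        if a is None:
            continue
        rows.append(["https://kakaku.com" + a["href"],
                     name.get_text(strip=True),
                     price.get_text(strip=True)])
    return rows

if __name__ == "__main__":
    with open("url.txt") as urlfile:
        urls = urlfile.read().splitlines()
    # open the combined file once, write the header once,
    # then append the rows of every page to the same writer
    with open("combined_kakaku.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Link", "Title", "Price"])
        for u in urls:
            writer.writerows(scrape_page(u))
'''
The idea is to open the combined file and write the header once, then append every page's rows to the same csv.writer. Is this the right way to do it, or is there a better pattern?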