To scrape all the posts you don't need Selenium. You can do everything with the Requests and BeautifulSoup libraries:
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Request the first 50 items: pg=1 is the page number, outmax=50 is items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")
    # Parse the HTML with BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")
    # Find the "go to last page" element and read its "onclick" attribute;
    # the digits inside it are the last page number for outmax=50 (used above)
    onclick = str(page.select(".paging > a:last-child")[0].attrs["onclick"])
    last_page_number = int(''.join([n for n in onclick if n.isdigit()]))
    # For a quick test, uncomment the line below to get items from the first page only
    # last_page_number = 1
    # Go through all pages and collect the post numbers in items
    items = []
    for i in range(1, last_page_number + 1):
        if i > 1:
            # Request the next page and re-parse it
            response = requests.post(
                "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=%d&fsort=21,10,30&precSeq=0&dtlYn=N" % i)
            page = BeautifulSoup(response.text, "html.parser")
        # Get all result links on the current page
        links = page.select("#viewHeightDiv .s_tit a")
        # Parse the post number out of each link's "onclick" attribute
        for link in links:
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))
    # Open every post and collect them in the posts list as dicts with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        text = BeautifulSoup(response.text, "html.parser").find('div', attrs={'id': 'contentBody'}).text
        posts.append({'number': item, 'url': url, 'text': text})
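The site returns thousands of result pages, so the loops above fire a long, uninterrupted burst of requests. If the server starts rejecting them, a throttled fetch with retries can help. Below is a minimal sketch; the fetch_with_retry helper and its retries/delay/timeout parameters are my own additions, not part of the original code (you can adapt it for the requests.post calls in the same way):

import time
import requests

def fetch_with_retry(url, retries=3, delay=1.0, timeout=10):
    # Hypothetical helper: retry transient failures, pausing between attempts
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage: response = fetch_with_retry(url)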
To save each post to its own file, change the last part of the code as shown below, replacing /yourfullpath/ with your own path, e.g. "C:/files/" or "/Users/myuser/files/":
# Open every post and collect them in the posts list as dicts with keys: number, url, text and title
posts = []
for item in items:
    url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, "html.parser")
    text = parsed.find('div', attrs={'id': 'contentBody'}).text
    title = parsed.select_one("h2").text
    posts.append({'number': item, 'url': url, 'text': text, 'title': title})
    # Write each post to its own file; the content is Korean, so force UTF-8
    with open('/yourfullpath/' + title + '.txt', 'w', encoding='utf-8') as f:
        f.write(text)
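One caveat: case titles can contain characters that are illegal in file names (slashes, colons, quotation marks), which would make the open() call fail. Below is a minimal sketch of a sanitizer you could apply to title first; the safe_filename helper and its max_length parameter are my own additions, not part of the original code:

import re

def safe_filename(title, max_length=100):
    # Replace characters that common file systems reject and trim the length
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    return cleaned[:max_length] or 'untitled'

# Usage: open('/yourfullpath/' + safe_filename(title) + '.txt', 'w', encoding='utf-8')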