Я пытаюсь написать скрейпер, который собирает 10 самых распространённых слов вместе с их количеством из каждой статьи блога, но у меня возникла проблема с Counter.
Собранные данные должны сохраняться в базу данных. Также нужен REST API, возвращающий следующую статистику в виде документа JSON:
10 самых распространённых слов с их количеством по адресу /stats/
10 самых распространённых слов с их количеством для каждого автора по адресу /stats/<author>/
Я попробовал использовать Counter, как показано ниже:
# str.split() with no arguments breaks the text on runs of whitespace and
# returns the list of words.
# NOTE(review): this requires `contents` to be a single string; if it is a
# list of scraped elements, join their text first - confirm against the code
# that builds `contents`.
split_it = contents.split()
# Build the frequency table. The result is deliberately bound to a name
# other than ``Counter``: rebinding ``Counter`` would shadow the class
# imported from ``collections``, and every later ``Counter(...)`` call would
# fail with "TypeError: 'Counter' object is not callable".
word_counts = Counter(split_it)
# most_common(10) returns up to ten (word, count) pairs, most frequent first.
most_occur = word_counts.most_common(10)
print(most_occur)
Ниже полный код моего скрейпера:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from collections import Counter
from sqlalchemy.dialects.postgresql import psycopg2
# Collect the URL of every article linked from the paginated blog index.
# Page 1 is fetched from /blog/ itself; pages 2..N use this template.
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0'
}


def _extract_article_links(page_soup):
    """Return the article URLs linked from inside ``.post-content`` elements."""
    # The first two characters of every href are dropped before the site
    # root is prepended (presumably a leading ".." - TODO confirm).
    return ['https://teonite.com' + item['href'][2:]
            for item in page_soup.select('.post-content a')]


with requests.Session() as s:
    # The headers were previously defined but never sent; attach them to the
    # session so every request below carries them.
    s.headers.update(headers)

    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = _extract_article_links(soup)
    # extend() keeps all_links flat, so no flattening pass is needed later.
    all_links.extend(article_links)

    # The pager text is split on '/' and the second part is read as the
    # total number of index pages.
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = _extract_article_links(soup)
        all_links.extend(article_links)
# Render every collected article in Chrome and pull out its body and author
# elements.
d = webdriver.Chrome(ChromeDriverManager().install())
contents = []
authors = []
try:
    for article in all_links:
        d.get(article)
        soup = bs(d.page_source, 'lxml')

        # Drop non-visible elements before reading the page text. A plain
        # loop is used because extract() is called only for its side effect.
        for t in soup(['style', 'script', '[document]', 'head', 'title']):
            t.extract()
        visible_text = soup.getText()

        # NOTE: these are element objects (or None when the selector does
        # not match), not plain strings.
        content = soup.find('section', attrs={'class': 'post-content'})
        contents.append(content)
        author = soup.find('span', attrs={'class': 'author-content'})
        authors.append(author)

        title = soup.select_one('.post-title')
        if title is not None:
            print(title.text)
            continue

        # No '.post-title' on this page: report the URL and the <h1> text
        # (when there is one), then stop.
        print(article)
        fallback_title = soup.select_one('h1')
        if fallback_title is not None:
            print(fallback_title.text)
        break  # for debugging
finally:
    # Always shut the browser down, even if a page load or parse fails.
    d.quit()

# De-duplicate once after the loop instead of rebuilding on every iteration.
unique_authors = list(set(authors))
unique_contents = list(set(contents))
# POSTGRESQL CONNECTION
# Connect to the PostgreSQL database at `hostname` using psycopg2.
import psycopg2

# Connection settings are defined once and passed to connect() below, rather
# than being repeated as separate string literals in the call.
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file that is kept out of version control.
hostname = 'balarama.db.elephantsql.com'
username = 'yagoiucf'
# NOTE(review): the file previously contained two different (redacted)
# password literals; the one that was actually passed to connect() is kept.
password = 'jXoWg8Hc8FthwIxxxxxxxxxxxxxxxxx'
database = 'yagoiucf'

conn = psycopg2.connect(host=hostname, user=username,
                        password=password, dbname=database)
# Nothing is read or written yet; the connection is closed straight away.
conn.close()