Python webscraping удалить дубликаты электронных писем - PullRequest
0 голосов
/ 10 октября 2018

Как я могу удалить дубликаты писем из этого кода? Сработает ли set()? Я попробовал кое-что, но без удачи.

import requests
from bs4 import BeautifulSoup as soup
def get_emails(_links: list):
    """Yield each unique e-mail address found on the given school pages.

    For every URL in *_links* the page is fetched and the last
    ``<a class="my_modal_open">`` anchor's ``title`` attribute (which holds
    the e-mail on this site) is yielded.  A ``seen`` set guarantees each
    address is yielded at most once — this fixes the duplicate problem the
    original code had.
    """
    seen = set()
    for link in _links:
        anchors = soup(requests.get(link).text, 'html.parser').find_all(
            'a', {'class': 'my_modal_open'})
        if anchors:
            email = anchors[-1]['title']
            if email not in seen:
                seen.add(email)
                yield email

start = 20
printed = set()  # e-mails already printed, so repeats across pages are skipped
while True:
    # Fetch one listing page; the site paginates via the ?start= offset.
    page = soup(
        requests.get(
            'http://www.schulliste.eu/type/gymnasien/?bundesland=&start={page_id}'.format(page_id=start)
        ).text,
        'html.parser',
    )
    # Slice off navigation links at both ends, then keep absolute http:// links only.
    results = [a['href'] for a in page.find_all('a')][52:-9]
    results = [link for link in results if link.startswith('http://')]
    if not results:
        # No school links on this page — stop instead of looping forever.
        break

    for item in get_emails(results):
        if item not in printed:
            printed.add(item)
            print(item)

    # NOTE(review): the third positional argument of bs4's find() is
    # `recursive`, so 'weiter' is NOT matched as link text here — this call
    # just finds the paging div.  The author probably meant
    # find('a', string='weiter'); kept as-is to preserve behavior — verify
    # against the site before relying on termination via this check.
    next_page = page.find('div', {'class': 'paging'}, 'weiter')
    if next_page:
        start += 20
    else:
        break

Есть ли способ просто получить адрес электронной почты один раз?

1 Ответ

0 голосов
/ 10 октября 2018

Код написан довольно запутанно. Вот простой пример разбора одной страницы для получения писем:

import requests
from lxml.html import fromstring


def start(link):
    """Return a de-duplicated list of e-mail addresses found on *link*.

    E-mails are taken from the ``title`` attribute of ``<a>`` tags (that is
    where this site stores them).  ``set()`` removes duplicates.

    Returns an empty list when the page cannot be fetched — the original
    returned ``None`` implicitly on a non-200 status, which made
    ``for i in start(...)`` crash with a TypeError.
    """
    response = requests.get(link)
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    emails = []
    for title in tree.xpath('//a/@title'):
        if '@' in title:
            emails.append(title)
    return list(set(emails))


if __name__ == "__main__":
    for i in start("http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"):
        print(i)

Или вы можете извлекать письма из URL с помощью регулярных выражений, например:

import requests
import re
from lxml.html import fromstring


def start(link):
    """Return a de-duplicated list of e-mails extracted from hrefs on *link*.

    Each ``<a>`` href is scanned for an ``email=...`` query parameter and the
    address is pulled out with a regex; ``set()`` removes duplicates.

    Returns an empty list when the page cannot be fetched — the original
    returned ``None`` implicitly on a non-200 status, which made
    ``for i in start(...)`` crash with a TypeError.
    """
    response = requests.get(link)
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    # Compile once, outside the loop; matches the value of an email= query
    # parameter up to the next '&' or end of string.
    email_re = re.compile('(?<=email=).+@.+?(?=(&|$))')
    emails = []
    # Loop variable renamed from `link` — it shadowed the function parameter.
    for href in tree.xpath('//a/@href'):
        match = email_re.search(href)
        if match:
            emails.append(match.group())
    return list(set(emails))


if __name__ == "__main__":
    for i in start("http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"):
        print(i)

И весь код для сбора списка адресов со всех школ:

import requests
import random
from time import sleep
from lxml.html import fromstring


def get_all_schools_urls(url='http://www.schulliste.eu/type/gymnasien/', paginate_by=20, ping=None):
    """Walk the paginated school listing and return all unique school URLs.

    Pagination advances by *paginate_by* via the ``?start=`` offset and stops
    at the first page that yields no school links.  If *ping* is given, it is
    unpacked into ``random.randrange`` to sleep a random delay before each
    request.  Raises ``ConnectionError`` on any non-200 response.
    """
    collected = []
    offset = 0
    while True:
        if ping:
            sleep(random.randrange(*ping))

        page_url = '{0}?start={1}'.format(url, offset)

        print('\tCollecting urls from {0}'.format(page_url))
        response = requests.get(page_url)
        if response.status_code != 200:
            raise ConnectionError

        found = fromstring(response.text).xpath('//div[@class="school_name"]/a/@href')
        if not found:
            break
        collected += found
        print('\t\tFound urls {0}'.format(len(found)))

        offset += paginate_by

    return list(set(collected))


def get_emails(urls):
    """Fetch each school page in *urls* and return its unique e-mails.

    E-mails live in the ``title`` attribute of ``<a>`` tags on this site;
    anything containing ``@`` is collected.  Raises ``ConnectionError`` on
    any non-200 response.
    """
    found = set()

    for page_url in urls:
        print('\tCollecting e-mails from {0}'.format(page_url))
        response = requests.get(page_url)
        if response.status_code != 200:
            raise ConnectionError
        for title in fromstring(response.text).xpath('//a/@title'):
            if '@' in title:
                found.add(title)

    return list(found)


def start(output_urls, output_emails):
    """Crawl all school URLs, then their e-mails, writing each list to a file.

    :param output_urls: path of the text file for collected school URLs
    :param output_emails: path of the text file for collected e-mails
    """
    print("Starting collection of school urls")
    schools_urls = get_all_schools_urls()
    print("Collected {0} schools urls".format(len(schools_urls)))
    with open(output_urls, 'w') as file:
        file.write("\n".join(schools_urls))
    print("Schools urls saved: {0}".format(output_urls))

    print("Starting collection of school emails")
    schools_emails = get_emails(schools_urls)
    print("Collected {0} schools emails".format(len(schools_emails)))
    with open(output_emails, 'w') as file:
        file.write("\n".join(schools_emails))
    # BUG FIX: original printed output_urls here instead of output_emails.
    print("Schools e-mails saved: {0}".format(output_emails))


if __name__ == "__main__":
    start('schools_urls.txt', 'schools_emails.txt')
...