Ты пишешь так нелепо. Вот расширенный пример парсинга одной страницы для получения адресов электронной почты:
import requests
from lxml.html import fromstring
def start(link):
    """Collect the unique e-mail addresses advertised on a single page.

    The page is fetched with ``requests`` and parsed with ``lxml``. Every
    ``title`` attribute of an ``<a>`` element that contains ``@`` is treated
    as an e-mail address.

    Args:
        link: URL of the page to scan.

    Returns:
        A list of the distinct matching ``title`` values, in arbitrary
        order. An empty list is returned when the server does not answer
        with HTTP 200, so callers can always iterate over the result.
    """
    response = requests.get(link)
    if response.status_code != 200:
        # Return an empty list rather than falling through to None, so that
        # ``for address in start(...)`` never fails with a TypeError.
        return []
    tree = fromstring(response.text)
    titles = tree.xpath('//a/@title')
    # A set drops duplicates; the order of the result is therefore not stable.
    return list({title for title in titles if '@' in title})
if __name__ == "__main__":
    # Demo run: print every address found on one sample school page.
    sample_page = "http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"
    for address in start(sample_page):
        print(address)
Или можно извлекать адреса электронной почты из URL ссылок с помощью регулярных выражений, например:
import requests
import re
from lxml.html import fromstring
def start(link):
    """Collect unique e-mail addresses from the ``email=`` query parameter of links.

    The page is fetched with ``requests`` and parsed with ``lxml``. The
    ``href`` of every ``<a>`` element is searched for an ``email=`` parameter
    whose value contains ``@``.

    Args:
        link: URL of the page to scan.

    Returns:
        A list of the distinct addresses found, in arbitrary order. An empty
        list is returned when the server does not answer with HTTP 200, so
        callers can always iterate over the result.
    """
    response = requests.get(link)
    if response.status_code != 200:
        # Return an empty list rather than falling through to None, so that
        # ``for address in start(...)`` never fails with a TypeError.
        return []
    tree = fromstring(response.text)
    # ``[^&\n]`` keeps the match inside the ``email`` parameter. A greedy
    # ``.+@`` could run across ``&`` and swallow later query parameters
    # whenever one of them also contains ``@``.
    email_param = re.compile(r'(?<=email=)[^&\n]+@[^&\n]+')
    emails = set()
    for href in tree.xpath('//a/@href'):
        match = email_param.search(href)
        if match:
            emails.add(match.group())
    return list(emails)
if __name__ == "__main__":
    # Demo run: print each address extracted from the links of one school page.
    school_page = "http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"
    for found_email in start(school_page):
        print(found_email)
И полный код для сбора адресов электронной почты со всех школ:
import requests
import random
from time import sleep
from lxml.html import fromstring
def get_all_schools_urls(url='http://www.schulliste.eu/type/gymnasien/', paginate_by=20, ping=None):
    """Walk the paginated school listing and gather the school page links.

    Pages are requested as ``<url>?start=<offset>``, with the offset growing
    by ``paginate_by`` each time. The walk stops at the first page that has
    no ``div.school_name`` links.

    Args:
        url: Base URL of the listing.
        paginate_by: Amount added to the ``start`` offset after each page.
        ping: Optional arguments unpacked into ``random.randrange`` to pick
            how many seconds to sleep before each request. A falsy value
            disables the delay.

    Returns:
        A list of the distinct ``href`` values found, in arbitrary order.

    Raises:
        ConnectionError: If a listing page does not answer with HTTP 200.
    """
    collected = []
    start_index = 0
    while True:
        if ping:
            sleep(random.randrange(*ping))
        page_url = '{0}?start={1}'.format(url, start_index)
        print('\tCollecting urls from {0}'.format(page_url))
        page = requests.get(page_url)
        if page.status_code != 200:
            raise ConnectionError
        hrefs = fromstring(page.text).xpath('//div[@class="school_name"]/a/@href')
        if not hrefs:
            # An empty page marks the end of the listing.
            return list(set(collected))
        collected.extend(hrefs)
        print('\t\tFound urls {0}'.format(len(hrefs)))
        start_index += paginate_by
def get_emails(urls):
    """Visit every school page and gather the e-mail addresses it shows.

    An address is any ``title`` attribute of an ``<a>`` element that
    contains ``@``.

    Args:
        urls: Iterable of page URLs to fetch.

    Returns:
        A list of the distinct addresses found, in arbitrary order.

    Raises:
        ConnectionError: If a page does not answer with HTTP 200.
    """
    found = []
    for page_url in urls:
        print('\tCollecting e-mails from {0}'.format(page_url))
        page = requests.get(page_url)
        if page.status_code != 200:
            raise ConnectionError
        link_titles = fromstring(page.text).xpath('//a/@title')
        found.extend([text for text in link_titles if '@' in text])
    return list(set(found))
def start(output_urls, output_emails):
    """Scrape all school URLs and e-mail addresses and save each list to a file.

    Args:
        output_urls: Path of the file that receives the school page URLs,
            one per line.
        output_emails: Path of the file that receives the collected e-mail
            addresses, one per line.

    Raises:
        ConnectionError: Propagated from the collectors when a page does
            not answer with HTTP 200.
    """
    print("Starting collection of school urls")
    schools_urls = get_all_schools_urls()
    print("Collected {0} schools urls".format(len(schools_urls)))
    # Explicit UTF-8 so non-ASCII characters cannot fail on a locale codec.
    with open(output_urls, 'w', encoding='utf-8') as urls_file:
        urls_file.write("\n".join(schools_urls))
    print("Schools urls saved: {0}".format(output_urls))

    print("Starting collection of school emails")
    schools_emails = get_emails(schools_urls)
    print("Collected {0} schools emails".format(len(schools_emails)))
    with open(output_emails, 'w', encoding='utf-8') as emails_file:
        emails_file.write("\n".join(schools_emails))
    # Report the e-mail file here, not the URL file.
    print("Schools e-mails saved: {0}".format(output_emails))
if __name__ == "__main__":
    # Script entry point: both result files are written relative to the
    # current working directory.
    start(output_urls='schools_urls.txt', output_emails='schools_emails.txt')