Код ниже собирает необходимые данные. Работает для 2 из 3 URL.
import requests
from bs4 import BeautifulSoup
# Script: download each page in URLS and collect its "description" and
# "keywords" <meta> tags into collected_data (one dict per fully described page).

URLS = ['http://www.astro.com', 'http://www.supermap.com', 'http://www.itc.com']
# Names of the <meta> tags to collect. Matching below is case-insensitive, so
# the capitalised variants are redundant; they are kept so that other code
# reading ATTRIBUTES keeps seeing the same list.
ATTRIBUTES = ['description', 'keywords', 'Description', 'Keywords']
# Seconds to wait for a server. Without a timeout requests.get() can block
# forever on an unresponsive host.
REQUEST_TIMEOUT = 10

collected_data = []

# Lower-cased set of required names: authors write "Description",
# "DESCRIPTION", etc., and every entry key is stored lower-cased anyway.
required_names = set(attribute.lower() for attribute in ATTRIBUTES)

for url in URLS:
    entry = {'url': url}
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs: report and move on.
        print('Could not load page {}. Reason: {}'.format(url, str(e)))
        continue
    if r.status_code != 200:
        print('Could not load page {}.Reason: {}'.format(url, r.status_code))
        continue

    # NOTE(review): BeautifulSoup guesses the character encoding from the raw
    # bytes; pages served in legacy encodings may come out mis-decoded.
    # Consider passing from_encoding=... if that is observed -- confirm.
    soup = BeautifulSoup(r.content, 'html.parser')
    for meta in soup.find_all("meta"):
        # Normalise the tag name so that case and stray whitespace do not
        # cause a silent miss.
        name = (meta.attrs.get('name') or '').strip().lower()
        if name not in required_names:
            continue
        # A <meta name="..."> tag without a content attribute carries no
        # value; skip it instead of raising KeyError.
        content = meta.attrs.get('content')
        if content is None:
            continue
        entry[name] = content

    # Keep the page only when every required attribute was found.
    if all(name in entry for name in required_names):
        collected_data.append(entry)
    else:
        print('Could not find all required attributes for URL {}'.format(url))

print('Collected meta attributes (TODO - push to DB):')
for entry in collected_data:
    print(entry)
Вывод:
Could not find all required attributes for URL http://www.itc.com
Collected meta attributes (TODO - push to DB):
{'url': 'http://www.astro.com', u'keywords': u'Astrodienst, horoscope, horoscopes, horoscope 2013, free horoscopes, daily horoscope, astrology, love, aries, taurus, gemini, cancer, leo, virgo, libra, scorpio, sagittarius, capricorn, aquarius, pisces, zodiac, starsigns, forecast, yearly horoscope, liz greene, robert hand, sunsign, birth chart', u'description': u"Free Astrology and Horoscopes from Astrodienst! Get your free horoscope - and much more! Astrodienst provides the world's best astrology site for free horoscopes, professional astrological reports and information about astrology."}
{'url': 'http://www.supermap.com', u'keywords': u'\xb5\xd8\xc0\xed\xd0\xc5\xcf\xa2,GIS,\xb5\xd8\xc0\xed\xd0\xc5\xcf\xa2\xcf\xb5\xcd\xb3,\xd4\xc6GIS,\xc8\xfd\xce\xacGIS,\xd2\xc6\xb6\xafGIS,\xbf\xd5\xbc\xe4\xb4\xf3\xca\xfd\xbe\xdd,GIS\xbd\xe2\xbe\xf6\xb7\xbd\xb0\xb8,GIS\xb0\xb8\xc0\xfd', u'description': u'\xb1\xb1\xbe\xa9\xb3\xac\xcd\xbc\xc8\xed\xbc\xfe\xd6\xf7\xd2\xaa\xb4\xd3\xca\xc2\xb5\xd8\xc0\xed\xd0\xc5\xcf\xa2\xcf\xb5\xcd\xb3 (GIS)\xbb\xf9\xb4\xa1\xc8\xed\xbc\xfe\xc6\xbd\xcc\xa8\xd1\xd0\xbe\xbf\xa1\xa2\xbf\xaa\xb7\xa2\xba\xcd\xcf\xfa\xca\xdb\xa3\xac\xce\xaa\xd5\xfe\xb8\xae\xba\xcd\xc6\xf3\xd2\xb5\xcc\xe1\xb9\xa9\xb5\xd8\xc0\xed\xbf\xd5\xbc\xe4\xd0\xc5\xcf\xa2\xbc\xbc\xca\xf5\xb5\xc4\xd7\xc9\xd1\xaf\xb7\xfe\xce\xf1\xa1\xa3SuperMap GIS\xcf\xb5\xc1\xd0\xc8\xed\xbc\xfe\xd6\xf7\xd2\xaa\xb0\xfc\xc0\xa8\xd4\xc6GIS\xc6\xbd\xcc\xa8\xc8\xed\xbc\xfe\xa1\xa2\xd7\xe9\xbc\xfeGIS\xbf\xaa\xb7\xa2\xc6\xbd\xcc\xa8\xa1\xa2\xd2\xc6\xb6\xafGIS\xbf\xaa\xb7\xa2\xc6\xbd\xcc\xa8\xa1\xa2\xd7\xc0\xc3\xe6GIS\xc6\xbd\xcc\xa8\xa1\xa2\xcd\xf8\xc2\xe7\xbf\xcd\xbb\xa7\xb6\xcbGIS\xbf\xaa\xb7\xa2\xc6\xbd\xcc\xa8\xd2\xd4\xbc\xb0\xcf\xe0\xb9\xd8\xb5\xc4\xbf\xd5\xbc\xe4\xca\xfd\xbe\xdd\xc9\xfa\xb2\xfa\xa1\xa2\xbc\xd3\xb9\xa4\xba\xcd\xb9\xdc\xc0\xed\xb9\xa4\xbe\xdf\xa1\xa3\xbe\xad\xb9\xfd\xb6\xfe\xca\xae\xc4\xea\xb7\xa2\xd5\xb9\xa3\xacSuperMap GIS\xd2\xd1\xbe\xad\xb3\xc9\xce\xaa\xb2\xfa\xc6\xb7\xc3\xc5\xc0\xe0\xc6\xeb\xc8\xab\xa1\xa2\xb9\xa6\xc4\xdc\xc7\xbf\xb4\xf3\xa1\xa2\xb8\xb2\xb8\xc7\xd0\xd0\xd2\xb5\xb7\xb6\xce\xa7\xb9\xe3\xb7\xba\xa1\xa2\xc2\xfa\xd7\xe3\xb8\xf7\xc0\xe0\xd0\xc5\xcf\xa2\xcf\xb5\xcd\xb3\xbd\xa8\xc9\xe8\xb5\xc4GIS\xc8\xed\xbc\xfe\xc6\xb7\xc5\xc6\xa3\xac\xb2\xa2\xc9\xee\xc8\xeb\xb5\xbd\xb9\xfa\xc4\xda\xb8\xf7\xb8\xf6GIS\xd0\xd0\xd2\xb5\xd3\xa6\xd3\xc3\xa1\xa3'}