Я написал скрейпер (парсер), который собирает данные с веб-сайта. Сейчас мой код записывает данные в файл Excel, а также читает и обновляет этот файл. Сначала код читает базу данных Excel, чтобы обновить в листе уже имеющуюся информацию, а если на веб-сайте есть новая информация, которой ещё нет в базе данных Excel, она добавляется в лист.
Код приведён ниже:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
from selenium.webdriver.chrome.options import Options
import logging
# Module-level state shared by main_url(), each_event() and main().
# Each name below is bound to its own, initially empty list.

# Values scraped from the listing page.
links, pics, types, names, descs = [], [], [], [], []
views, no_speakers, location, dates = [], [], [], []

# Not referenced by any function visible in this file.
people = []

# Values scraped from each event's own page.
organization, summ, twitter, facebook = [], [], [], []
contact, emails, website_link, venue = [], [], [], []
official_address, speakers, fees, at_tr = [], [], [], []

# Event links as they were stored before this run.
prev_links = []

# Placeholder position; the functions assign their own local ``index``.
index = -1

# Links of events whose stored values changed during this run.
update = []
def _listing_has_tag(element, tag_name):
    """Return True when ``element`` contains a descendant with ``tag_name``."""
    try:
        element.find_element_by_tag_name(tag_name)
    except NoSuchElementException:
        return False
    return True


def _store_listing_field(column, position, value, event_link):
    """Overwrite ``column[position]`` if it differs and flag the event as updated."""
    if column[position] == value:
        return
    column[position] = value
    if event_link not in update:
        update.append(event_link)


def _read_listing_row(row):
    """Read every listing-page value of one event row without storing anything.

    Returns ``(title, href, fields)`` where ``fields`` is a list of
    ``(column_list, value)`` pairs; ``value`` is ``None`` when the row did not
    provide it. Reading everything first means a lookup failure cannot leave
    the module-level columns half-modified for this row.
    """
    conf = row.find_element_by_class_name('conf_summery')
    title = (conf.find_element_by_class_name('c_name')
             .find_element_by_tag_name('a')
             .get_attribute('title'))
    anchor = (row.find_element_by_class_name('conf_logo')
              .find_element_by_tag_name('a'))
    href = anchor.get_attribute('href')
    picture = anchor.find_element_by_tag_name('img').get_attribute('src')
    description = row.find_element_by_class_name('conf_desc').text

    # <strong> elements are read as (views, speakers) pairs; the last pair
    # wins, and an unpaired trailing element only supplies a view count.
    view_count = speaker_count = None
    counters = conf.find_elements_by_tag_name('strong')
    for pos in range(0, len(counters), 2):
        view_count = counters[pos].text
        if pos + 1 < len(counters):
            speaker_count = counters[pos + 1].text

    specialities = ','.join(
        tag.get_attribute('title')
        for tag in conf.find_elements_by_class_name('spel'))

    place = when = None
    for entry in conf.find_elements_by_class_name('c_summery'):
        if _listing_has_tag(entry, 'img'):
            place = entry.text  # the entry carrying an <img> is the location
        if _listing_has_tag(entry, 'span'):
            # Keep the text before '|'; partition() keeps the whole text when
            # there is no '|' instead of chopping off the last character.
            when = entry.text.partition('|')[0]

    fields = [
        (links, href),
        (pics, picture),
        (descs, description),
        (views, view_count),
        (no_speakers, speaker_count),
        (types, specialities),
        (location, place),
        (dates, when),
    ]
    return title, href, fields


def main_url(url):
    """Scrape the event listing at ``url`` into the module-level column lists.

    Opens headless Chrome, keeps clicking the ``view_more`` element until the
    click fails, then walks every ``sec_conf_main`` row. A row whose title is
    already in ``names`` has its stored values compared and overwritten where
    they differ (the event link is then recorded in ``update``); any other row
    is appended as a new event with exactly one value per listing column
    (an empty string when the row lacks that value) so the lists stay aligned.

    Raises whatever Selenium raises for a row that lacks a mandatory element;
    the browser is shut down in every case.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        time.sleep(5)  # give the DOM time to load completely

        # Any failure to find or click the button is taken to mean that
        # there is nothing more to load.
        while True:
            try:
                driver.find_element_by_id('view_more').click()
                time.sleep(3)
            except Exception:
                break

        for row in driver.find_elements_by_class_name('sec_conf_main'):
            title, href, fields = _read_listing_row(row)
            if title in names:
                position = names.index(title)
                for column, value in fields:
                    # A value missing from the page leaves the stored one alone.
                    if value is not None:
                        _store_listing_field(column, position, value, href)
            else:
                names.append(title)
                for column, value in fields:
                    column.append('' if value is None else value)
    finally:
        driver.quit()
def _detail_or_default(lookup, default):
    """Return ``lookup()``, or ``default`` when a required element is absent."""
    try:
        return lookup()
    except NoSuchElementException:
        return default


def _store_detail_field(column, position, value, event_link):
    """Overwrite ``column[position]`` if it differs and flag the event as updated."""
    if column[position] == value:
        return
    column[position] = value
    if event_link not in update:
        update.append(event_link)


def _scrape_event_details(driver):
    """Read all detail fields of the event page currently loaded in ``driver``.

    Returns a list of ``(column_list, value)`` pairs, one per detail column.
    A field whose element is missing gets its placeholder text, so the same
    value is used whether the event is being updated or added.
    """
    def organizer():
        words = driver.find_element_by_class_name('speakers').text.split()
        # The first three words are dropped -- presumably a fixed label
        # in front of the organizer name; verify against the page.
        return ' '.join(words[3:])

    def summary():
        return (driver.find_element_by_class_name('conf_head_summary')
                .find_element_by_tag_name('p').text)

    def contact_number():
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document, not the 'marB20' element. Left as in the original
        # because the page structure cannot be confirmed from here.
        cell = (driver.find_element_by_class_name('marB20')
                .find_element_by_xpath('//table/tbody/tr[1]/td[3]'))
        return cell.text or 'No Contact Number Given.'

    def email_addresses():
        return ','.join(
            node.text
            for node in driver.find_elements_by_class_name('emailFruser'))

    def speaker_names():
        cards = driver.find_elements_by_class_name('speaker_single_inn')
        # './/' keeps the search inside each card; a leading '//' would return
        # the first match in the document for every card.
        found = [card.find_element_by_xpath('.//h5/a').text for card in cards]
        return ','.join(found) if found else 'No Speakers Given.'

    def ticket_fees():
        area = driver.find_element_by_class_name('mobScroll')
        tickets = []
        # NOTE(review): '//table/tbody/tr' also matches rows outside
        # 'mobScroll'; the 'ticketname_inn' check below is what selects
        # the ticket rows.
        for table_row in area.find_elements_by_xpath('//table/tbody/tr'):
            try:
                table_row.find_element_by_class_name('ticketname_inn')
            except NoSuchElementException:
                continue
            tickets.append(table_row.text)
        return ';'.join(tickets)

    def attendees():
        found = [node.text
                 for node in driver.find_elements_by_class_name('r-speaker-info')]
        return ','.join(found) if found else 'No Attenders or Trackers Given'

    return [
        (organization, _detail_or_default(organizer, 'No Organization Given.')),
        (summ, _detail_or_default(summary, 'No Conference Summary Given.')),
        (twitter, _detail_or_default(
            lambda: driver.find_element_by_class_name('TW').get_attribute('title'),
            'No Twitter Link')),
        (facebook, _detail_or_default(
            lambda: driver.find_element_by_class_name('FB').get_attribute('title'),
            'No Facebook Link')),
        (contact, _detail_or_default(contact_number, 'No Contact Number Given.')),
        (emails, _detail_or_default(email_addresses, 'No email.')),
        (website_link, _detail_or_default(
            lambda: driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'),
            'No Website Link')),
        (venue, _detail_or_default(
            lambda: driver.find_element_by_class_name('conf_venue1').text,
            'No Venue Given.')),
        (official_address, _detail_or_default(
            lambda: driver.find_element_by_class_name('hotel-detail').text,
            'No Official Address Given. ')),
        (speakers, _detail_or_default(speaker_names, 'No Speakers')),
        (fees, _detail_or_default(ticket_fees, 'No Fees Given')),
        (at_tr, _detail_or_default(attendees, 'No Attenders or Trackers Given')),
    ]


def each_event(item):
    """Scrape the detail page of one event into the module-level column lists.

    ``item`` is the event URL. When it is one of ``prev_links`` the stored
    values at ``links.index(item)`` are compared with the page and overwritten
    where they differ, and ``item`` is recorded in ``update``; otherwise one
    value per detail column is appended.

    All values are read before anything is stored. Any exception is logged
    (through a module logger, so no root handler is installed) instead of
    being silently dropped, and the browser is shut down in every case.

    NOTE(review): an event whose link was replaced by main_url() is no longer
    in ``prev_links`` and is therefore handled as a new event here.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(item)
        time.sleep(5)  # give the page time to load
        details = _scrape_event_details(driver)
        if item in prev_links:
            position = links.index(item)
            for column, value in details:
                _store_detail_field(column, position, value, item)
        else:
            for column, value in details:
                column.append(value)
    except Exception:
        logging.getLogger(__name__).exception(
            'Could not scrape event page %s', item)
    finally:
        driver.quit()
def main():
    """Load the stored events, scrape the site, and write everything back.

    Steps:
    1. Read every data row of ``EMedEvents.xlsx`` (if the file exists) into
       the module-level column lists and remember the stored links in
       ``prev_links``.
    2. Run main_url() on the listing page, then each_event() on every link.
    3. Write all columns back to the workbook and log a summary of the run.
    """
    import os  # local import: only main() checks for the workbook file

    workbook_path = 'EMedEvents.xlsx'

    # Single source of truth for the sheet layout: the position in this list
    # is the column index used when reading and the column order used when
    # writing, so the two can never drift apart.
    columns = [
        ('Event Name', names),
        ('Event Dates', dates),
        ('Specialty', types),
        ('Event Location', location),
        ('Description', descs),
        ('Views', views),
        ('Speakers', no_speakers),
        ('Picture Source', pics),
        ('Event Link', links),
        ('Organized By', organization),
        ('Conference Summary', summ),
        ('Twitter Link', twitter),
        ('Facebook Link', facebook),
        ('Contact Number', contact),
        ('Email', emails),
        ('Website Link', website_link),
        ('Venue', venue),
        ('Official Address', official_address),
        ('Speaking', speakers),
        ('Fees', fees),
        ('Attenders and Trackers', at_tr),
    ]

    # A missing workbook simply means there is nothing stored yet.
    if os.path.isfile(workbook_path):
        sheet = open_workbook(workbook_path).sheet_by_index(0)
        for row in range(1, sheet.nrows):  # row 0 holds the headers
            for col, (_, values) in enumerate(columns):
                values.append(sheet.cell(row, col).value)

    prev_links.extend(links)

    main_url("https://www.emedevents.com/india-medical-conferences")
    for item in links:
        each_event(item)

    # Passing ``columns=`` fixes the column order explicitly instead of
    # relying on dict ordering.
    frame = pd.DataFrame(dict(columns),
                         columns=[header for header, _ in columns])
    frame.to_excel(workbook_path, header=True, index=False)

    logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
    logging.info('%d events were read from the excel sheet', len(prev_links))
    logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
    logging.info('Following are the links of the events that were updated:')
    for item in update:
        logging.info(item)
# Start the scraper only when this file is executed as a script;
# importing it as a module must not trigger a run.
if __name__ == '__main__':
    main()
Мне нужна та же самая функциональность, но с MongoDB вместо Excel. Я совсем новичок в MongoDB и поэтому не понимаю, как реализовать то же самое.
Буду благодарен за любую помощь.
Заранее спасибо.