This is the code I am using:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
import urlmarker  # local helper module that defines URL_REGEX
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
if __name__ == '__main__':
    driver = webdriver.Chrome('D:\\crome drive\\chromedriver.exe')
    # df = pd.read_csv('Link_2.csv', encoding="utf8")
    df_list = ["https://twitter.com/search?q=%22dry%20eye%22%20OR%20%22dry%20eyes%22&src=typed_query"]  # df['Link'].tolist()
    likes = []
    contents = []
    date_loc = []
    links = []
    main_link = []
    oth_link, p_link, header = [], [], []
    for search_url in df_list:
        driver.get(search_url)
        time.sleep(2)
        # (old, disabled code: parsed the search page source and scraped the
        #  tweet count and the Following / Followers / Likes figures from the
        #  legacy ProfileNav-stat markup)
        # Optionally keep scrolling so that more results load before scraping:
        # last_height = driver.execute_script("return document.body.scrollHeight")
        # while True:
        #     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        #     time.sleep(3)  # wait for the new tweets to load
        #     new_height = driver.execute_script("return document.body.scrollHeight")
        #     if new_height == last_height:
        #         break
        #     last_height = new_height
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)  # back to the top
        # (old, disabled code: clicked through each tweet in the stream,
        #  scrolling and navigating back, instead of visiting the permalinks)
        src = driver.page_source  # HTML source of the search page
        parser = BeautifulSoup(src, 'html.parser')
        # collect every "status" permalink on the page
        f1 = parser.find_all('a')
        df1 = pd.DataFrame()
        df1['firstlink'] = f1
        df2 = df1[df1['firstlink'].astype(str).str.contains("status")]
        ds = df2['firstlink'].astype(str).apply(lambda x: x.split('href')[-1].split('title')[0])
        dt1 = pd.DataFrame(ds)
        dt2 = pd.DataFrame(dt1['firstlink'].apply(lambda x: x.split('"')))
        dt3 = pd.DataFrame(dt2['firstlink'].apply(lambda x: x[1]))
        dt5 = dt3[~dt3['firstlink'].astype(str).str.contains("twitter")]
        dt5['firstlink'] = dt5['firstlink'].astype(str).apply(lambda x: 'https://twitter.com' + x)
        df_list1 = dt5['firstlink'].tolist()
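        # The pandas round-trip above can be written more directly with
        # BeautifulSoup alone; a minimal, untested sketch that I believe is
        # equivalent:
        # df_list1 = ['https://twitter.com' + a['href']
        #             for a in parser.find_all('a', href=True)
        #             if 'status' in a['href'] and 'twitter' not in a['href']]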
        for item in df_list1:
            # resolve a t.co short link to the real tweet URL
            item = ('https://' + item.split('//')[2]) if '//t.co' in item else item
            driver.get(item)
            time.sleep(2)
            src = driver.page_source  # HTML source of the tweet page
            parser = BeautifulSoup(src, 'html.parser')
            # tweet text
            try:
                attr1 = {'class': 'TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text'}
                tag1 = parser.find_all('p', attrs=attr1)
                contents.append(tag1[0].text)
            except Exception:
                contents.append('')
            # date and location
            try:
                attr2 = {'class': 'metadata'}
                tag2 = parser.find_all('span', attrs=attr2)
                date_loc.append(tag2[0].text.strip('\t\n'))
            except Exception:
                date_loc.append('')
            # likes / retweets (raw text of the "stats" bar under the tweet)
            try:
                attr3 = {'class': 'stats'}
                tag3 = parser.find_all('ul', attrs=attr3)
                likes.append(tag3[0].text.strip('\n\t\r'))
            except Exception:
                likes.append('')
            # links
            links.append(driver.current_url)
            main_link.append(search_url)  # the search page this tweet came from
            # other links embedded in the tweet markup (t.co redirects)
            lk_tmp = []
            try:
                for w in range(0, len(tag1[0].contents)):
                    try:
                        lk_tmp.append(tag1[0].contents[w].attrs['href'])
                    except Exception:
                        pass
            except Exception:
                pass
            lk_tmp1 = [u for u in lk_tmp if 't.co' in u]
            oth_link.append(lk_tmp1[:-1])  # all but the last t.co entry
            # links that appear inside the tweet text itself
            tmp_plk = []
            try:
                for plk in re.findall(urlmarker.URL_REGEX, tag1[0].text):
                    if 'pic.twitter' not in plk:
                        tmp_plk.append(plk)
            except Exception:
                pass
            p_link.append(tmp_plk)
            # header (account name of the tweet's author)
            attr4 = {'class': 'account-group js-account-group js-action-profile js-user-profile-link js-nav'}
            tag4 = parser.find_all('a', attrs=attr4)
            try:
                header.append(tag4[0].text)
            except Exception:
                header.append('')
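            # Sketch: the retweet count may already be inside the "stats" bar
            # scraped above; js-stat-retweets is my guess at the legacy class
            # name, so verify it in the page source, and retweet_no would be a
            # new list initialised next to likes (untested):
            # rt_li = tag3[0].find('li', attrs={'class': 'js-stat-retweets'}) if tag3 else None
            # retweet_no.append(rt_li.find('strong').text if rt_li else '')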
    # strip the trailing pic.twitter.com link from each tweet's text
    contents1 = [x[:x.find('pic.twitter.com')] if 'pic.twitter' in x else x
                 for x in contents]
    # disabled: split the raw stats text into separate Likes / Retweets fields
    # actlikes, actRet = [], []
    # for y in likes:
    #     acttemp, Ret = 0, 0
    #     for z in str(y).split('\n'):
    #         if 'Likes' in z:
    #             actlikes.append(z)
    #             acttemp = 1
    #         elif 'Retweets' in z:
    #             actRet.append(z)
    #             Ret = 1
    #     if acttemp == 0:
    #         actlikes.append('')
    #     if Ret == 0:
    #         actRet.append('')
    dt = pd.DataFrame()
    dt['Tweet'] = contents1  # the cleaned tweet text
    dt['Header'] = header
    dt['Likes and Retweets'] = likes
    # dt['Likes Status'] = actlikes
    # dt['Retweet Status'] = actRet
    dt['Date'] = date_loc
    dt['Links Separated'] = p_link
    dt['Other links'] = oth_link
    dt['Link of tweet'] = links
    dt['Main Link'] = main_link
    dt.to_csv('dry_eyes_output.csv')
I cannot extract the retweets and comments (replies).
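For reference, on the old tweet permalink pages the action bar under a tweet
exposed each count in a data-tweet-stat-count attribute. Below is a minimal
sketch of how the counts might be read from the page source; the class names
ProfileTweet-action--retweet, ProfileTweet-action--reply and
ProfileTweet-actionCount are assumptions based on that legacy markup, so
check them against the actual HTML before relying on them:

from bs4 import BeautifulSoup

def action_count(page_source, action):
    # returns the count shown in the tweet's action bar, or '' if not found
    parser = BeautifulSoup(page_source, 'html.parser')
    span = parser.find('span', attrs={'class': 'ProfileTweet-action--' + action})
    if span is None:
        return ''
    cnt = span.find('span', attrs={'class': 'ProfileTweet-actionCount'})
    return cnt.get('data-tweet-stat-count', '') if cnt is not None else ''

# usage inside the inner loop, after src = driver.page_source:
# retweets.append(action_count(src, 'retweet'))
# replies.append(action_count(src, 'reply'))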