Extracting data from Twitter with a keyword in Python - PullRequest
0 votes
/ 27 January 2020

This is the code I am using:

import webbrowser
import os
import requests
from bs4 import BeautifulSoup
import sys
#import wget
import pandas as pd
import re
import urlmarker  # standalone helper module that defines URL_REGEX
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

if __name__ == '__main__':
    driver = webdriver.Chrome('D:\\crome drive\\chromedriver.exe')

    #df = pd.read_csv('Link_2.csv', encoding="utf8")

    df_list= ["https://twitter.com/search?q=%22dry%20eye%22%20OR%20%22dry%20eyes%22&src=typed_query"]  #df['Link'].tolist()#[0:10] #
    likes=[]
    contents=[]
    date_loc=[]
    links=[]
    main_link=[]
    tweet_no=[]
    i=0
    foll_no,followers,lk,oth_link,p_link,header=[],[],[],[],[],[]
    for item in df_list:
        driver.get(item)
        time.sleep(2)
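        # Note: WebDriverWait / expected_conditions / By are imported above
        # but never used; an explicit wait would be more reliable than a
        # fixed sleep. A sketch, not part of the original flow:
        #   WebDriverWait(driver, 10).until(
        #       EC.presence_of_element_located((By.TAG_NAME, 'body')))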

        src1 = driver.page_source  # gets the html source of the page
        parser1 = BeautifulSoup(src1, 'html.parser')  # explicit parser avoids bs4's "no parser specified" warning

        # (commented-out code omitted: it scraped the tweet count and the
        # Following / Followers / Likes totals from the profile navigation bar)
        # (disabled scroll-to-bottom loop and tweet-click experiments omitted:
        # the loop repeatedly ran "window.scrollTo(0, document.body.scrollHeight)"
        # and stopped once document.body.scrollHeight no longer grew, so that
        # older results load before scraping)
        twt = driver.find_elements_by_xpath('//div[@class="stream-item-header"]')  # collected but not used below
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
        #time.sleep(2)
        # (commented-out branch omitted: it clicked through each tweet on a
        # profile page, re-parsed the HTML and appended the same content /
        # date / likes / link fields that the loop below collects)
        src = driver.page_source  # gets the html source of the page
        parser = BeautifulSoup(src, 'html.parser')
        # collect every <a> tag, keep the ones whose markup mentions "status"
        # (permalinks to individual tweets), cut the href value out of the tag
        # string and rebuild absolute tweet URLs
        f1 = parser.find_all('a')
        df1 = pd.DataFrame()
        df1['firstlink'] = f1
        df2 = df1[df1['firstlink'].astype(str).str.contains("status")]
        ds = df2['firstlink'].astype(str).apply(lambda x: x.split('href')[-1].split('title')[0])
        dt1 = pd.DataFrame(ds)
        dt2 = pd.DataFrame(dt1['firstlink'].apply(lambda x: x.split('"')))
        dt3 = pd.DataFrame(dt2['firstlink'].apply(lambda x: x[1]))
        dt5 = dt3[~dt3['firstlink'].astype(str).str.contains("twitter")]
        dt5['firstlink'] = dt5['firstlink'].astype(str).apply(lambda x: 'https://twitter.com' + x)
        df_list1 = dt5['firstlink'].tolist()
        for item in df_list1:
            # some hrefs are absolute t.co links that just had the
            # "https://twitter.com" prefix prepended above; strip it back off
            item = ('https://' + item.split('//')[2]) if '//t.co' in item else item
            driver.get(item)
            time.sleep(2)

            src = driver.page_source  # gets the html source of the page
            parser = BeautifulSoup(src, 'html.parser')

            # content
            try:
                attr1 = {'class': 'TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text'}
                tag1 = parser.find_all('p', attrs=attr1)
                contents.append(tag1[0].text)
            except:
                contents.append('')
                pass

            # date
            try:
                attr2 = {'class': 'metadata'}
                tag2 = parser.find_all('span', attrs=attr2)
                date_loc.append(tag2[0].text.strip('\t\n'))
            except:
                date_loc.append('')
                pass

            # Likes
            try:
                attr3 = {'class': 'stats'}
                tag3 = parser.find_all('ul', attrs=attr3)
                likes.append(tag3[0].text.strip('\n\t\r'))
            except:
                likes.append('')
                pass
            # links
            links.append(driver.current_url)
            main_link.append(df_list[i])

            # Other links: t.co hrefs attached to children of the tweet-text <p>
            lk_tmp = []
            try:
                for child in tag1[0].contents:
                    try:
                        lk_tmp.append(child.attrs['href'])
                    except:
                        pass
            except:
                pass
            # keep only the t.co short links, excluding the last one
            lk_tmp1 = [u for u in lk_tmp if 't.co' in u]
            oth_link.append(lk_tmp1[:len(lk_tmp1) - 1])
            # links that appear inside the tweet text itself (pic.twitter entries excluded)
            tmp_plk=[]
            try:
                for plk in re.findall(urlmarker.URL_REGEX, tag1[0].text):
                    if not 'pic.twitter' in plk:
                        tmp_plk.append(plk)
            except:
                pass

            p_link.append(tmp_plk)

            #Header
            attr4 = {'class': 'account-group js-account-group js-action-profile js-user-profile-link js-nav'}
            tag4 = parser.find_all('a', attrs=attr4)
            try:
                header.append(tag4[0].text)
            except:
                header.append('')
                pass

        i=i+1
# strip the trailing pic.twitter.com media reference from each tweet's text
contents1 = [x[:x.find('pic.twitter.com')] if 'pic.twitter' in x else x for x in contents]
# split the combined stats text into separate Likes / Retweets fields so the
# retweet figure gets its own column
actlikes, actRet = [], []
for y in likes:
    like_val, ret_val = '', ''
    for z in str(y).split('\n'):
        if 'Likes' in z:
            like_val = z
        elif 'Retweets' in z:
            ret_val = z
    actlikes.append(like_val)
    actRet.append(ret_val)

dt = pd.DataFrame()
dt['Tweet'] = contents1  # cleaned text, without the trailing pic.twitter.com part
dt['Header'] = header
dt['Likes and Retweets'] = likes
dt['Likes Status'] = actlikes
dt['Retweet Status'] = actRet
dt['Date'] = date_loc
dt['Links Separated'] = p_link
dt['Other links'] = oth_link
dt['Link of tweet'] = links
dt['Main Link'] = main_link
dt.to_csv('dry_eyes_output.csv')

I am not able to extract the retweets and the comments (replies).

1 Answer

1 vote
/ 27 January 2020

Try the tweepy library from https://www.tweepy.org/.

For the code, see Getting Started: https://docs.tweepy.org/en/latest/getting_started.html
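
For reference, the authentication step from Getting Started boils down to a few lines. A minimal sketch for tweepy 3.x, assuming you have already created a developer app; the four credential strings are placeholders:

import tweepy

# placeholder credentials from your Twitter developer dashboard
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)  # pause automatically on rate limits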

For example, for a search you would use:

api.search(<search>, result_type="recent", lang="en")

(https://docs.tweepy.org/en/latest/api.html#API.search)
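
Applied to the query from the question, a minimal sketch (assuming the authenticated api client from the snippet above) that also reads the retweet and like counts the HTML scraper could not reach:

# "api" is the authenticated tweepy.API client from the snippet above
query = '"dry eye" OR "dry eyes"'

for tweet in api.search(q=query, result_type="recent", lang="en", count=100):
    print(tweet.user.screen_name, tweet.created_at)
    print(tweet.text)
    # counts come straight from the API instead of the scraped "stats" block
    print("Retweets:", tweet.retweet_count, "Likes:", tweet.favorite_count)

Note that reply (comment) counts are not exposed by the standard v1.1 search API, and tweepy.Cursor(api.search, q=query).items(500) handles pagination if one page is not enough.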

See this walkthrough for getting access to the Twitter keys, creating an account and so on: https://realpython.com/twitter-bot-python-tweepy/

...