I am trying to extract the URLs from tweets and, for each URL, check the number of redirects and the meta content of the final page it redirects to. (A tweet may contain several URLs.) I run this in Python with pandas, reading the CSV in chunks, and the code has now been running for more than 9 days. Can anyone suggest how it could be sped up? My code is below, followed by a rough sketch of the only idea I have had so far.
import re

import numpy as np
import pandas as pd
import requests
import requests.exceptions
from bs4 import BeautifulSoup

pd.options.mode.chained_assignment = None  # default='warn'
BOT_Data = pd.DataFrame()  # accumulates the processed chunks

for chunk in pd.read_csv('BotData.csv', chunksize=3000):
    Bot_Data1 = chunk
    ## get urls from tweet text
    Bot_Data1['base_urls'] = Bot_Data1['text'].apply(
        lambda row: re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(row)))
    Bot_Data1['urls'] = Bot_Data1['text'].apply(
        lambda row: re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^ |^#|^https|^http]*', str(row)))
    # clean urls: strip the ellipsis Twitter appends to truncated links
    # (iterate over index labels, not positions, since chunk indices keep counting up)
    for i, value in Bot_Data1['urls'].items():
        Bot_Data1.at[i, 'urls'] = [s.replace('…', ' ') for s in value]
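    # (Aside: pandas can do this extraction without the explicit lambda, e.g.
    # Bot_Data1['base_urls'] = Bot_Data1['text'].str.findall(pattern) -- though
    # the regex cost is tiny next to the network calls below.)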
    ## get the average number of redirections over the URLs present
    Bot_Data1['final_url'] = ''          # pre-create the columns written below
    Bot_Data1['avg_redirections'] = 0.0
    for loop in Bot_Data1.index:         # range(min(index), max(index)) skipped the last row
        numURL = 0
        numRedirects = 0
        finalurl = []
        for url in Bot_Data1['urls'][loop]:
            numURL += 1
            f_url = ''
            try:
                r = requests.get(url)
                numRedirects += len(r.history)  # one history entry per redirect hop
                f_url = r.url                   # URL of the last page reached
            except Exception as e:              # catches malformed/unreachable URLs
                print(e)
            finalurl.append(f_url)
        Bot_Data1.at[loop, 'final_url'] = np.array(finalurl, dtype=object)
        Bot_Data1.at[loop, 'avg_redirections'] = numRedirects / numURL if numURL else 0
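    # (Aside: since only the redirect chain is needed in this loop, a HEAD
    # request -- requests.head(url, allow_redirects=True) -- would follow the
    # same redirects without downloading each page body.)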
    # reduce each final URL to its base (scheme + host) form
    Bot_Data1['final_base_url'] = ''
    for loop in Bot_Data1.index:
        final_base_url = []
        for index, x in np.ndenumerate(Bot_Data1['final_url'][loop]):
            final_base_url += re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', x)
        Bot_Data1.at[loop, 'final_base_url'] = np.array(final_base_url, dtype=object)
    ##
    ## get url meta description content and title
    Bot_Data1['url_meta_content'] = ''
    for loop in Bot_Data1.index:
        metacontent = ''
        for url in Bot_Data1['final_base_url'][loop]:
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                metas = soup.find_all('meta')
                title = soup.find('title')
                # join the meta descriptions into one string (find_all returns a list)
                metacontent += ' '
                metacontent += ''.join(meta.attrs.get('content', '') for meta in metas
                                       if meta.attrs.get('name') == 'description')
                metacontent += ' '
                try:
                    metacontent += title.text.strip()
                except AttributeError as error:  # page has no <title>
                    print(error)
            except Exception as e:
                print(e)
        Bot_Data1.at[loop, 'url_meta_content'] = metacontent
    # the original .replace("/n", "") matched whole cell values only;
    # .str.replace works on substrings (and the newline is '\n', not '/n')
    Bot_Data1['url_meta_content'] = Bot_Data1['url_meta_content'].str.replace('\n', ' ')
    # append this chunk's results to the running dataframe
    BOT_Data = pd.concat([BOT_Data, Bot_Data1])
##
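I suspect almost all of those 9 days are spent waiting on the sequential requests.get calls, so the one idea I have had is to deduplicate the URLs, fetch each distinct one once and in parallel with a shared session and a timeout, and then fill in the dataframe columns from the results. Below is a rough, untested sketch of what I mean; fetch_url_info, MAX_WORKERS and url_info are names I made up for the sketch, and 32 workers / a 10-second timeout are guesses.

# Rough sketch only: fetch every distinct URL once, in parallel.
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

MAX_WORKERS = 32              # guess: tune to bandwidth / politeness limits
session = requests.Session()  # reuses TCP connections between requests

def fetch_url_info(url):
    """Return (redirect_count, final_url, meta description + title) for one URL."""
    try:
        r = session.get(url, timeout=10)  # timeout so a dead host cannot hang a worker
        soup = BeautifulSoup(r.text, 'html.parser')
        description = ''.join(m.attrs.get('content', '') for m in soup.find_all('meta')
                              if m.attrs.get('name') == 'description')
        title = soup.title.text.strip() if soup.title else ''
        return len(r.history), r.url, (description + ' ' + title).strip()
    except Exception:
        return 0, '', ''

# Many tweets share the same links, so fetch each distinct URL only once.
unique_urls = list({u for urls in Bot_Data1['urls'] for u in urls})
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    url_info = dict(zip(unique_urls, pool.map(fetch_url_info, unique_urls)))
# url_info[url] -> (redirects, final_url, meta_content); the per-row columns
# above could then be filled from this dict with no further network calls.

Is that a reasonable direction, or would something like aiohttp or scrapy be the better tool here?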