I used the following code (not mine) to scrape the status_ids of various users, to work around the 3200-tweet limit. It worked fine and I got most of the scraping done, but recently (over the last month or so) I can't scrape anything: a retry error comes up in the browser. I'm new to coding and this is my first go at scraping.

The code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException
from time import sleep
import json
import datetime
# edit these three variables
user = 'TheDemocrats'
start = datetime.datetime(2008, 4, 13) # year, month, day
end = datetime.datetime(2019, 1, 31) # year, month, day
# only edit these if you're having problems
delay = 1 # time to wait on each page load before reading the page
driver = webdriver.Firefox(executable_path=r'C:\ProgramData\Microsoft\Windows\Start Menu\Programs\geckodriver.exe') # options are Chrome() Firefox() Safari()
# don't mess with this stuff
twitter_ids_filename = 'all_ids_dem.json'
days = (end - start).days + 1
id_selector = '.time a.tweet-timestamp'
tweet_selector = 'li.js-stream-item'
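# NB: both selectors depend on the markup of Twitter's search page;
# if the front end changes, they silently match nothing and every day looks empty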
user = user.lower()
ids = []
def format_day(date):
    day = '0' + str(date.day) if len(str(date.day)) == 1 else str(date.day)
    month = '0' + str(date.month) if len(str(date.month)) == 1 else str(date.month)
    year = str(date.year)
    return '-'.join([year, month, day])
def form_url(since, until):
    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q=from%3A'
    # %20 separators keep the from:/since:/until:/include: operators distinct in the query string
    p2 = user + '%20since%3A' + since + '%20until%3A' + until + '%20include%3Aretweets&src=typd'
    return p1 + p2
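# For illustration, with user = 'thedemocrats', form_url('2019-01-30', '2019-01-31') returns:
# https://twitter.com/search?f=tweets&vertical=default&q=from%3Athedemocrats%20since%3A2019-01-30%20until%3A2019-01-31%20include%3Aretweets&src=typd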
def increment_day(date, i):
    return date + datetime.timedelta(days=i)
for day in range(days):
    d1 = format_day(increment_day(start, 0))
    d2 = format_day(increment_day(start, 1))
    url = form_url(d1, d2)
    print(url)
    print(d1)
    driver.get(url)
    sleep(delay)
    try:
        found_tweets = driver.find_elements_by_css_selector(tweet_selector)
        increment = 10
        # keep scrolling until a scroll loads fewer than 10 new tweets
        while len(found_tweets) >= increment:
            print('scrolling down to load more tweets')
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(delay)
            found_tweets = driver.find_elements_by_css_selector(tweet_selector)
            increment += 10
        print('{} tweets found, {} total'.format(len(found_tweets), len(ids)))
        for tweet in found_tweets:
            try:
                tweet_id = tweet.find_element_by_css_selector(id_selector).get_attribute('href').split('/')[-1]
                ids.append(tweet_id)
            except StaleElementReferenceException:
                print('lost element reference', tweet)
print('no tweets on this day')
except WebDriverException as e:
if "networkProtocolError" in e()._str_():
print('failed to load page on this day, retrying')
start = increment_day(start,-1)
else:
raise e
start = increment_day(start, 1)
try:
    with open(twitter_ids_filename) as f:
        all_ids = ids + json.load(f)
except FileNotFoundError:
    all_ids = ids

data_to_write = list(set(all_ids))
print('tweets found on this scrape: ', len(ids))
print('total tweet count: ', len(data_to_write))
with open(twitter_ids_filename, 'w') as outfile:
json.dump(data_to_write, outfile)
print('all done here')
driver.close()
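
One thing worth calling out: Selenium wraps driver failures in a generic WebDriverException, so the only way to tell a network error apart is to match on the message text. Below is a minimal sketch of how the page load could be wrapped in its own retry helper; get_with_retry is a hypothetical name, and the 'networkProtocolError' / 'neterror' substrings are assumptions about geckodriver's wording, not a documented contract.

from selenium.common.exceptions import WebDriverException
from time import sleep

def get_with_retry(driver, url, delay=1, attempts=3):
    # Try to load a URL a few times before giving up.
    for attempt in range(1, attempts + 1):
        try:
            driver.get(url)
            sleep(delay)  # give the page time to render
            return True
        except WebDriverException as e:
            # the driver's error text is only reachable via str(e)
            if 'networkProtocolError' in str(e) or 'neterror' in str(e):
                print('load failed (attempt {} of {}), retrying'.format(attempt, attempts))
                sleep(delay * attempt)  # simple linear backoff
            else:
                raise
    return False

With this in place, the main loop could call get_with_retry(driver, url) and skip the day (or stop) when it returns False, instead of rewinding start by hand.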