Как хранить только текст твита с помощью Tweepy - PullRequest
0 голосов
/ 10 ноября 2018

Я смотрю эту серию https://www.youtube.com/watch?v=wlnx-7cm4Gg&list=PL5tcWHG-UPH2zBfOz40HSzcGUPAVOOnu1, которая посвящена майнингу твитов с помощью tweepy (python), и парень хранит твиты со всем (например, созданный_данным, id, id_str, текст), а затем он использует Dataframes в Панды для хранения только текста. Этот способ эффективен? Как я могу сохранить только «текст» в файле Json вместо всех других деталей?

Код:

ACCESS_TOKEN = "xxxxxxxxxxxxxxxxxxxxx"
ACCESS_TOKEN_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

import tweepy
import numpy as np
import pandas as pd
# import twitter_credentials

class TwitterAuthenticator():
    def authenticate_twitter_app(self):
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

class TwitterStreamer():
    """
        Class for streaming and processing live tweets.
    """
    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()
    def stream_tweets(self, fetched_tweets_filename, hash_tag):
        # This handles Twitter authetification and the connection to Twitter Streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_authenticator.authenticate_twitter_app()
        # api = tweepy.API(auth)


        stream = tweepy.Stream(auth,listener)
        stream.filter(track = hash_tag)


class TwitterListener(tweepy.StreamListener):
    """
    This is a basic listener class that just prints received tweets to stdout.
    """

    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True

    def on_status(self, status):
        print(status)

    def on_error(self, status):
        if status == 420:
            # Returning False on_data method in case rate limit occurs.
            return False
        print(status)


# public_tweets = api.home_timeline()
# for tweet in public_tweets:
#     print tweet.text

if __name__ == '__main__':
    hash_tag = ["python"]
    fetched_tweets_filename = "tweets.json"

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename,hash_tag)

    # print stream.text

Твит, сохраненный в файле json:

{"created_at":"Sun Nov 04 18:43:59 +0000 2018","id":1059154305498972160,"id_str":"1059154305498972160","text":"RT @hmason: When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn h\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":14858491,"id_str":"14858491","name":"Alexandra Lemus","screen_name":"nankyoku","location":"M\u00e9xico","url":null,"description":"Transitioning into the Permanent Beta state...","translator_type":"none","protected":false,"verified":false,"followers_count":173,"friends_count":585,"listed_count":18,"favourites_count":658,"statuses_count":572,"created_at":"Wed May 21 16:35:49 +0000 2008","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/14858491\/1381524599","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat Nov 03 17:36:24 +0000 2018","id":1058774912201035776,"id_str":"1058774912201035776","text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yoursel\u2026 https:\/\/t.co\/9F7SmlGfyf","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":765548,"id_str":"765548","name":"Hilary Mason","screen_name":"hmason","location":"NYC","url":"http:\/\/www.hilarymason.com","description":"GM for Machine Learning at @Cloudera. Founder at @FastForwardLabs. Data Scientist in Residence at @accel. I \u2665 data and cheeseburgers.","translator_type":"none","protected":false,"verified":true,"followers_count":111311,"friends_count":1539,"listed_count":5276,"favourites_count":12049,"statuses_count":17602,"created_at":"Sun Feb 11 21:22:24 +0000 2007","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"282F8A","profile_sidebar_border_color":"87BC44","profile_sidebar_fill_color":"AB892B","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/765548\/1353033581","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn how it works, and then use a library to benefit from robust code.\n\nHere's one article showing this with neural networks in Python: https:\/\/t.co\/3ehO86NFKI","display_text_range":[0,280],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/3ehO86NFKI","expanded_url":"https:\/\/towardsdatascience.com\/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6","display_url":"towardsdatascience.com\/how-to-build-y\u2026","indices":[257,280]}],"user_mentions":[],"symbols":[]}},"quote_count":14,"reply_count":8,"retweet_count":290,"favorite_count":1019,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/9F7SmlGfyf","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1058774912201035776","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hmason","name":"Hilary Mason","id":765548,"id_str":"765548","indices":[3,10]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1541357039223"}

Если вопрос не ясен, закомментируйте его, и я постараюсь отредактировать вопрос.

Ответы [ 2 ]

0 голосов
/ 10 ноября 2018

Если вы хотите, чтобы в файле json было сохранено только текстовое поле, вы можете настроить определение метода TwitterListener.on_data:

import json

def on_data(self, data):
    try:
        print(data)
        with open(self.fetched_tweets_filename, 'a') as tf:
            json_load = json.loads(data)
            text = {'text': json_load['text']}
            tf.write(json.dumps(text))
        return True
    except BaseException as e:
        print("Error on_data %s" % str(e))
    return True

Справедливое предупреждение, у меня не установлено / не установлено tweepy, поэтому я смог протестировать только версию вышеуказанного кода, используя файл json, который вы разместили выше. Дайте мне знать, если у вас возникнут какие-либо ошибки, и я посмотрю, что я могу сделать.

0 голосов
/ 10 ноября 2018

Похоже, что вы получаете от API и сохраняете в своей переменной "данные" текст в формате Юникод в формате json. Вы просто пишете этот текст прямо в файл. Используя вызов API, который вы делаете, вы всегда будете получать все данные, так что это не будет неэффективно. Если вы просто хотели получить / написать текст твита, попробуйте использовать json load и затем обработать оттуда.

...