"""
local-twitter

The maximum size of a Python list on a 32-bit system is PY_SSIZE_T_MAX / sizeof(PyObject *), i.e. 536,870,912 elements.
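A quick check of that arithmetic, assuming a 4-byte PyObject * pointer on a 32-bit build:

    >>> 2 ** 31 // 4    # ~PY_SSIZE_T_MAX / sizeof(PyObject *)
    536870912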

----------------
in_reply_to_status_id
optional
"null"
The ID of an existing status that the update is in reply to.

Note: This parameter will be ignored unless the author of the tweet this parameter references is mentioned within the
status text. Therefore, you must include @username, where username is the author of the referenced tweet, within the update.
----------------
"in_reply_to_status_id_str": null,
"in_reply_to_user_id_str": null
"in_reply_to_user_id": null,
"in_reply_to_screen_name": null,
----------------
So you're referring to quoted Tweets (retweets with comments). There is no official method for that in the REST API
yet; however, there are a couple of ways to do it.

Since all quoted Tweets contain the short URL of the original one, you can still use in_reply_to_status_id and filter
by the short URL of the original Tweet.
Alternatively, search for Tweets that contain the field quoted_status_id; this can be done through either the REST or
the Streaming API.
quoted_status_id: this field only surfaces when the Tweet is a quote Tweet. It contains the integer value Tweet ID of
the quoted Tweet.
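
A minimal sketch of the second approach, assuming the tweets have already been parsed into dicts (the helper names
below are illustrative, not part of any Twitter library):

    def is_quote_tweet(tweet):
        # quoted_status_id only surfaces on quote Tweets
        return 'quoted_status_id' in tweet

    def quotes_of(tweets, original_id):
        # keep only the quote Tweets that reference a given original Tweet ID
        return [t for t in tweets if t.get('quoted_status_id') == original_id]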

@author: Cristina Muntean
@date: 28/06/16
"""

import codecs
import json
import logging
import sys
from collections import defaultdict
import operator
from filter import city_filter
from twitter.Tweet import Tweet

def main():
    logger = logging.getLogger("wordcount_from_json_list.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 3:
        print "You need to pass the following 2 params: <inputFile> <outputFile> "
        sys.exit(-1)
    inputFile = sys.argv[1]
    outputFilename = sys.argv[2]
    outputWriter = codecs.open(outputFilename, "w", "utf-8")

    wordcountDict = defaultdict(int)
    tweetsAsDict = Tweet.getTweetAsDictionary(inputFile)  # presumably yields one tweet dict per JSON line
    try:
        # count every n-gram token across all tweets
        for tweet in tweetsAsDict:
            ngrams = tweet['ngrams']
            for token in ngrams:
                wordcountDict[token] += 1
    except IOError:
        print "End of file"

    # sort tokens by frequency, descending, and write one "<token>\t<count>" line per token
    sorted_wc = sorted(wordcountDict.items(), key=operator.itemgetter(1), reverse=True)
    for wc_tuple in sorted_wc:
        # json.dumps escapes non-ASCII characters; the surrounding quotes it adds are stripped
        outputWriter.write('{}\t{}\n'.format(json.dumps(wc_tuple[0]).replace('"', ''), str(wc_tuple[1])))
    outputWriter.close()

if __name__ == '__main__':
    main()
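
# Example invocation (file names are hypothetical):
#   python wordcount_from_json_list.py tweets_with_ngrams.json wordcounts.tsv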