"""
local-twitter

On a 32-bit system the largest signed object size is 2**31 - 1 bytes and each list slot holds a 4-byte pointer,
so the maximum size of a Python list is 2**31 / 4 = 536,870,912 elements.

----------------
in_reply_to_status_id
optional
"null"
The ID of an existing status that the update is in reply to.

Note: this parameter will be ignored unless the author of the tweet it references is mentioned within the
status text. Therefore, you must include @username, where username is the author of the referenced tweet, within the update.
----------------
"in_reply_to_status_id_str": null,
"in_reply_to_user_id_str": null
"in_reply_to_user_id": null,
"in_reply_to_screen_name": null,
----------------
So you're referring to Quoted Tweets (retweet with comment). There is no official method for that in the REST API
yet; however, there are a couple of ways to do it:

- Since all quoted tweets contain the short URL of the original one, you can still use in_reply_to_status_id and
  filter by the short URL of the original tweet.
- Search for tweets that contain the field quoted_status_id; this can be done through either the REST or the
  Streaming API. quoted_status_id only surfaces when the Tweet is a quote Tweet, and it contains the integer
  Tweet ID of the quoted Tweet.
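
For illustration only (the IDs below are made up, not real data), a quote Tweet payload carries the quoted ID in
both numeric and string form, plus the embedded original tweet:

    "quoted_status_id": 123456789,
    "quoted_status_id_str": "123456789",
    "quoted_status": { ...the original tweet object... }

isQuote() below keys on the presence of "quoted_status".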

@author: Cristina Muntean
@date: 28/06/16
"""

import codecs
import logging
import operator
import sys
from collections import defaultdict

from filter import city_filter
from graph.Node import Node
from twitter.Tweet import Tweet

def isMention(tweet):
    """True if the tweet text contains at least one @username token."""
    return any(token.startswith("@") for token in tweet["text"].split(" "))

def isReply(tweet):
    """True if the tweet is a reply (checks the reply id field rather than a leading '@' in the text)."""
    return tweet.get("in_reply_to_status_id_str") is not None

def isQuote(tweet):
    """True if the tweet quotes another tweet (quote Tweets embed a "quoted_status" object)."""
    return "quoted_status" in tweet

def isRetweet(tweet):
    """True if the tweet is a retweet (retweets embed a "retweeted_status" object)."""
    return "retweeted_status" in tweet

def hasEdge(tweet):
    """True if the tweet creates any graph edge: mention, reply, quote or retweet."""
    return isMention(tweet) or isReply(tweet) or isQuote(tweet) or isRetweet(tweet)
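
# A quick sketch of how the predicates above behave on a hand-built tweet
# dict; the payload is illustrative only, not a real Twitter object:
#
#     t = {"text": "nice shot @bob", "in_reply_to_status_id_str": None}
#     isMention(t)   # True  -- "@bob" appears as a token
#     isReply(t)     # False -- no reply id set
#     isRetweet(t)   # False -- no embedded "retweeted_status"
#     isQuote(t)     # False -- no embedded "quoted_status"
#     hasEdge(t)     # True  -- the mention alone creates an edge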


def processTokenTweet(node, tweet, cityNamesDict, cities15000):
    """Update the per-token counters on node with the edge features of one tweet."""
    # wordcount: one more occurrence of this token
    node.incrementNode()

    # simple counts
    if hasEdge(tweet):
        node.incrementEdge()
    if isMention(tweet):
        node.incMention()
    if isReply(tweet):
        node.incReply()
    if isRetweet(tweet):
        node.incRT()
    if isQuote(tweet):
        node.incQuote()

    # city-related counts: does a retweet/quote stay within the same US city?
    currentCity = city_filter.get_US_City(tweet, cityNamesDict, cities15000)
    if isRetweet(tweet):
        originTweet = tweet["retweeted_status"]
        originCity = city_filter.get_US_City(originTweet, cityNamesDict, cities15000)
        if originCity:
            if originCity == currentCity:
                node.incInnerRT()
            else:
                node.incOuterRT()
    if isQuote(tweet):
        originTweet = tweet["quoted_status"]
        originCity = city_filter.get_US_City(originTweet, cityNamesDict, cities15000)
        if originCity:
            if originCity == currentCity:
                node.incInnerQuote()
            else:
                node.incOuterQuote()
    return node
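
# Note on the Node interface: the counter methods used above
# (incrementNode/incrementEdge, incMention, incReply, incRT, incQuote,
# incInnerRT/incOuterRT, incInnerQuote/incOuterQuote) and the tabPrint()
# used in main() below are assumed to be provided by graph.Node; this
# script only calls them.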


def main():
    logger = logging.getLogger("wordcount_from_json_list_with_edge_features.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 3:
        print "You need to pass the following 2 params: <inputFile> <outputFile>"
        sys.exit(-1)
    inputFile = sys.argv[1]
    outputFilename = sys.argv[2]
    outputWriter = codecs.open(outputFilename, "w", "utf-8")

    # load the city lookup tables (cities15000.txt is the GeoNames list of cities with population over 15,000)
    cityNamesDict = city_filter.normalizeCityNames()
    cities15000 = city_filter.loadCities15000(filename="resources/cities15000.txt")

    nodeDict = dict()  # token -> Node carrying all the edge and city counters
    wordcountDict = defaultdict(int)  # simple wordcount to sort on, keeping track of important ngrams
    tweetsAsDict = Tweet.getTweetAsDictionary(inputFile)
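    # Tweet.getTweetAsDictionary presumably yields one parsed tweet dict per
    # input line; the IOError handler below treats a failed read as
    # end-of-input (an assumption about the project-local helper).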

    i = 0
    try:
        for tweet in tweetsAsDict:
            ngrams = tweet['ngrams']  # we might be interested in skipping 1grams that are not #tags
            for token in ngrams:
                wordcountDict[token] += 1
                if token not in nodeDict:
                    i += 1
                    nodeDict[token] = Node(i, token)
                # processTokenTweet mutates the Node in place and returns it,
                # so there is no need to reassign
                processTokenTweet(nodeDict[token], tweet, cityNamesDict, cities15000)

    except IOError:
        logger.info("End of file")

    # emit one tab-separated row per token, most frequent tokens first
    sorted_wc = sorted(wordcountDict.items(), key=operator.itemgetter(1), reverse=True)
    for wc_tuple in sorted_wc:
        nodeObj = nodeDict[wc_tuple[0]]
        outputWriter.write('{}\n'.format(nodeObj.tabPrint()))
    outputWriter.close()

if __name__ == '__main__':
    main()