# filter_tweets_by_city.py
"""
local-twitter

@author: cristina muntean
@date: 28/06/16
"""

import codecs
import json
import logging
import sys
from filter import city_filter
from twitter.Tweet import Tweet
from util import ngrams


def wordcountPlain(tweet, onlyHashtags=False, ngram=1):
    """Extract token n-grams (hashtags included) from a tweet's text.

    Tokens of length <= 2 and URL/@mention tokens are discarded.

    :param tweet: tweet as a dict with a 'text' field
    :param onlyHashtags: when True (and ngram == 1), keep only '#' tokens;
                         ignored when ngram > 1
    :param ngram: maximum n-gram size; 1 returns plain tokens
    :return: list of ngrams with hashtags
    """
    tokens = Tweet.tokenizeTweetText(tweet['text'])

    # keep tokens longer than 2 chars that are not URLs or @mentions
    filtered = [tok for tok in tokens
                if len(tok) > 2 and not ngrams.is_url_or_mention(tok)]

    if ngram > 1:
        # unigrams first, then sliding windows of sizes 2..ngram
        result = list(filtered)
        for size in range(2, ngram + 1):
            result.extend(ngrams.window(filtered, size))
        return result

    if onlyHashtags:
        return [tok for tok in filtered if tok.startswith('#')]
    return filtered

def dumpDictValuesToFile(tweetAsDict, filename):
    """Serialize one tweet dict as a single JSON line to an open writer.

    The city is already embedded in this tweet.
    :param tweetAsDict: tweet as a dict
    :param filename: open, writable file-like object
    :return: None
    """
    filename.write(json.dumps(tweetAsDict) + "\n")


def main():
    logger = logging.getLogger("filter_tweets_by_city.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 3:
        print "You need to pass the following 2 params: <jsonTweetsFile> <outputDir>"
        sys.exit(-1)
    inputFile = sys.argv[1]
    date = filter(str.isdigit, inputFile)
    outputDir = sys.argv[2]
    writerDict = dict()

    # create 10 writers!
    for city in city_filter.US_CITIES:
        outputFile = outputDir + "/" + city.replace(" ","_") + "_" + date + ".json"
        outputWriter = codecs.open(outputFile, "w", "utf-8")
        writerDict[city] = outputWriter


    # load cityNamesDict
    cityNamesDict = city_filter.normalizeCityNames()
    cities15000 = city_filter.loadCities15000(filename="resources/cities15000.txt")


    # filter tweets per city
    tweetsAsDict = Tweet.getTweetAsDictionaryFromGZ(inputFile)
    i = 0
    try:
        for tweet in tweetsAsDict:
            i += 1
            # get US city
            city = city_filter.get_US_City(tweet, cityNamesDict, cities15000)
            if city:
                tweet["city"] = city
                ngrams = wordcountPlain(tweet, False, 4)
                tweet["ngrams"] = ngrams
                wr = writerDict[city]
                dumpDictValuesToFile(tweet, wr)
    except IOError:
        print "End of file ", tweet["id_str"]

    # close writers
    for city, wr in writerDict.iteritems():
        wr.close()

if __name__ == '__main__':
    main()