filter_tweets_by_city.py
2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
local-twitter
@author: cristina muntean
@date: 28/06/16
"""
import codecs
import json
import logging
import sys
from filter import city_filter
from twitter.Tweet import Tweet
from util import ngrams
def wordcountPlain(tweet, onlyHashtags=False, ngram=1):
    """
    Tokenize a tweet's text and return its unigrams (and optionally
    higher-order ngrams or only its hashtags).

    :param tweet: tweet as a dict; only the 'text' field is read
    :param onlyHashtags: if True (and ngram == 1) keep only '#'-prefixed tokens
    :param ngram: maximum ngram size; when > 1, the result is the unigrams
                  followed by every window of size 2..ngram
    :return: list of tokens (and, for ngram > 1, token tuples from
             ngrams.window); hashtags are kept in all modes
    """
    tokens = Tweet.tokenizeTweetText(tweet['text'])
    # drop short tokens (<= 2 chars) plus URLs and @-mentions
    filtered = [tok for tok in tokens
                if len(tok) > 2 and not ngrams.is_url_or_mention(tok)]

    if ngram > 1:
        # unigrams first, then sliding windows of each size 2..ngram
        result = list(filtered)
        for size in range(2, ngram + 1):
            result.extend(ngrams.window(filtered, size))
        return result

    if onlyHashtags:
        return [tok for tok in filtered if tok.startswith('#')]
    return filtered
def dumpDictValuesToFile(tweetAsDict, filename):
    """
    Serialize one tweet dict as a single JSON line to an open writer.

    The city is already embedded in this tweet.

    :param tweetAsDict: tweet (with "city" already set) as a dict
    :param filename: an open, writable file-like object (a writer,
                     despite the parameter name)
    :return: None
    """
    filename.write(json.dumps(tweetAsDict) + "\n")
def main():
logger = logging.getLogger("filter_tweets_by_city.py")
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")
if len(sys.argv) != 3:
print "You need to pass the following 2 params: <jsonTweetsFile> <outputDir>"
sys.exit(-1)
inputFile = sys.argv[1]
date = filter(str.isdigit, inputFile)
outputDir = sys.argv[2]
writerDict = dict()
# create 10 writers!
for city in city_filter.US_CITIES:
outputFile = outputDir + "/" + city.replace(" ","_") + "_" + date + ".json"
outputWriter = codecs.open(outputFile, "w", "utf-8")
writerDict[city] = outputWriter
# load cityNamesDict
cityNamesDict = city_filter.normalizeCityNames()
cities15000 = city_filter.loadCities15000(filename="resources/cities15000.txt")
# filter tweets per city
tweetsAsDict = Tweet.getTweetAsDictionaryFromGZ(inputFile)
i = 0
try:
for tweet in tweetsAsDict:
i += 1
# get US city
city = city_filter.get_US_City(tweet, cityNamesDict, cities15000)
if city:
tweet["city"] = city
ngrams = wordcountPlain(tweet, False, 4)
tweet["ngrams"] = ngrams
wr = writerDict[city]
dumpDictValuesToFile(tweet, wr)
except IOError:
print "End of file ", tweet["id_str"]
# close writers
for city, wr in writerDict.iteritems():
wr.close()
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()