wordcount_from_json_list_with_edge_features.py
5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
local-twitter
Therefore the maximum size of a python list on a 32 bit system is 536,870,912 elements.
----------------
in_reply_to_status_id
optional
"null"
The ID of an existing status that the update is in reply to.
Note: This parameter will be ignored unless the author of the tweet this parameter references is mentioned within the
status text. Therefore, you must include @username, where username is the author of the referenced tweet, within the update.
----------------
"in_reply_to_status_id_str": null,
"in_reply_to_user_id_str": null
"in_reply_to_user_id": null,
"in_reply_to_screen_name": null,
----------------
So you're referring to Quoted Tweets (retweet with comments). There is no official method for that from the REST API
yet, however, there are couple of ways to do it.
Since all quoted tweets contain the short url to the original one, you can still use in_reply_to_status_id and filter
by short url of the original tweet
Search for tweets that contain the field quoted_status_id this can be done either through REST or STREAMING API.
quoted_status_id: This field only surfaces when the Tweet is a quote Tweet. This field contains the integer value
Tweet ID of the quoted Tweet.
@author: cristina muntean
@date: 28/06/16
"""
import codecs
import json
import logging
import sys
from collections import defaultdict
import operator
from filter import city_filter
from graph.Node import Node
from twitter.Tweet import Tweet
def isMention(tweet):
    """Return True if the tweet text contains at least one @mention.

    A token counts as a mention only when it *starts* with '@', so e-mail
    addresses like "a@b.com" are not matched.

    :param tweet: tweet as a dict; must contain a "text" key.
    :return: bool
    """
    # split() (no argument) splits on any whitespace run, so mentions
    # after newlines/tabs are found too — split(" ") missed those.
    for token in tweet["text"].split():
        if token.startswith("@"):
            return True
    return False
def isReply(tweet):
    """Return True if the tweet is a reply to another status.

    Relies on the API metadata field rather than the text starting
    with '@', which would also match plain mentions.

    :param tweet: tweet as a dict; must contain "in_reply_to_status_id_str".
    :return: bool
    """
    return tweet["in_reply_to_status_id_str"] is not None
def isQuote(tweet):
    """Return True if this is a quote tweet (retweet with comment).

    The "quoted_status" key only surfaces on quote tweets.

    :param tweet: tweet as a dict.
    :return: bool
    """
    return "quoted_status" in tweet
def isRetweet(tweet):
    """Return True if this tweet is a retweet.

    The "retweeted_status" key only surfaces on retweets.

    :param tweet: tweet as a dict.
    :return: bool
    """
    return "retweeted_status" in tweet
def hasEdge(tweet):
    """Return True if the tweet creates any graph edge.

    An edge exists when the tweet is a mention, a reply, a quote tweet
    or a retweet.

    :param tweet: tweet as a dict.
    :return: bool
    """
    return isMention(tweet) or isReply(tweet) or isQuote(tweet) or isRetweet(tweet)
def processTokenTweet(node, tweet, cityNamesDict, cities15000):
    """Update a token's Node counters based on one tweet mentioning it.

    Increments the plain occurrence count, the per-edge-type counts
    (mention / reply / retweet / quote), and — when the tweet is a
    retweet or a quote — the inner/outer city counts depending on
    whether the original tweet comes from the same US city.

    :param node: graph.Node.Node instance for this token (mutated in place).
    :param tweet: tweet as a dict.
    :param cityNamesDict: normalized city-name lookup for city_filter.
    :param cities15000: cities15000 data for city_filter.
    :return: the same (mutated) node, for convenience.
    """
    # wordcount
    node.incrementNode()

    # Evaluate each edge predicate exactly once; the original code called
    # them twice (inside hasEdge and again below), re-splitting the text.
    mention = isMention(tweet)
    reply = isReply(tweet)
    retweet = isRetweet(tweet)
    quote = isQuote(tweet)

    # simple counts
    if mention or reply or retweet or quote:
        node.incrementEdge()
    if mention:
        node.incMention()
    if reply:
        node.incReply()
    if retweet:
        node.incRT()
    if quote:
        node.incQuote()

    # city related counts: compare the current tweet's US city with the
    # origin tweet's city for retweets and quotes
    currentCity = city_filter.get_US_City(tweet, cityNamesDict, cities15000)
    if retweet:
        originCity = city_filter.get_US_City(tweet["retweeted_status"], cityNamesDict, cities15000)
        if originCity:
            if originCity == currentCity:
                node.incInnerRT()
            else:
                node.incOuterRT()
    if quote:
        originCity = city_filter.get_US_City(tweet["quoted_status"], cityNamesDict, cities15000)
        if originCity:
            if originCity == currentCity:
                node.incInnerQuote()
            else:
                node.incOuterQuote()
    return node
def main():
    """Read tweets (JSON list), build per-token Nodes with edge/city
    counters, and write them out sorted by token frequency.

    Usage: script <inputFile> <outputFile>
    """
    logger = logging.getLogger("wordcount_from_json_list_with_edge_features.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 3:
        # print(...) with a single argument is valid in both Python 2 and 3
        print("You need to pass the following 2 params: <inputFile> <outputFile> ")
        sys.exit(-1)
    inputFile = sys.argv[1]
    outputFilename = sys.argv[2]

    # load city lookup tables for city_filter
    cityNamesDict = city_filter.normalizeCityNames()
    cities15000 = city_filter.loadCities15000(filename="resources/cities15000.txt")

    nodeDict = dict()  # token -> Node with edge/city counters
    wordcountDict = defaultdict(int)  # simple wordcount to sort on
    tweetsAsDict = Tweet.getTweetAsDictionary(inputFile)

    i = 0  # running node id
    try:
        for tweet in tweetsAsDict:
            # we might be interested in not looking at 1grams that are not #tags
            ngrams = tweet['ngrams']
            for token in ngrams:
                wordcountDict[token] += 1
                if token in nodeDict:
                    # same object from the dict — mutated in place
                    processTokenTweet(nodeDict[token], tweet, cityNamesDict, cities15000)
                else:
                    i += 1
                    node = Node(i, token)
                    processTokenTweet(node, tweet, cityNamesDict, cities15000)
                    nodeDict[token] = node
    except IOError:
        print("End of file ")

    # write nodes ordered by descending token frequency; the context
    # manager guarantees the file is closed even if a write fails
    sorted_wc = sorted(wordcountDict.items(), key=operator.itemgetter(1), reverse=True)
    with codecs.open(outputFilename, "w", "utf-8") as outputWriter:
        for wc_tuple in sorted_wc:
            nodeObj = nodeDict[wc_tuple[0]]
            outputWriter.write('{}\n'.format(nodeObj.tabPrint()))
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()