Commit cc3eca96f606290d08fd63530ef894de62bbfaaa

Authored by Cristina Muntean
1 parent 35f8a7d7

changes

graph/Node.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +'''
  3 +local-twitter : Node
  4 +@author: Cristina Muntean (cristina.muntean@isti.cnr.it)
  5 +@date: 7/6/16
  6 +-----------------------------
  7 +
  8 +
  9 +'''
  10 +
  11 +
class Node:
    """Bundle of per-node counters keyed by an id and a description.

    All counters start at zero and are only ever increased by one through
    the ``inc*`` / ``increment*`` methods below.
    """

    def __init__(self, node_id, description):
        """Store the identifier and description and zero every counter."""
        self.id = node_id
        self.description = description

        # Overall occurrence / edge counters.
        self.nodeCount = self.edgeCount = 0
        # Interactions for which the original tweet is not available.
        self.mentionCount = self.replyCount = 0
        # Interactions for which the original is available, so they can be
        # split further into "inner" and "outer".
        self.RTCount = self.innerRTCount = self.outerRTCount = 0
        self.quoteCount = self.innerQuoteCount = self.outerQuoteCount = 0

    def incrementNode(self):
        """Add one to ``nodeCount``."""
        self.nodeCount += 1

    def incrementEdge(self):
        """Add one to ``edgeCount``."""
        self.edgeCount += 1

    # --- interactions without access to the original tweet ---

    def incMention(self):
        """Add one to ``mentionCount``."""
        self.mentionCount += 1

    def incReply(self):
        """Add one to ``replyCount``."""
        self.replyCount += 1

    # --- interactions with access to the original (inner / outer / other) ---

    def incRT(self):
        """Add one to ``RTCount``."""
        self.RTCount += 1

    def incInnerRT(self):
        """Add one to ``innerRTCount``."""
        self.innerRTCount += 1

    def incOuterRT(self):
        """Add one to ``outerRTCount``."""
        self.outerRTCount += 1

    def incQuote(self):
        """Add one to ``quoteCount``."""
        self.quoteCount += 1

    def incInnerQuote(self):
        """Add one to ``innerQuoteCount``."""
        self.innerQuoteCount += 1

    def incOuterQuote(self):
        """Add one to ``outerQuoteCount``."""
        self.outerQuoteCount += 1

    def tabPrint(self):
        """Return description, node, edge, mention and reply counts, tab-separated.

        The retweet and quote counters are not part of the returned line.
        """
        columns = (
            self.description,
            self.nodeCount,
            self.edgeCount,
            self.mentionCount,
            self.replyCount,
        )
        # Builds the template "{}\t{}\t{}\t{}\t{}" from the number of columns.
        template = "\t".join(["{}"] * len(columns))
        return template.format(*columns)
  63 +
  64 +
def main():
    """Placeholder entry point for this module; it intentionally does nothing."""
  67 +
  68 +
# Invoke main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 71 \ No newline at end of file
... ...
simple_plot_scatter_2_distrib.py
... ... @@ -37,8 +37,8 @@ def readPreprocessedData(filename):
37 37 for line in codecs.open(filename, "r", "utf-8"):
38 38 line = line.replace("\n", "")
39 39 data = line.split("\t")
40   - X.append(int(data[0]))
41   - Y.append(int(data[1]))
  40 + X.append(float(data[0]))
  41 + Y.append(float(data[1]))
42 42 labels.append(data[2])
43 43 return X,Y,labels
44 44  
... ... @@ -59,7 +59,7 @@ def scatter_plot(X,Y, labels, plotname):
59 59 plt.scatter(X, Y, s=75, c=T, alpha=.5)
60 60  
61 61 # Plot diagonal line (45 degrees)
62   - plt.plot(np.arange(0, 120000), np.arange(0, 120000))
  62 + plt.plot(np.arange(0.0, 1.0, 0.01), np.arange(0.0, 1.0, 0.01))
63 63  
64 64 # for i, xy in enumerate(zip(X, Y)): # <--
65 65 # # ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='data') # <--
... ... @@ -75,7 +75,7 @@ def scatter_plot(X,Y, labels, plotname):
75 75  
76 76 plt.tight_layout()
77 77 plt.savefig(plotname)
78   - plt.show()
  78 + #plt.show()
79 79  
80 80  
81 81 if __name__ == '__main__':
... ... @@ -92,10 +92,10 @@ if __name__ == &#39;__main__&#39;:
92 92 setStyle()
93 93 X,Y,labels = readPreprocessedData(inputFile)
94 94 print len(X), len(Y), len(labels)
95   - # scatter_plot(X, Y, labels, plotName)
  95 + scatter_plot(X, Y, labels, plotName)
96 96  
97   - for (x,y,label) in zip(X,Y,labels):
98   - if x > 20000 and y < 45000 : print label, x
  97 + # for (x,y,label) in zip(X,Y,labels):
  98 + # if x > 20000 and y < 45000 : print label, x
99 99  
100 100  
101 101 # Yprime = [y-x for (x,y,label) in zip(X,Y, labels) ]
... ...
test_filter.ipynb
... ... @@ -263,7 +263,9 @@
263 263 "collapsed": true
264 264 },
265 265 "outputs": [],
266   - "source": []
  266 + "source": [
  267 + ""
  268 + ]
267 269 },
268 270 {
269 271 "cell_type": "code",
... ... @@ -272,7 +274,9 @@
272 274 "collapsed": true
273 275 },
274 276 "outputs": [],
275   - "source": []
  277 + "source": [
  278 + ""
  279 + ]
276 280 }
277 281 ],
278 282 "metadata": {
... ... @@ -284,7 +288,7 @@
284 288 "language_info": {
285 289 "codemirror_mode": {
286 290 "name": "ipython",
287   - "version": 2
  291 + "version": 2.0
288 292 },
289 293 "file_extension": ".py",
290 294 "mimetype": "text/x-python",
... ... @@ -296,4 +300,4 @@
296 300 },
297 301 "nbformat": 4,
298 302 "nbformat_minor": 0
299 303 -}
  304 +}
300 305 \ No newline at end of file
... ...
wordcount_from_json_list.py
... ... @@ -3,7 +3,28 @@ local-twitter
3 3  
4 4 Therefore the maximum size of a python list on a 32 bit system is 536,870,912 elements.
5 5  
  6 +----------------
  7 +in_reply_to_status_id
  8 +optional
  9 +"null"
  10 +The ID of an existing status that the update is in reply to.
6 11  
  12 +Note:: This parameter will be ignored unless the author of the tweet this parameter references is mentioned within the
  13 +status text. Therefore, you must include @username, where username is the author of the referenced tweet, within the update.
  14 +----------------
  15 +"in_reply_to_status_id_str": null,
  16 +"in_reply_to_user_id_str": null
  17 +"in_reply_to_user_id": null,
  18 +"in_reply_to_screen_name": null,
  19 +----------------
  20 +So you're referring to Quoted Tweets (retweets with comments). There is no official method for that in the REST API
  21 +yet; however, there are a couple of ways to do it.
  22 +
  23 +Since all quoted tweets contain the short url to the original one, you can still use in_reply_to_status_id and filter
  24 +by short url of the original tweet
  25 +Search for tweets that contain the field quoted_status_id this can be done either through REST or STREAMING API.
  26 +quoted_status_id: This field only surfaces when the Tweet is a quote Tweet. This field contains the integer value
  27 +Tweet ID of the quoted Tweet.
7 28  
8 29 @author: cristina muntean
9 30 @date: 28/06/16
... ...
wordcount_from_json_list_with_edge_features.py 0 → 100644
  1 +"""
  2 +local-twitter
  3 +
  4 +Therefore the maximum size of a python list on a 32 bit system is 536,870,912 elements.
  5 +
  6 +----------------
  7 +in_reply_to_status_id
  8 +optional
  9 +"null"
  10 +The ID of an existing status that the update is in reply to.
  11 +
  12 +Note:: This parameter will be ignored unless the author of the tweet this parameter references is mentioned within the
  13 +status text. Therefore, you must include @username, where username is the author of the referenced tweet, within the update.
  14 +----------------
  15 +"in_reply_to_status_id_str": null,
  16 +"in_reply_to_user_id_str": null
  17 +"in_reply_to_user_id": null,
  18 +"in_reply_to_screen_name": null,
  19 +----------------
  20 +So you're referring to Quoted Tweets (retweets with comments). There is no official method for that in the REST API
  21 +yet; however, there are a couple of ways to do it.
  22 +
  23 +Since all quoted tweets contain the short url to the original one, you can still use in_reply_to_status_id and filter
  24 +by short url of the original tweet
  25 +Search for tweets that contain the field quoted_status_id this can be done either through REST or STREAMING API.
  26 +quoted_status_id: This field only surfaces when the Tweet is a quote Tweet. This field contains the integer value
  27 +Tweet ID of the quoted Tweet.
  28 +
  29 +@author: cristina muntean
  30 +@date: 28/06/16
  31 +"""
  32 +
  33 +import codecs
  34 +import json
  35 +import logging
  36 +import sys
  37 +from collections import defaultdict
  38 +import operator
  39 +from filter import city_filter
  40 +from graph import Node
  41 +from twitter.Tweet import Tweet
  42 +
def isMention(tweet):
    """Return True if the tweet text contains a token that looks like an @mention.

    :param tweet: mapping with a ``"text"`` entry holding the tweet text.
    :return: True when at least one whitespace-delimited token starts with
        ``"@"`` and has at least one character after it, False otherwise.
    :raises KeyError: if ``tweet`` has no ``"text"`` entry.
    """
    # split() without an argument splits on any run of whitespace, so a
    # mention that follows a newline or tab is found too (splitting on a
    # single space missed it). A bare "@" is not a mention and is skipped.
    return any(
        token.startswith("@") and len(token) > 1
        for token in tweet["text"].split()
    )
  50 +
def isReply(tweet):
    """Return True when the tweet's ``in_reply_to_status_id_str`` is not None.

    An earlier heuristic (text starting with "@") was abandoned in favour of
    this field check.

    :raises KeyError: if the field is absent from ``tweet``.
    """
    return tweet["in_reply_to_status_id_str"] is not None
  60 +
def isQuote(tweet):
    """Return True when ``tweet`` has a ``"quoted_status"`` key."""
    return "quoted_status" in tweet
  66 +
def isRetweet(tweet):
    """Return True when ``tweet`` has a ``"retweeted_status"`` key."""
    return "retweeted_status" in tweet
  72 +
def hasEdge(tweet):
    """Return True if the tweet is a mention, reply, quote or retweet.

    The checks run in that order and stop at the first one that succeeds.
    """
    checks = (isMention, isReply, isQuote, isRetweet)
    return any(check(tweet) for check in checks)
  78 +
  79 +
def processTokenTweet(node, tweet, cityNamesDict, cities15000):
    """Update ``node``'s counters for one tweet and return the same node.

    :param node: object exposing the ``increment*`` / ``inc*`` counter methods.
    :param tweet: tweet dictionary examined by the ``is*`` predicates.
    :param cityNamesDict: passed through to ``city_filter.get_US_City``.
    :param cities15000: passed through to ``city_filter.get_US_City``.
    :return: ``node``, after its counters have been updated in place.
    """
    # Every call counts as one occurrence of the node.
    node.incrementNode()

    # Plain interaction counts.
    if hasEdge(tweet):
        node.incrementEdge()
    simpleCounters = (
        (isMention, node.incMention),
        (isReply, node.incReply),
        (isRetweet, node.incRT),
        (isQuote, node.incQuote),
    )
    for applies, bump in simpleCounters:
        if applies(tweet):
            bump()

    # City-related counts: compare the city resolved for this tweet with the
    # city resolved for the embedded original tweet (retweet, then quote).
    currentCity = city_filter.get_US_City(tweet, cityNamesDict, cities15000)
    cityCounters = (
        (isRetweet, "retweeted_status", node.incInnerRT, node.incOuterRT),
        (isQuote, "quoted_status", node.incInnerQuote, node.incOuterQuote),
    )
    for applies, originKey, bumpInner, bumpOuter in cityCounters:
        if not applies(tweet):
            continue
        originCity = city_filter.get_US_City(tweet[originKey], cityNamesDict, cities15000)
        if not originCity:
            # No usable city for the original: neither counter is touched.
            continue
        if originCity == currentCity:
            bumpInner()
        else:
            bumpOuter()
    return node
  115 +
  116 +
def main():
    """Count n-gram occurrences and their edge features, one Node per token.

    Command line: ``<inputFile> <outputFile>``. Every token in a tweet's
    ``'ngrams'`` entry gets a node (created on first sight) that is updated
    through ``processTokenTweet``. Nodes are written to the output file, most
    frequent first, one ``tabPrint()`` line each; the first two columns are
    the token and its count.

    Exits with status -1 when the argument count is wrong.
    """
    logger = logging.getLogger("wordcount_retweets.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 3:
        print("You need to pass the following 2 params: <inputFile> <outputFile> ")
        sys.exit(-1)
    inputFile = sys.argv[1]
    outputFilename = sys.argv[2]

    # NOTE(review): `from graph import Node` binds the module graph/Node.py
    # unless the package re-exports the class; accept either form.
    nodeClass = getattr(Node, "Node", Node)

    # The context manager closes the output file even if processing fails.
    with codecs.open(outputFilename, "w", "utf-8") as outputWriter:
        # load cityNamesDict
        cityNamesDict = city_filter.normalizeCityNames()
        cities15000 = city_filter.loadCities15000(filename="resources/cities15000.txt")

        nodeDict = dict()
        tweetsAsDict = Tweet.getTweetAsDictionary(inputFile)
        try:
            for tweet in tweetsAsDict:
                for token in tweet['ngrams']:
                    node = nodeDict.get(token)
                    if node is None:
                        # First occurrence: create the node. The description is
                        # the JSON-escaped token without quotes, the same text
                        # the plain word-count output used for its first column.
                        node = nodeClass(token, json.dumps(token).replace('"', ''))
                        nodeDict[token] = node
                    processTokenTweet(node, tweet, cityNamesDict, cities15000)
        except IOError:
            print("End of file ")

        # Most frequent tokens first.
        sortedNodes = sorted(nodeDict.values(), key=operator.attrgetter("nodeCount"), reverse=True)
        for node in sortedNodes:
            outputWriter.write(node.tabPrint() + "\n")

    logger.info("Wrote %d nodes to %s", len(nodeDict), outputFilename)
  150 +
# Invoke main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
... ...