merge_wordcount_with_edge_features.py
4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
'''
local-twitter : merge_wordcount_with_edge_features
@author: Cristina Muntean (cristina.muntean@isti.cnr.it)
@date: 9/21/16
-----------------------------
We now have two options for merging:
1. give a list of files to merge - 2 files?
2. give a name of a city and a file (which shouldn't be merged) and merge the other 9
'''
import argparse
import codecs
import os
import sys
from graph.Node import Node
# Command-line interface of the script.
# argparse derives the attribute name from the first double-dash option
# string, so the parsed values are read as args.o, args.c and args.out.
parser = argparse.ArgumentParser(description='Merge some files.')
parser.add_argument('files', metavar='ListOfFiles', nargs='+',
                    help='a list of files to be merged')
parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist'],
                    help='merge mode: "whitelist" merges exactly the listed '
                         'files, anything else (including omitting the '
                         'option) runs the blacklist merge')
parser.add_argument('--c', '-city', nargs='?',
                    help='name of the city to leave out of a blacklist merge')
# main() always opens the output file, so a missing name is rejected here
# with a usage message instead of failing later inside codecs.open.
parser.add_argument('--out', '-outputFilename', required=True,
                    help='name of the file the merged nodes are written to')
def updateNode(existing_node, temp_node):
    """
    Fold the counters of temp_node into existing_node.

    Every counter attribute read from temp_node is handed to the matching
    increment method of existing_node, so existing_node is modified in place.
    :param existing_node: the node that accumulates the values
    :param temp_node: the node whose counters are added; it is only read
    :return: existing_node, i.e. the very object that was passed in
    """
    # (increment method of existing_node, counter attribute of temp_node)
    counter_updates = (
        ("incrementNode", "nodeCount"),
        ("incrementEdge", "edgeCount"),
        ("incMention", "mentionCount"),
        ("incReply", "replyCount"),
        ("incRT", "RTCount"),
        ("incInnerRT", "innerRTCount"),
        ("incOuterRT", "outerRTCount"),
        ("incQuote", "quoteCount"),
        ("incInnerQuote", "innerQuoteCount"),
        ("incOuterQuote", "outerQuoteCount"),
    )
    for method_name, counter_name in counter_updates:
        increment = getattr(existing_node, method_name)
        increment(getattr(temp_node, counter_name))
    return existing_node
def mergeWhitelist(fileList):
    """
    Merge the node statistics of every file in fileList.

    Each line is parsed with Node.parseString. The first node seen for a
    description is stored; every later node with the same description is
    folded into the stored one with updateNode.
    :param fileList: list of the files to be merged (read as UTF-8)
    :return: a dictionary mapping each node description to its merged node
    """
    tokenDict = dict()
    for cityFilename in fileList:
        # the with-block closes each input as soon as it has been consumed
        with codecs.open(cityFilename, "r", "utf-8") as inputFile:
            for line in inputFile:
                node = Node.parseString(line)
                if node.description in tokenDict:
                    # fold the new values into the node we already hold
                    updateNode(tokenDict[node.description], node)
                else:
                    tokenDict[node.description] = node
    # main() iterates over the result, so the dictionary must be returned
    return tokenDict
def mergeBlackList(city, cityFile):
    """
    Merge the feature files that sit next to cityFile but belong to other cities.

    The directory of cityFile is scanned for files whose name does not
    contain city but does contain what is left of cityFile's name once city
    is removed from it. Their lines are parsed with Node.parseString, nodes
    whose nodeCount is 1 or less are skipped, and nodes sharing a description
    are combined with updateNode. Per-file statistics are printed.
    :param city: a string with the name of the city NOT to be included in the merge
    :param cityFile: the file of features for that city
    :return: a dictionary mapping each node description to its merged node
    """
    tokenDict = dict()
    fileDir = os.path.dirname(os.path.realpath(cityFile))
    # the file name without the city is the marker shared by the sibling files
    filetype = os.path.basename(cityFile).replace(city, "")
    for filename in os.listdir(fileDir):
        if city in filename or filetype not in filename:
            continue
        # single-argument print calls give the same output on Python 2 and 3
        print("Opening file:  %s" % filename)
        # statistics are reported per file, so they start from zero each time
        lines = 0
        updates = 0
        new_nodes = 0
        # the with-block closes the input even if a line fails to parse
        with codecs.open(os.path.join(fileDir, filename), "r", "utf-8") as inputFile:
            for line in inputFile:
                lines += 1
                node = Node.parseString(line)
                if node.nodeCount > 1:
                    if node.description in tokenDict:
                        # fold the new values into the node we already hold
                        updateNode(tokenDict[node.description], node)
                        updates += 1
                    else:
                        tokenDict[node.description] = node
                        new_nodes += 1
        print("Dict size:  %d" % len(tokenDict))
        print("Lines:  %d" % lines)
        print("Updates:  %d" % updates)
        print("New nodes:  %d" % new_nodes)
    return tokenDict
def main():
    """
    Parse the command line, run the requested merge and write the result.

    With "--o whitelist" all the given files are merged with mergeWhitelist;
    otherwise exactly one file and a city (--c) are expected and
    mergeBlackList is run. Every merged node is written on its own line,
    formatted by its tabPrint method, to the UTF-8 file named by --out.
    Invalid arguments print a message and end the program with exit status 1.
    :return:
    """
    args = parser.parse_args()
    print(args)

    # Validate everything before touching the output file, so that a wrong
    # invocation never truncates an existing file.
    if args.out is None:
        print("You need to give the name of the output file using --out")
        sys.exit(1)
    whitelist = args.o == "whitelist"
    if not whitelist:
        if args.c is None:
            print("You need to give the name of the seed city (we merge the remaining 9 cities) using -c")
            sys.exit(1)
        if len(args.files) != 1:
            print("You need to indicate the file ( of the city in -c ) which to exclude from the merge")
            sys.exit(1)

    # the with-block closes the output even if the merge or a write fails
    with codecs.open(args.out, "w", "utf-8") as outputWriter:
        if whitelist:
            new_node_dict = mergeWhitelist(args.files)
        else:
            new_node_dict = mergeBlackList(args.c, args.files[0])
        for node in new_node_dict.values():
            outputWriter.write('{}\n'.format(node.tabPrint()))


if __name__ == '__main__':
    main()