Commit d08f9507cb2f7f4bd4af9f2dbbbbaddf1252260b

Authored by Cristina Muntean
1 parent c3e2e271

added new min word frequency param

Showing 1 changed file with 29 additions and 12 deletions   Show diff stats
merge_wordcount_with_edge_features.py
@@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+', @@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+',
24 parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist']) 24 parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist'])
25 parser.add_argument('--c', '-city', nargs='?') 25 parser.add_argument('--c', '-city', nargs='?')
26 parser.add_argument('--out', '-outputFilename') 26 parser.add_argument('--out', '-outputFilename')
  27 +parser.add_argument('--minfreq', '-mininumWordCountFrequency', type=int)
27 28
28 29
29 def updateNode(existing_node, temp_node): 30 def updateNode(existing_node, temp_node):
@@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node): @@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node):
47 return existing_node 48 return existing_node
48 49
49 50
50 -def mergeWhitelist(fileList): 51 +def mergeWhitelist(fileList, minfreq):
51 """ 52 """
52 We are given a list of 2 to n files to merge together and dump in an output file 53 We are given a list of 2 to n files to merge together and dump in an output file
53 By merging we intend to sum up the stats on each column 54 By merging we intend to sum up the stats on each column
@@ -56,21 +57,37 @@ def mergeWhitelist(fileList): @@ -56,21 +57,37 @@ def mergeWhitelist(fileList):
56 """ 57 """
57 58
58 tokenDict = dict() 59 tokenDict = dict()
  60 + lines = 0
  61 + updates = 0
  62 + new_nodes = 0
  63 +
59 for cityFilename in fileList: 64 for cityFilename in fileList:
  65 + print "Opening file: ", cityFilename
60 inputFile = codecs.open(cityFilename, "r", "utf-8") 66 inputFile = codecs.open(cityFilename, "r", "utf-8")
61 for line in inputFile: 67 for line in inputFile:
  68 + lines += 1
62 node = Node.parseString(line) 69 node = Node.parseString(line)
63 - if node.description in tokenDict:  
64 - # update node  
65 - existing_node = tokenDict[node.description]  
66 - updateNode(existing_node, node)  
67 - # for space issues we can destroy the object node  
68 - else:  
69 - tokenDict[node.description] = node  
70 - #return tokenDict 70 + if node.nodeCount > minfreq:
  71 + if node.description in tokenDict:
  72 + # update node
  73 + existing_node = tokenDict[node.description]
  74 + updateNode(existing_node, node)
  75 + # for space issues we can destroy the object node
  76 + updates += 1
  77 + else:
  78 + tokenDict[node.description] = node
  79 + new_nodes += 1
  80 + print "Dict size: ", len(tokenDict)
  81 + print "Lines: ", lines
  82 + print "Updates: ", updates
  83 + print "New nodes: ", new_nodes
  84 + lines = 0
  85 + updates = 0
  86 + new_nodes = 0
  87 + return tokenDict
71 88
72 89
73 -def mergeBlackList(city, cityFile): 90 +def mergeBlackList(city, cityFile, minfreq):
74 """ 91 """
75 Given a city and a file with the features for that city, we search the directory for similar files corresponding to 92 Given a city and a file with the features for that city, we search the directory for similar files corresponding to
76 the remaining 9 cities and merge them together in one file. 93 the remaining 9 cities and merge them together in one file.
@@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile): @@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile):
94 for line in inputFile: 111 for line in inputFile:
95 lines += 1 112 lines += 1
96 node = Node.parseString(line) 113 node = Node.parseString(line)
97 - if node.nodeCount > 1: 114 + if node.nodeCount > minfreq: #!!! this restricts the dataset a lot
98 if node.description in tokenDict: 115 if node.description in tokenDict:
99 # update node 116 # update node
100 existing_node = tokenDict[node.description] 117 existing_node = tokenDict[node.description]
@@ -131,7 +148,7 @@ def main(): @@ -131,7 +148,7 @@ def main():
131 print "You need to indicate the file ( of the city in -c ) which to exclude from the merge" 148 print "You need to indicate the file ( of the city in -c ) which to exclude from the merge"
132 sys.exit() 149 sys.exit()
133 150
134 - new_node_dict = mergeBlackList(args.c, args.files[0]) 151 + new_node_dict = mergeBlackList(args.c, args.files[0], args.minfreq)
135 152
136 for node_desc, node in new_node_dict.iteritems(): 153 for node_desc, node in new_node_dict.iteritems():
137 outputWriter.write('{}\n'.format(node.tabPrint())) 154 outputWriter.write('{}\n'.format(node.tabPrint()))