Commit d08f9507cb2f7f4bd4af9f2dbbbbaddf1252260b

Authored by Cristina Muntean
1 parent c3e2e271

added new min word frequency param

Showing 1 changed file with 29 additions and 12 deletions   Show diff stats
merge_wordcount_with_edge_features.py
... ... @@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+',
24 24 parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist'])
25 25 parser.add_argument('--c', '-city', nargs='?')
26 26 parser.add_argument('--out', '-outputFilename')
  27 +parser.add_argument('--minfreq', '-mininumWordCountFrequency', type=int)
27 28  
28 29  
29 30 def updateNode(existing_node, temp_node):
... ... @@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node):
47 48 return existing_node
48 49  
49 50  
50   -def mergeWhitelist(fileList):
  51 +def mergeWhitelist(fileList, minfreq):
51 52 """
52 53 We are given a list of 2 to n files to merge together and dump in an output file
52 53 By merging we intend to sum up the stats on each column
... ... @@ -56,21 +57,37 @@ def mergeWhitelist(fileList):
56 57 """
57 58  
58 59 tokenDict = dict()
  60 + lines = 0
  61 + updates = 0
  62 + new_nodes = 0
  63 +
59 64 for cityFilename in fileList:
  65 + print "Opening file: ", cityFilename
60 66 inputFile = codecs.open(cityFilename, "r", "utf-8")
61 67 for line in inputFile:
  68 + lines += 1
62 69 node = Node.parseString(line)
63   - if node.description in tokenDict:
64   - # update node
65   - existing_node = tokenDict[node.description]
66   - updateNode(existing_node, node)
67   - # for space issues we can destroy the object node
68   - else:
69   - tokenDict[node.description] = node
70   - #return tokenDict
  70 + if node.nodeCount > minfreq:
  71 + if node.description in tokenDict:
  72 + # update node
  73 + existing_node = tokenDict[node.description]
  74 + updateNode(existing_node, node)
  75 + # for space issues we can destroy the object node
  76 + updates += 1
  77 + else:
  78 + tokenDict[node.description] = node
  79 + new_nodes += 1
  80 + print "Dict size: ", len(tokenDict)
  81 + print "Lines: ", lines
  82 + print "Updates: ", updates
  83 + print "New nodes: ", new_nodes
  84 + lines = 0
  85 + updates = 0
  86 + new_nodes = 0
  87 + return tokenDict
71 88  
72 89  
73   -def mergeBlackList(city, cityFile):
  90 +def mergeBlackList(city, cityFile, minfreq):
74 91 """
75 92 Given a city and a file with the features for that city, we search the directory for similar files corresponding to
76 93 the remaining 9 cities and merge them together in one file.
... ... @@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile):
94 111 for line in inputFile:
95 112 lines += 1
96 113 node = Node.parseString(line)
97   - if node.nodeCount > 1:
  114 + if node.nodeCount > minfreq: #!!! this restricts a lot the dataset
98 115 if node.description in tokenDict:
99 116 # update node
100 117 existing_node = tokenDict[node.description]
... ... @@ -131,7 +148,7 @@ def main():
131 148 print "You need to indicate the file ( of the city in -c ) which to exclude from the merge"
132 149 sys.exit()
133 150  
134   - new_node_dict = mergeBlackList(args.c, args.files[0])
  151 + new_node_dict = mergeBlackList(args.c, args.files[0], args.minfreq)
135 152  
136 153 for node_desc, node in new_node_dict.iteritems():
137 154 outputWriter.write('{}\n'.format(node.tabPrint()))
... ...