diff --git a/merge_wordcount_with_edge_features.py b/merge_wordcount_with_edge_features.py index 5c68a16..169d77b 100644 --- a/merge_wordcount_with_edge_features.py +++ b/merge_wordcount_with_edge_features.py @@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+', parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist']) parser.add_argument('--c', '-city', nargs='?') parser.add_argument('--out', '-outputFilename') +parser.add_argument('--minfreq', '-mininumWordCountFrequency', type=int) def updateNode(existing_node, temp_node): @@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node): return existing_node -def mergeWhitelist(fileList): +def mergeWhitelist(fileList, minfreq): """ We are given a list of 2 to n files to merge together and dump in an output file By merging we intend sum up the stats on each column @@ -56,21 +57,37 @@ def mergeWhitelist(fileList): """ tokenDict = dict() + lines = 0 + updates = 0 + new_nodes = 0 + for cityFilename in fileList: + print "Opening file: ", cityFilename inputFile = codecs.open(cityFilename, "r", "utf-8") for line in inputFile: + lines += 1 node = Node.parseString(line) - if node.description in tokenDict: - # update node - existing_node = tokenDict[node.description] - updateNode(existing_node, node) - # for space issues we can destroy the object node - else: - tokenDict[node.description] = node - #return tokenDict + if node.nodeCount > minfreq: + if node.description in tokenDict: + # update node + existing_node = tokenDict[node.description] + updateNode(existing_node, node) + # for space issues we can destroy the object node + updates += 1 + else: + tokenDict[node.description] = node + new_nodes += 1 + print "Dict size: ", len(tokenDict) + print "Lines: ", lines + print "Updates: ", updates + print "New nodes: ", new_nodes + lines = 0 + updates = 0 + new_nodes = 0 + return tokenDict -def mergeBlackList(city, cityFile): +def mergeBlackList(city, cityFile, minfreq): """ Gien a city and a file with the features for that city, we search the directory for similar files corresponding to the remaining 9 cities and merge them together in one file. @@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile): for line in inputFile: lines += 1 node = Node.parseString(line) - if node.nodeCount > 1: + if node.nodeCount > minfreq: #!!! this restricts a lot the dataset if node.description in tokenDict: # update node existing_node = tokenDict[node.description] @@ -131,7 +148,7 @@ def main(): print "You need to indicate the file ( of the city in -c ) which to exclude from the merge" sys.exit() - new_node_dict = mergeBlackList(args.c, args.files[0]) + new_node_dict = mergeBlackList(args.c, args.files[0], args.minfreq) for node_desc, node in new_node_dict.iteritems(): outputWriter.write('{}\n'.format(node.tabPrint())) -- libgit2 0.21.4