Commit d08f9507cb2f7f4bd4af9f2dbbbbaddf1252260b
1 parent: c3e2e271
added new min word frequency param
Showing 1 changed file with 29 additions and 12 deletions
Show diff stats
merge_wordcount_with_edge_features.py
@@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+', | @@ -24,6 +24,7 @@ parser.add_argument('files', metavar='ListOfFiles', nargs='+', | ||
24 | parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist']) | 24 | parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist']) |
25 | parser.add_argument('--c', '-city', nargs='?') | 25 | parser.add_argument('--c', '-city', nargs='?') |
26 | parser.add_argument('--out', '-outputFilename') | 26 | parser.add_argument('--out', '-outputFilename') |
27 | +parser.add_argument('--minfreq', '-mininumWordCountFrequency', type=int) | ||
27 | 28 | ||
28 | 29 | ||
29 | def updateNode(existing_node, temp_node): | 30 | def updateNode(existing_node, temp_node): |
@@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node): | @@ -47,7 +48,7 @@ def updateNode(existing_node, temp_node): | ||
47 | return existing_node | 48 | return existing_node |
48 | 49 | ||
49 | 50 | ||
50 | -def mergeWhitelist(fileList): | 51 | +def mergeWhitelist(fileList, minfreq): |
51 | """ | 52 | """ |
52 | We are given a list of 2 to n files to merge together and dump in an output file | 53 | We are given a list of 2 to n files to merge together and dump in an output file |
53 | By merging we intend sum up the stats on each column | 54 | By merging we intend sum up the stats on each column |
@@ -56,21 +57,37 @@ def mergeWhitelist(fileList): | @@ -56,21 +57,37 @@ def mergeWhitelist(fileList): | ||
56 | """ | 57 | """ |
57 | 58 | ||
58 | tokenDict = dict() | 59 | tokenDict = dict() |
60 | + lines = 0 | ||
61 | + updates = 0 | ||
62 | + new_nodes = 0 | ||
63 | + | ||
59 | for cityFilename in fileList: | 64 | for cityFilename in fileList: |
65 | + print "Opening file: ", cityFilename | ||
60 | inputFile = codecs.open(cityFilename, "r", "utf-8") | 66 | inputFile = codecs.open(cityFilename, "r", "utf-8") |
61 | for line in inputFile: | 67 | for line in inputFile: |
68 | + lines += 1 | ||
62 | node = Node.parseString(line) | 69 | node = Node.parseString(line) |
63 | - if node.description in tokenDict: | ||
64 | - # update node | ||
65 | - existing_node = tokenDict[node.description] | ||
66 | - updateNode(existing_node, node) | ||
67 | - # for space issues we can destroy the object node | ||
68 | - else: | ||
69 | - tokenDict[node.description] = node | ||
70 | - #return tokenDict | 70 | + if node.nodeCount > minfreq: |
71 | + if node.description in tokenDict: | ||
72 | + # update node | ||
73 | + existing_node = tokenDict[node.description] | ||
74 | + updateNode(existing_node, node) | ||
75 | + # for space issues we can destroy the object node | ||
76 | + updates += 1 | ||
77 | + else: | ||
78 | + tokenDict[node.description] = node | ||
79 | + new_nodes += 1 | ||
80 | + print "Dict size: ", len(tokenDict) | ||
81 | + print "Lines: ", lines | ||
82 | + print "Updates: ", updates | ||
83 | + print "New nodes: ", new_nodes | ||
84 | + lines = 0 | ||
85 | + updates = 0 | ||
86 | + new_nodes = 0 | ||
87 | + return tokenDict | ||
71 | 88 | ||
72 | 89 | ||
73 | -def mergeBlackList(city, cityFile): | 90 | +def mergeBlackList(city, cityFile, minfreq): |
74 | """ | 91 | """ |
75 | Gien a city and a file with the features for that city, we search the directory for similar files corresponding to | 92 | Gien a city and a file with the features for that city, we search the directory for similar files corresponding to |
76 | the remaining 9 cities and merge them together in one file. | 93 | the remaining 9 cities and merge them together in one file. |
@@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile): | @@ -94,7 +111,7 @@ def mergeBlackList(city, cityFile): | ||
94 | for line in inputFile: | 111 | for line in inputFile: |
95 | lines += 1 | 112 | lines += 1 |
96 | node = Node.parseString(line) | 113 | node = Node.parseString(line) |
97 | - if node.nodeCount > 1: | 114 | + if node.nodeCount > minfreq: #!!! this restricts a lot the dataset |
98 | if node.description in tokenDict: | 115 | if node.description in tokenDict: |
99 | # update node | 116 | # update node |
100 | existing_node = tokenDict[node.description] | 117 | existing_node = tokenDict[node.description] |
@@ -131,7 +148,7 @@ def main(): | @@ -131,7 +148,7 @@ def main(): | ||
131 | print "You need to indicate the file ( of the city in -c ) which to exclude from the merge" | 148 | print "You need to indicate the file ( of the city in -c ) which to exclude from the merge" |
132 | sys.exit() | 149 | sys.exit() |
133 | 150 | ||
134 | - new_node_dict = mergeBlackList(args.c, args.files[0]) | 151 | + new_node_dict = mergeBlackList(args.c, args.files[0], args.minfreq) |
135 | 152 | ||
136 | for node_desc, node in new_node_dict.iteritems(): | 153 | for node_desc, node in new_node_dict.iteritems(): |
137 | outputWriter.write('{}\n'.format(node.tabPrint())) | 154 | outputWriter.write('{}\n'.format(node.tabPrint())) |