Commit cd0b644dbbe7d2bc7a7fdfd9cafac9a58e3ba283
1 parent
a0356894
minor
Showing
1 changed file
with
8 additions
and
2 deletions
Show diff stats
prepare_plot_scatter_2_distrib.py
... | ... | @@ -13,9 +13,12 @@ we do a scatterplot |
13 | 13 | import argparse |
14 | 14 | import codecs |
15 | 15 | import logging |
16 | +import os | |
16 | 17 | import sys |
17 | 18 | from collections import defaultdict |
18 | 19 | |
20 | +stopwords = open('/resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n') | |
21 | + | |
19 | 22 | |
20 | 23 | parser = argparse.ArgumentParser(description='<ix two files with frequencies to discover local topics') |
21 | 24 | parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens' |
... | ... | @@ -73,7 +76,8 @@ def readFromFileMultipleEdges(filename, columnNumber): |
73 | 76 | # read first line to get max |
74 | 77 | word = rowDataCols[1] |
75 | 78 | counter = int(rowDataCols[columnNumber-1]) # the index starts at 0 so column 9 corresponds to 8 in the array index |
76 | - docList.append(tuple([word, counter])) | |
79 | + if word not in stopwords and counter > 0: | |
80 | + docList.append(tuple([word, counter])) | |
77 | 81 | max_in_distrib = counter |
78 | 82 | |
79 | 83 | # read other lines |
... | ... | @@ -82,7 +86,9 @@ def readFromFileMultipleEdges(filename, columnNumber): |
82 | 86 | rowDataCols = line.replace("\n", "").split("\t") |
83 | 87 | word = rowDataCols[1] |
84 | 88 | counter = int(rowDataCols[columnNumber - 1]) |
85 | - docList.append(tuple([word, counter])) | |
89 | + ### we can make a value check - min 1 !!! | |
90 | + if word not in stopwords and counter > 0: | |
91 | + docList.append(tuple([word, counter])) | |
86 | 92 | if counter > max_in_distrib: |
87 | 93 | max_in_distrib = counter |
88 | 94 | else: | ... | ... |