Commit cd0b644dbbe7d2bc7a7fdfd9cafac9a58e3ba283

Authored by Cristina Muntean
1 parent a0356894

minor

Showing 1 changed file with 8 additions and 2 deletions   Show diff stats
prepare_plot_scatter_2_distrib.py
... ... @@ -13,9 +13,12 @@ we do a scatterplot
13 13 import argparse
14 14 import codecs
15 15 import logging
  16 +import os
16 17 import sys
17 18 from collections import defaultdict
18 19  
  20 +stopwords = open('/resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n')
  21 +
19 22  
20 23 parser = argparse.ArgumentParser(description='<ix two files with frequencies to discover local topics')
21 24 parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens'
... ... @@ -73,7 +76,8 @@ def readFromFileMultipleEdges(filename, columnNumber):
73 76 # read first line to get max
74 77 word = rowDataCols[1]
75 78 counter = int(rowDataCols[columnNumber-1]) # the index starts at 0 so column 9 corresponds to 8 in the array index
76   - docList.append(tuple([word, counter]))
  79 + if word not in stopwords and counter > 0:
  80 + docList.append(tuple([word, counter]))
77 81 max_in_distrib = counter
78 82  
79 83 # read other lines
... ... @@ -82,7 +86,9 @@ def readFromFileMultipleEdges(filename, columnNumber):
82 86 rowDataCols = line.replace("\n", "").split("\t")
83 87 word = rowDataCols[1]
84 88 counter = int(rowDataCols[columnNumber - 1])
85   - docList.append(tuple([word, counter]))
  89 + ### we can make a value check - min 1 !!!
  90 + if word not in stopwords and counter > 0:
  91 + docList.append(tuple([word, counter]))
86 92 if counter > max_in_distrib:
87 93 max_in_distrib = counter
88 94 else:
... ...