Commit cd0b644dbbe7d2bc7a7fdfd9cafac9a58e3ba283

Authored by Cristina Muntean
1 parent a0356894

minor

Showing 1 changed file with 8 additions and 2 deletions   Show diff stats
prepare_plot_scatter_2_distrib.py
@@ -13,9 +13,12 @@ we do a scatterplot @@ -13,9 +13,12 @@ we do a scatterplot
13 import argparse 13 import argparse
14 import codecs 14 import codecs
15 import logging 15 import logging
  16 +import os
16 import sys 17 import sys
17 from collections import defaultdict 18 from collections import defaultdict
18 19
  20 +stopwords = open('/resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n')
  21 +
19 22
20 parser = argparse.ArgumentParser(description='<ix two files with frequencies to discover local topics') 23 parser = argparse.ArgumentParser(description='<ix two files with frequencies to discover local topics')
21 parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens' 24 parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens'
@@ -73,7 +76,8 @@ def readFromFileMultipleEdges(filename, columnNumber): @@ -73,7 +76,8 @@ def readFromFileMultipleEdges(filename, columnNumber):
73 # read first line to get max 76 # read first line to get max
74 word = rowDataCols[1] 77 word = rowDataCols[1]
75 counter = int(rowDataCols[columnNumber-1]) # the index starts at 0 so column 9 corresponds to 8 in the array index 78 counter = int(rowDataCols[columnNumber-1]) # the index starts at 0 so column 9 corresponds to 8 in the array index
76 - docList.append(tuple([word, counter])) 79 + if word not in stopwords and counter > 0:
  80 + docList.append(tuple([word, counter]))
77 max_in_distrib = counter 81 max_in_distrib = counter
78 82
79 # read other lines 83 # read other lines
@@ -82,7 +86,9 @@ def readFromFileMultipleEdges(filename, columnNumber): @@ -82,7 +86,9 @@ def readFromFileMultipleEdges(filename, columnNumber):
82 rowDataCols = line.replace("\n", "").split("\t") 86 rowDataCols = line.replace("\n", "").split("\t")
83 word = rowDataCols[1] 87 word = rowDataCols[1]
84 counter = int(rowDataCols[columnNumber - 1]) 88 counter = int(rowDataCols[columnNumber - 1])
85 - docList.append(tuple([word, counter])) 89 + ### we can make a value check - min 1 !!!
  90 + if word not in stopwords and counter > 0:
  91 + docList.append(tuple([word, counter]))
86 if counter > max_in_distrib: 92 if counter > max_in_distrib:
87 max_in_distrib = counter 93 max_in_distrib = counter
88 else: 94 else: