Commit 266c3b7c160d2248611cc6cef555b9cddcd258f7

Authored by Cristina Muntean
1 parent 60c5d91f

new local param to prepare_plot ..

Showing 1 changed file with 29 additions and 9 deletions   Show diff stats
prepare_plot_scatter_2_distrib.py
... ... @@ -10,11 +10,26 @@ we do a scatterplot
10 10 @autor: cristina muntean
11 11 @date: 28/06/16
12 12 """
  13 +import argparse
13 14 import codecs
14 15 import logging
15 16 import sys
16 17 from collections import defaultdict
17 18  
  19 +
  20 +parser = argparse.ArgumentParser(description='<ix two files with frequencies to discover local topics')
  21 +parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens')
  22 +parser.add_argument('--input2', '-inputFile2', help='the file against which we compare the input1')
  23 +parser.add_argument('--output', '-outputFile', help='the files in which we save the two distributions of values for each token')
  24 +parser.add_argument('--col', '-columnNumber', type=int, help='the value correspnding to which column in the feature file; '
  25 + 'index starts at 1')
  26 +parser.add_argument('--k', '-topKtokensInInput1', type=int, default=-1, help='negative value if we want to look at the while file; '
  27 + 'positive if we want to stop before; default -1.')
  28 +parser.add_argument('--coef', '-localityCoeficient', type=float, help='the difference between the two frequencies so as '
  29 + 'to be local e.g A = 0.25 * B')
  30 +parser.add_argument('--minfreq', '-mininumWordCountFrequency', type=int, help='')
  31 +
  32 +
18 33 def readFromFileWordcount(filename):
19 34 max_in_distrib = 0
20 35 docList = list()
... ... @@ -80,14 +95,15 @@ if __name__ == '__main__':
80 95 logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
81 96 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")
82 97  
83   - if len(sys.argv) != 6:
84   - print "You need to pass the following 5 params: <inputFile1-SORTED-ON-COL> <inputFile2> <columnNumber> <k> <data-file>"
85   - sys.exit(-1)
86   - inputFile1 = sys.argv[1]
87   - inputFile2 = sys.argv[2]
88   - columnNumber = int(sys.argv[3])
89   - k = int(sys.argv[4])
90   - dataFile = codecs.open(sys.argv[5], "w", "utf8")
  98 + args = parser.parse_args()
  99 + print args
  100 +
  101 + inputFile1 = args.input1
  102 + inputFile2 = args.input2
  103 + columnNumber = args.col
  104 + k = args.k
  105 + localCoef = args.coef
  106 + dataFile = codecs.open(args.output, "w", "utf8")
91 107  
92 108 # read distributions!
93 109 a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber) # sorted
... ... @@ -96,6 +112,10 @@ if __name__ == '__main__':
96 112 print len(a), len(b)
97 113 print "Maxes: ", a_max, b_max
98 114  
  115 + # check for k
  116 + if k < 0:
  117 + k = len(a)
  118 +
99 119 # make b a default dict as we search for elements from a
100 120 bDict = {rows[0]: int(rows[1]) for rows in b}
101 121 bDict = defaultdict(int, bDict)
... ... @@ -110,7 +130,7 @@ if __name__ == '__main__':
110 130 for word, counter in a[:k]:
111 131 X.append(int(counter)/a_max)
112 132 Y.append(int(bDict[word])/b_max)
113   - if int(bDict[word]) < 0.25 * float(counter):
  133 + if int(bDict[word]) < localCoef * float(counter):
114 134 local_topics.append(word)
115 135 labels = [row[0] for row in a[:k]]
116 136  
... ...