Blame view

prepare_plot_scatter_2_distrib.py 5.41 KB
fa82b72e   Cristina Muntean   plot utility
1
2
3
  """
  local-twitter
  
9bb47579   Cristina Muntean   normalizing distr...
4
  We read 2 distributions from 2 different files
fa82b72e   Cristina Muntean   plot utility
5
6
7
8
9
10
11
12
  from the first one we take the top k topics, so we read the file up to an index;
  then we search for each ngram in the second file and take its frequency there;
  finally we save both normalized values so they can be drawn as a scatterplot
  
  
  @author: cristina muntean
  @date: 28/06/16
  """
266c3b7c   Cristina Muntean   new local param t...
13
  import argparse
fa82b72e   Cristina Muntean   plot utility
14
15
  import codecs
  import logging
fa82b72e   Cristina Muntean   plot utility
16
17
  from collections import defaultdict
  
769fad15   Cristina Muntean   minor
18
  # Stop words are loaded once at import time, one entry per line.
  # splitlines() accepts both LF and CRLF line endings: splitting on '\r\n' only
  # turned an LF-terminated list into a single giant entry, which silently
  # disabled the stop-word filter. The context manager closes the handle
  # instead of leaking it.
  with codecs.open('./resources/stop-word-list.txt', 'r', 'utf-8') as stopword_file:
      stopwords = stopword_file.read().splitlines()
cd0b644d   Cristina Muntean   minor
19
  
266c3b7c   Cristina Muntean   new local param t...
20
21
  
  # Command-line interface of the script. The arguments the script cannot run
  # without are marked as required so that a missing one is reported by argparse
  # as a usage error instead of surfacing later as a TypeError.
  parser = argparse.ArgumentParser(description='Mix two files with frequencies to discover local topics')
  parser.add_argument('--input1', '-inputFile1', required=True,
                      help='the first input file, usually the locality seeding file for tokens, '
                           'often SORTED per desired column')
  parser.add_argument('--input2', '-inputFile2', required=True,
                      help='the file against which we compare the input1')
  parser.add_argument('--output', '-outputFile', required=True,
                      help='the file in which we save the two distributions of values for each token')
  parser.add_argument('--col', '-columnNumber', type=int, required=True,
                      help='the value corresponding to which column in the feature file; '
                           'index starts at 1')
  parser.add_argument('--k', '-topKtokensInInput1', type=int, default=-1,
                      help='negative value if we want to look at the whole file; '
                           'positive if we want to stop before; default -1.')
  # --coef stays optional: it is only needed to list the local topics.
  parser.add_argument('--coef', '-localityCoeficient', type=float,
                      help='the difference between the two frequencies so as '
                           'to be local e.g A = 0.25 * B')
266c3b7c   Cristina Muntean   new local param t...
32
33
  
  
5902e36c   Cristina Muntean   changes to plot
34
  def readFromFileWordcount(filename):
      '''
      Read a two-column, tab-separated "word<TAB>count" file.

      Lines that do not consist of exactly two tab-separated fields are skipped.

      :param filename: path of the UTF-8 encoded file to read
      :return: a tuple (docList, max_in_distrib): docList is the list of
               (word, count) tuples in file order, max_in_distrib is the largest
               count that was read, as a float (0.0 when no valid line was found)
      '''
      docList = list()

      # the context manager closes the file even when a count fails to parse
      with codecs.open(filename, "r", "utf-8") as f:
          for line in f:
              fields = line.replace("\n", "").split("\t")
              if len(fields) == 2:
                  word, counter = fields
                  docList.append((word, int(counter)))

      # Take the maximum over every row instead of trusting the first line: the
      # result is then correct for unsorted files and for files whose first line
      # is malformed, like the maximum returned by readFromFileMultipleEdges.
      max_in_distrib = max(count for _, count in docList) if docList else 0
      return docList, float(max_in_distrib)
fa82b72e   Cristina Muntean   plot utility
52
53
  
  
5902e36c   Cristina Muntean   changes to plot
54
55
56
57
58
59
  def readFromFileMultipleEdges(filename, columnNumber):
      '''
      Read the word and one count column from a tab-separated feature file.

      Example row:
      124    	like   	1938   	1261   	1212   	205    	933    	66     	54     	53     	9      	1

      Meaning: self.id, self.description, self.nodeCount, self.edgeCount, self.mentionCount, self.replyCount,
      self.RTCount, self.innerRTCount, self.outerRTCount, self.quoteCount, self.innerQuoteCount, self.outerQuoteCount

      The first line is assumed to be well formed and fixes the expected number of
      columns; later lines with a different number of columns are reported on
      stdout and skipped. The word is always taken from the second column.

      :param filename: path of the UTF-8 encoded, tab-separated file to read
      :param columnNumber: 1-based index of the column holding the count to extract
      :return: a tuple (docList, max_in_distrib): docList holds the (word, count)
               tuples, in file order, of the rows whose word is not a stop word and
               whose count is positive; max_in_distrib is the largest count over all
               accepted rows (stop words included), as a float, or 0.0 for an
               empty file
      '''
      docList = list()
      numCols = None
      max_in_distrib = None
      # build the lookup set once: testing against the module-level list is O(n) per row
      stopwordSet = set(stopwords)

      # the context manager closes the file even when a row fails to parse
      with codecs.open(filename, "r", "utf-8") as f:
          for line in f:
              rowDataCols = line.replace("\n", "").split("\t")
              if numCols is None:
                  # we assume the first line of the file is correct and we save the
                  # number of columns from there
                  numCols = len(rowDataCols)
              elif len(rowDataCols) != numCols:
                  # single-argument form: prints the same text under Python 2 and 3
                  print("Num columns out of range " + line)
                  continue

              word = rowDataCols[1]
              # the array index starts at 0, so column 9 corresponds to index 8
              counter = int(rowDataCols[columnNumber - 1])

              # keep only meaningful tokens that actually occur (min count 1)
              if word not in stopwordSet and counter > 0:
                  docList.append((word, counter))

              # the maximum also takes the filtered rows into account
              if max_in_distrib is None or counter > max_in_distrib:
                  max_in_distrib = counter

      if max_in_distrib is None:
          # empty file: nothing was read, so there is no distribution
          return docList, 0.0
      return docList, float(max_in_distrib)
5902e36c   Cristina Muntean   changes to plot
95
96
  
  
fa82b72e   Cristina Muntean   plot utility
97
98
  if __name__ == '__main__':
      # Script entry point: read the chosen count column from both input files,
      # normalize each distribution by its own maximum and write one
      # "x<TAB>y<TAB>token" line per selected token of the first file.

      logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
      logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

      args = parser.parse_args()
      print(args)

      inputFile1 = args.input1
      inputFile2 = args.input2
      columnNumber = args.col
      k = args.k
      localCoef = args.coef

      # read distributions!
      a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber)  # sorted
      # this could be improved by keeping only the tokens that are present in a
      b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber)
      print("%d %d" % (len(a), len(b)))
      print("Maxes:  %s %s" % (a_max, b_max))

      # a negative k means "use the whole first file"
      if k < 0:
          k = len(a)
      topK = a[:k]

      # index b by token, as we look up the elements of a in it
      bDict = dict(b)

      # the lists of selected, normalized values
      X = list()
      Y = list()
      local_topics = []
      for word, counter in topK:
          bCount = bDict.get(word, 0)  # tokens missing from the second file count as 0
          X.append(counter / a_max)
          # b_max is 0 when the second file has no positive count; avoid dividing by zero
          Y.append(bCount / b_max if b_max else 0.0)
          # a token is local when its count in b is below coef * its count in a;
          # without --coef the locality test is skipped instead of failing
          if localCoef is not None and bCount < localCoef * counter:
              local_topics.append(word)
      labels = [word for word, _ in topK]

      # the tokens are unicode: encode them for printing (decoding a unicode
      # object fails on the first non-ASCII character under Python 2)
      print(u", ".join(local_topics).encode('utf8'))
      print("%d %d %d" % (len(X), len(Y), len(labels)))

      # write the file for the plots; the output is opened only once the data is
      # ready and is closed even if a write fails
      with codecs.open(args.output, "w", "utf8") as dataFile:
          for x, y, label in zip(X, Y, labels):
              # unicode template: a byte-string template cannot format non-ASCII labels
              dataFile.write(u"{}\t{}\t{}\n".format(str(x), str(y), label))
fa82b72e   Cristina Muntean   plot utility