prepare_plot_scatter_2_distrib.py
5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
local-twitter
We read 2 distributions from 2 different files.
From the first one we take the top k topics, so we read the file only up to an index;
then we look up each ngram in the second file and take its frequency there.
Finally we write the paired values out so they can be drawn as a scatterplot.
@author: cristina muntean
@date: 28/06/16
"""
import argparse
import codecs
import logging
import sys
from collections import defaultdict
# Command-line interface of the script. Option names, destinations, types and
# defaults are part of the script's interface; only the help texts are fixed
# here (typos, and missing separators between concatenated string literals).
parser = argparse.ArgumentParser(description='Mix two files with frequencies to discover local topics')
parser.add_argument('--input1', '-inputFile1',
                    help='the first input file, usually the locality seeding file for tokens, '
                         'often SORTED per desired column')
parser.add_argument('--input2', '-inputFile2',
                    help='the file against which we compare the input1')
parser.add_argument('--output', '-outputFile',
                    help='the file in which we save the two distributions of values for each token')
parser.add_argument('--col', '-columnNumber', type=int,
                    help='the value corresponding to which column in the feature file; '
                         'index starts at 1')
parser.add_argument('--k', '-topKtokensInInput1', type=int, default=-1,
                    help='negative value if we want to look at the whole file; '
                         'positive if we want to stop before; default -1.')
parser.add_argument('--coef', '-localityCoeficient', type=float,
                    help='the difference between the two frequencies so as '
                         'to be local e.g A = 0.25 * B')
def readFromFileWordcount(filename):
    """Read a two-column ``word<TAB>count`` file.

    Lines that do not consist of exactly two tab-separated fields are
    skipped. The file is decoded as UTF-8 and is closed before returning.

    :param filename: path of the word-count file
    :return: a tuple ``(docList, max_in_distrib)`` where ``docList`` is a list
        of ``(word, count)`` tuples in file order and ``max_in_distrib`` is
        the largest count as a float (``0.0`` when no valid line was read)
    :raises ValueError: if the count field of a two-column line is not an integer
    """
    docList = []
    # The context manager guarantees the handle is released even if int() fails.
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f:
            fields = line.replace("\n", "").split("\t")
            if len(fields) != 2:
                continue
            word, counter = fields
            docList.append((word, int(counter)))

    # Compute the real maximum instead of trusting the first line, so the
    # result is also right for unsorted input or a malformed first line.
    max_in_distrib = max(count for _, count in docList) if docList else 0
    return docList, float(max_in_distrib)
def readFromFileMultipleEdges(filename, columnNumber):
    """Read one numeric column of a tab-separated feature file.

    Example row (as given by the original author):
        124 like 1938 1261 1212 205 933 66 54 53 9 1
    Meaning: self.id, self.description, self.nodeCount, self.edgeCount,
    self.mentionCount, self.replyCount, self.RTCount, self.innerRTCount,
    self.outerRTCount, self.quoteCount, self.innerQuoteCount,
    self.outerQuoteCount

    The second field of each row is used as the word, and the field at
    ``columnNumber`` (1-based) as its integer value. The first non-blank line
    fixes the expected number of columns; later lines with a different number
    of columns are reported through ``logging`` and skipped. Blank lines are
    ignored. The file is decoded as UTF-8 and is closed before returning.

    :param filename: path of the feature file
    :param columnNumber: 1-based index of the column holding the value
    :return: a tuple ``(docList, max_in_distrib)`` where ``docList`` is a list
        of ``(word, value)`` tuples in file order and ``max_in_distrib`` is
        the largest value as a float (``0.0`` when the file has no data rows)
    :raises ValueError: if ``columnNumber`` is smaller than 1, or the selected
        field is not an integer
    :raises IndexError: if a row has fewer than two fields or fewer than
        ``columnNumber`` fields
    """
    if columnNumber < 1:
        # A value of 0 or below would silently index from the end of the row.
        raise ValueError("columnNumber is 1-based and must be >= 1, got %r" % (columnNumber,))

    docList = []
    numCols = None
    max_in_distrib = None
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f:
            if not line.strip():
                # e.g. a trailing empty line; nothing to parse or report
                continue
            rowDataCols = line.rstrip("\r\n").split("\t")
            if numCols is None:
                # we assume the first line of the file is correct and we save
                # the number of columns from there
                numCols = len(rowDataCols)
            elif len(rowDataCols) != numCols:
                logging.warning("Num columns out of range %s", line.rstrip("\r\n"))
                continue

            word = rowDataCols[1]
            # the array index starts at 0, so column 9 corresponds to index 8
            counter = int(rowDataCols[columnNumber - 1])
            docList.append((word, counter))
            if max_in_distrib is None or counter > max_in_distrib:
                max_in_distrib = counter

    if max_in_distrib is None:
        max_in_distrib = 0
    return docList, float(max_in_distrib)
if __name__ == '__main__':
    # Script entry point.
    # 1. read the same column from both feature files,
    # 2. take the first k tokens of input1 and look up their value in input2,
    # 3. normalise both values by the maximum of their own file,
    # 4. print the tokens considered "local" and write one
    #    "x<TAB>y<TAB>token" line per token to the output file.
    # The statements below are written so that they behave the same under
    # Python 2 and Python 3.
    logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    args = parser.parse_args()
    print(args)

    # Fail early with a usage message instead of an obscure TypeError later.
    for option in ("input1", "input2", "output", "col", "coef"):
        if getattr(args, option) is None:
            parser.error("argument --%s is required" % option)
    if args.col < 1:
        parser.error("--col is 1-based and must be >= 1")

    inputFile1 = args.input1
    inputFile2 = args.input2
    columnNumber = args.col
    k = args.k
    localCoef = args.coef

    # read distributions!
    a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber)  # sorted
    # we can improve this and keep only the tokens that are present in a
    b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber)
    print("%d %d" % (len(a), len(b)))
    print("Maxes:  %s %s" % (a_max, b_max))

    # a negative k means "use the whole of input1"
    if k < 0:
        k = len(a)
    top_a = a[:k]

    # index b by token; tokens missing from b count as 0
    bDict = {word: int(count) for word, count in b}

    X = []
    Y = []
    local_topics = []
    for word, counter in top_a:
        counter = int(counter)
        b_counter = bDict.get(word, 0)
        # normalise by the file maximum; a maximum of 0 would otherwise
        # raise ZeroDivisionError
        X.append(counter / a_max if a_max else 0.0)
        Y.append(b_counter / b_max if b_max else 0.0)
        if b_counter < localCoef * float(counter):
            local_topics.append(word)
    labels = [word for word, _ in top_a]

    topics_line = u", ".join(local_topics)
    if sys.version_info[0] < 3:
        # On Python 2 encode explicitly, so that non-ASCII tokens can be
        # printed even when stdout has no encoding (e.g. when piped).
        topics_line = topics_line.encode("utf8")
    print(topics_line)
    print("%d %d %d" % (len(X), len(Y), len(labels)))

    # write files for plots
    # The output is opened only once the data is ready, so a failure while
    # reading the inputs no longer leaves a truncated output file behind; the
    # context manager closes it even if a write fails.
    with codecs.open(args.output, "w", "utf8") as dataFile:
        for x, y, label in zip(X, Y, labels):
            # unicode template: a byte-string template cannot format
            # non-ASCII labels on Python 2
            dataFile.write(u"{}\t{}\t{}\n".format(str(x), str(y), label))
    logger.info("Wrote %d rows to %s", len(X), args.output)