""" local-twitter We read 2 distributions from 2 different files the first 1 - we take k top topics - so we read the file up to and index then we search for the ngram in the second file and take the frequencies we do a scatterplot @autor: cristina muntean @date: 28/06/16 """ import argparse import codecs import logging from collections import defaultdict stopwords = open('./resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n') parser = argparse.ArgumentParser(description=' 0: docList.append(tuple([word, counter])) max_in_distrib = counter # read other lines for line in f: if len(line.split("\t")) == numCols: rowDataCols = line.replace("\n", "").split("\t") word = rowDataCols[1] counter = int(rowDataCols[columnNumber - 1]) ### we cand make a value check - min 1 !!! if word not in stopwords and counter > 0: docList.append(tuple([word, counter])) if counter > max_in_distrib: max_in_distrib = counter else: print "Num columns out of range", line return docList, float(max_in_distrib) if __name__ == '__main__': logger = logging.getLogger("prepare_plot_scatter_2_distrib.py") logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s") args = parser.parse_args() print args inputFile1 = args.input1 inputFile2 = args.input2 columnNumber = args.col k = args.k localCoef = args.coef dataFile = codecs.open(args.output, "w", "utf8") # read distributions! 
# Script body (continuation of the `if __name__ == '__main__':` section):
# load both distributions, normalise the top-k entries of the first one
# against the second, report the "local" topics and dump one
# "x<TAB>y<TAB>label" row per word for the scatter plot.
a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber)  # sorted
# TODO: b could be reduced to only the words that are present in a.
b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber)
print("{} {}".format(len(a), len(b)))
print("Maxes:  {} {}".format(a_max, b_max))

# A negative k means "use every entry of the first distribution".
if k < 0:
    k = len(a)

# word -> count table for the second distribution. A plain dict with
# .get() is used so that looking up a missing word does not insert it.
b_counts = {row[0]: int(row[1]) for row in b}

# Values selected for the plot, plus the words considered "local".
X = []
Y = []
labels = []
local_topics = []
for word, counter in a[:k]:
    count_a = int(counter)
    count_b = b_counts.get(word, 0)
    # Normalise each count by the maximum of its own distribution.
    X.append(count_a / a_max)
    # b_max is 0 when the second distribution yielded no rows; every word
    # then has a zero count there, so plot it at 0.0 instead of dividing
    # by zero.
    Y.append(count_b / b_max if b_max else 0.0)
    labels.append(word)
    # A word is "local" when its count in the second distribution is below
    # localCoef times its count in the first one.
    if count_b < localCoef * float(count_a):
        local_topics.append(word)

# Print the joined text directly: calling .decode() on already-decoded text
# triggers an implicit ASCII encode and fails on non-ASCII words.
print(", ".join(local_topics))
print("{} {} {}".format(len(X), len(Y), len(labels)))

# write files for plots
# `with` closes the output file even if a write fails. The unicode template
# keeps non-ASCII labels intact until the UTF-8 writer encodes them.
with dataFile:
    for x, y, label in zip(X, Y, labels):
        dataFile.write(u"{}\t{}\t{}\n".format(str(x), str(y), label))