"""
local-twitter

We read 2 distributions from 2 different files
the first 1 - we take k top topics - so we read the file up to and index
then we search for the ngram in the second file and take the frequencies
we do a scatterplot


@autor: cristina muntean
@date: 28/06/16
"""
import argparse
import codecs
import logging
import os
import sys
from collections import defaultdict

# stop words are stored one per line (CRLF-separated); a set gives O(1) membership tests
with codecs.open('/resources/stop-word-list.txt', 'r', 'utf-8') as stop_file:
    stopwords = set(stop_file.read().split('\r\n'))


parser = argparse.ArgumentParser(description='Mix two files with frequencies to discover local topics')
parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens, '
                                                    'often SORTED by the desired column')
parser.add_argument('--input2', '-inputFile2', help='the file against which we compare input1')
parser.add_argument('--output', '-outputFile', help='the file in which we save the two distributions of values for each token')
parser.add_argument('--col', '-columnNumber', type=int, help='the value corresponding to which column in the feature file; '
                                                             'index starts at 1')
parser.add_argument('--k', '-topKtokensInInput1', type=int, default=-1, help='negative value if we want to look at the whole file; '
                                                                 'positive if we want to stop before; default -1.')
parser.add_argument('--coef', '-localityCoefficient', type=float, help='the difference between the two frequencies required '
                                                                       'for a token to be local, e.g. A = 0.25 * B')


def readFromFileWordcount(filename):
    """
    Read a tab-separated "word<TAB>count" file into a list of (word, count) tuples.
    The file is assumed sorted by count descending, so the first line carries the max.
    :return: (docList, max_in_distrib as float)
    """
    max_in_distrib = 0
    docList = list()
    f = codecs.open(filename, "r", "utf-8")
    first_line = f.readline()

    # read first line to get max
    if len(first_line.split("\t")) == 2:
        word, counter = first_line.replace("\n", "").split("\t")
        docList.append(tuple([word, int(counter)]))
        max_in_distrib = int(counter)

    # read other lines
    for line in f:
        if len(line.split("\t")) == 2:
            word, counter = line.replace("\n", "").split("\t")
            docList.append(tuple([word, int(counter)]))
    return docList, float(max_in_distrib)
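# Illustrative use of readFromFileWordcount (file name and contents are hypothetical):
#   docs, max_count = readFromFileWordcount("wordcounts.tsv")
# For a file whose first line is u"hello\t1938", docs starts with (u"hello", 1938)
# and max_count is 1938.0.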


def readFromFileMultipleEdges(filename, columnNumber):
    '''
    Read a tab-separated feature file whose rows look like (tabs shown as spaces):

    124    like    1938    1261    1212    205    933    66    54    53    9    1

    Meaning: self.id, self.description, self.nodeCount, self.edgeCount, self.mentionCount,
    self.replyCount, self.RTCount, self.innerRTCount, self.outerRTCount, self.quoteCount,
    self.innerQuoteCount, self.outerQuoteCount

    :param filename: path to the feature file
    :param columnNumber: 1-based index of the column holding the frequency to use
    :return: (docList, max_in_distrib as float)
    '''
    docList = list()
    f = codecs.open(filename, "r", "utf-8")
    first_line = f.readline()

    # we assume the first line of the file is correct and we save the number of columns from there
    rowDataCols = first_line.replace("\n", "").split("\t")
    numCols = len(rowDataCols)

    # parse the first line and initialize the max from it
    word = rowDataCols[1]
    counter = int(rowDataCols[columnNumber-1])  # the index starts at 0 so column 9 corresponds to 8 in the array index
    if word not in stopwords and counter > 0:
        docList.append(tuple([word, counter]))
    max_in_distrib = counter

    # read other lines
    for line in f:
        if len(line.split("\t")) == numCols:
            rowDataCols = line.replace("\n", "").split("\t")
            word = rowDataCols[1]
            counter = int(rowDataCols[columnNumber - 1])
            # value check: keep only tokens that appear at least once (count > 0)
            if word not in stopwords and counter > 0:
                docList.append(tuple([word, counter]))
            if counter > max_in_distrib:
                max_in_distrib = counter
        else:
            print "Skipping line with unexpected number of columns:", line
    return docList, float(max_in_distrib)
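# Illustrative use of readFromFileMultipleEdges (file name is hypothetical):
#   docs, max_count = readFromFileMultipleEdges("edge_stats.tsv", 9)
# For the sample row in the docstring above, column 9 (1-based) holds 54, so that
# row contributes (u"like", 54) to docs.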


if __name__ == '__main__':

    logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    args = parser.parse_args()
    print args

    inputFile1 = args.input1
    inputFile2 = args.input2
    columnNumber = args.col
    k = args.k
    localCoef = args.coef
    dataFile = codecs.open(args.output, "w", "utf8")

    # read the two distributions
    a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber)  # sorted by the chosen column
    # possible improvement: keep from b only the tokens that also appear in a
    b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber)
    print len(a), len(b)
    print "Maxes: ", a_max, b_max

    # a negative k means we use the whole file
    if k < 0:
        k = len(a)

    # store b in a defaultdict: tokens missing from input2 come back with frequency 0
    bDict = defaultdict(int, {word: counter for word, counter in b})

    # initialize the lists of elements selected
    X = list()
    Y = list()

    # populate the lists with the top-k tokens, normalizing each count by its
    # distribution's max, and collect the tokens that pass the locality check
    local_topics = []
    for word, counter in a[:k]:
        X.append(counter / a_max)
        Y.append(bDict[word] / b_max)
        if bDict[word] < localCoef * counter:
            local_topics.append(word)
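    # Worked example of the locality check (hypothetical numbers): with
    # --coef 0.25, a token counted 1000 times in input1 is kept as "local"
    # only if its input2 count is below 0.25 * 1000 = 250.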
    labels = [row[0] for row in a[:k]]

    print u", ".join(local_topics).encode('utf-8')
    print len(X), len(Y), len(labels)

    # write the data file for the plot; the unicode format string keeps
    # non-ASCII labels safe with the codecs writer
    for (x, y, label) in zip(X, Y, labels):
        dataFile.write(u"{}\t{}\t{}\n".format(x, y, label))
    dataFile.close()
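# Each output line is "x<TAB>y<TAB>label" with x and y normalized to [0, 1],
# ready to be read back by a separate scatterplot script (not part of this file).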