# prepare_plot_scatter_2_distrib.py
"""
local-twitter

We read 2 distributions from 2 different files
the first one - we take the k top topics - so we read the file up to an index
then we search for the ngram in the second file and take the frequencies
we do a scatterplot


@author: cristina muntean
@date: 28/06/16
"""
import codecs
import logging
import sys
from collections import defaultdict

def readFromFileWordcount(filename):
    """Read a tab-separated "word<TAB>count" wordcount file.

    The file is assumed to be sorted descending on the count, so the count on
    the first valid line is taken as the maximum of the distribution. Lines
    that do not have exactly two tab-separated fields are skipped.

    :param filename: path of the tab-separated wordcount file
    :return: tuple (docList, max_in_distrib) where docList is a list of
             (word, count) tuples and max_in_distrib is the first line's
             count as a float (0.0 if the first line is malformed)
    """
    max_in_distrib = 0
    docList = []
    # with-statement guarantees the handle is closed (the original leaked it)
    with codecs.open(filename, "r", "utf-8") as f:
        first_line = f.readline()

        # read first line to get the max of the distribution
        fields = first_line.replace("\n", "").split("\t")
        if len(fields) == 2:
            word, counter = fields
            docList.append((word, int(counter)))
            max_in_distrib = int(counter)

        # read other lines; split once per line instead of twice
        for line in f:
            fields = line.replace("\n", "").split("\t")
            if len(fields) == 2:
                word, counter = fields
                docList.append((word, int(counter)))
    return docList, float(max_in_distrib)


def readFromFileMultipleEdges(filename, columnNumber):
    '''
    Read a multi-column tab-separated edge-statistics file, e.g.:

    124    	like   	1938   	1261   	1212   	205    	933    	66     	54     	53     	9      	1

    Meaning: self.id, self.description, self.nodeCount, self.edgeCount, self.mentionCount, self.replyCount,
    self.RTCount, self.innerRTCount, self.outerRTCount, self.quoteCount, self.innerQuoteCount, self.outerQuoteCount

    The first line is assumed to be well formed: it fixes the expected number
    of columns and, since the file is sorted on the requested column, its
    counter is the maximum of the distribution. Subsequent lines with a
    different number of columns are skipped.

    :param filename: path of the tab-separated statistics file
    :param columnNumber: 1-based column the counter is read from
    :return: tuple (docList, max_in_distrib) where docList is a list of
             (description, counter) tuples and max_in_distrib is the first
             line's counter as a float; ([], 0.0) for an empty file
    '''
    docList = []
    # with-statement guarantees the handle is closed (the original leaked it)
    with codecs.open(filename, "r", "utf-8") as f:
        first_line = f.readline()

        # the first line of the file fixes the expected number of columns
        rowDataCols = first_line.replace("\n", "").split("\t")
        if len(rowDataCols) < 2:
            # empty/degenerate file: the original crashed with IndexError here
            return docList, 0.0
        numCols = len(rowDataCols)

        # first line also provides the max: description is column 2,
        # counter is at the 1-based columnNumber (so index columnNumber - 1)
        word = rowDataCols[1]
        counter = rowDataCols[columnNumber - 1]
        docList.append((word, int(counter)))
        max_in_distrib = int(counter)

        # read other lines; split once per line instead of twice
        for line in f:
            rowDataCols = line.replace("\n", "").split("\t")
            if len(rowDataCols) == numCols:
                docList.append((rowDataCols[1], int(rowDataCols[columnNumber - 1])))
    return docList, float(max_in_distrib)


if __name__ == '__main__':

    logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")

    if len(sys.argv) != 6:
        print "You need to pass the following 5 params: <inputFile1-SORTED-ON-COL> <inputFile2> <columnNumber> <k> <data-file>"
        sys.exit(-1)
    inputFile1 = sys.argv[1]
    inputFile2 = sys.argv[2]
    columnNumber = int(sys.argv[3])
    k = int(sys.argv[4])
    dataFile = codecs.open(sys.argv[5], "w", "utf8")

    # read distributions!
    a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber)  # sorted
    b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber)
    print len(a), len(b)

    # make b a default dict as we search for elements from a
    bDict = {rows[0]: int(rows[1]) for rows in b}
    bDict = defaultdict(int, bDict)

    # initialize the lists of elements selected
    X = list()
    Y = list()

    # populate the lists with the subset of k values
    # now we normalize
    local_topics = []
    for word, counter in a[:k]:
        X.append(int(counter)/a_max)
        Y.append(int(bDict[word])/b_max)
        if int(bDict[word]) < 0.25 * float(counter):
            local_topics.append(word)
    labels = [row[0] for row in a[:k]]

    print local_topics
    print len(X), len(Y), len(labels)

    # write files for plots
    for (x,y,label) in zip(X,Y,labels):
        dataFile.write("{}\t{}\t{}\n".format(str(x),str(y),label))
    dataFile.close()