merge_wordcount_with_edge_features.py 4.28 KB
#!/usr/bin/env python
'''
local-twitter : merge_wordcount_with_edge_features
@author: Cristina Muntean (cristina.muntean@isti.cnr.it)
@date: 9/21/16
-----------------------------


We now have two options for merging:
1. give a list of files to merge - 2 files?
2. give a name of a city and a file (which shouldn't be merged) and merge the other 9


'''
import argparse
import codecs
import os
import sys
from graph.Node import Node

# Command-line interface of the script. The parsed values are read in main()
# as args.files, args.o, args.c and args.out (argparse derives each dest from
# the first long option string).
parser = argparse.ArgumentParser(description='Merge some files.')
parser.add_argument('files', metavar='ListOfFiles', nargs='+',
                    help='a list of files to be merged')
parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist'],
                    help="merge mode: 'whitelist' merges exactly the listed "
                         "files; any other value (or omitting the option) "
                         "merges the files of all cities except the one "
                         "given with the city option")
parser.add_argument('--c', '-city', nargs='?',
                    help='name of the city to leave out of the merge '
                         '(needed when not in whitelist mode)')
# The output file is always written, so fail at parse time with a usage
# message instead of crashing later when the missing name is opened.
parser.add_argument('--out', '-outputFilename', required=True,
                    help='path of the file the merged features are written to')


def updateNode(existing_node, temp_node):
    """
    Fold the counters of a freshly parsed node into an already stored node.

    Each counter attribute read from temp_node is handed to the matching
    increment method of existing_node, in the fixed order listed below.
    :param existing_node: the node that accumulates the values (modified in place)
    :param temp_node: the node whose counters are added; it is only read
    :return: existing_node, the same object that was passed in
    """
    # (increment method on existing_node, counter attribute on temp_node)
    counter_updates = (
        ("incrementNode", "nodeCount"),
        ("incrementEdge", "edgeCount"),
        ("incMention", "mentionCount"),
        ("incReply", "replyCount"),
        ("incRT", "RTCount"),
        ("incInnerRT", "innerRTCount"),
        ("incOuterRT", "outerRTCount"),
        ("incQuote", "quoteCount"),
        ("incInnerQuote", "innerQuoteCount"),
        ("incOuterQuote", "outerQuoteCount"),
    )
    for method_name, counter_name in counter_updates:
        increment = getattr(existing_node, method_name)
        increment(getattr(temp_node, counter_name))
    return existing_node


def mergeWhitelist(fileList):
    """
    Merge the nodes found in the given files into a single dictionary.

    Every line of every file is parsed with Node.parseString. The first node
    seen for a description is stored as is; every later node with the same
    description is folded into the stored one through updateNode.
    :param fileList: iterable of paths of the UTF-8 files to be merged
    :return: a dictionary mapping each node description to its merged node
             (empty when no file contains any line)
    """
    tokenDict = dict()
    for cityFilename in fileList:
        # The context manager closes each input as soon as it has been read,
        # so long file lists do not pile up open file handles.
        with codecs.open(cityFilename, "r", "utf-8") as inputFile:
            for line in inputFile:
                node = Node.parseString(line)
                existing_node = tokenDict.get(node.description)
                if existing_node is None:
                    tokenDict[node.description] = node
                else:
                    # the parsed node is only needed for this update
                    updateNode(existing_node, node)
    return tokenDict


def mergeBlackList(city, cityFile):
    """
    Merge the feature files of every city except the given one.

    The directory containing cityFile is scanned. A file takes part in the
    merge when its name does not contain the city name but does contain the
    "file type", i.e. the base name of cityFile with the city name removed.
    Every line of the selected files is parsed with Node.parseString; nodes
    sharing a description are combined through updateNode.

    :param city: a string with the name of the city NOT to be included in the merge
    :param cityFile: the file of features for that city
    :return: a dictionary mapping each node description to its merged node
    """
    tokenDict = dict()

    fileDir = os.path.dirname(os.path.realpath(cityFile))
    filetype = os.path.basename(cityFile).replace(city, "")
    for filename in os.listdir(fileDir):
        if city in filename or filetype not in filename:
            continue
        # os.listdir returns bare names: join them with the scanned directory
        # so the merge also works when it is not the current working directory.
        path = os.path.join(fileDir, filename)
        if not os.path.isfile(path):
            # a sub-directory with a matching name cannot be read as a file
            continue
        with codecs.open(path, "r", "utf-8") as inputFile:
            for line in inputFile:
                node = Node.parseString(line)
                existing_node = tokenDict.get(node.description)
                if existing_node is None:
                    tokenDict[node.description] = node
                else:
                    # the parsed node is only needed for this update
                    updateNode(existing_node, node)
    return tokenDict


def main():
    """
    Parse the command line, merge the requested files and write the result.

    With the option "whitelist" the listed files are merged by mergeWhitelist.
    In every other case a city name and exactly one file are required and
    mergeBlackList merges the remaining files. One line per merged node (its
    tabPrint() output) is written to the UTF-8 output file.

    The arguments are validated before the output file is touched, and a
    usage problem ends the script with exit status 1.
    """
    args = parser.parse_args()
    # single-argument print(...) behaves the same as the print statement
    print(args)

    if args.out is None:
        print("You need to give the name of the output file using --out")
        sys.exit(1)

    use_whitelist = args.o == "whitelist"
    if not use_whitelist:
        if args.c is None:
            print("You need to give the name of the seed city (we merge the remaining 9 cities) using -c")
            sys.exit(1)

        if len(args.files) != 1:
            print("You need to indicate the file ( of the city in -c ) which to exclude from the merge")
            sys.exit(1)

    # The output is opened (and truncated) before merging on purpose: the
    # blacklist merge scans a directory by file name, so an older output file
    # stored next to the inputs must not be read back with stale content.
    with codecs.open(args.out, "w", "utf-8") as outputWriter:
        if use_whitelist:
            new_node_dict = mergeWhitelist(args.files)
        else:
            new_node_dict = mergeBlackList(args.c, args.files[0])

        for node in new_node_dict.values():
            outputWriter.write('{}\n'.format(node.tabPrint()))


# Run the merge only when the file is executed as a script.
if __name__ == '__main__':
    main()