Blame view

merge_wordcount_with_edge_features.py 4.28 KB
286e4ed5   Cristina Muntean   merging functiona...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
  #!/usr/bin/env python
  '''
  local-twitter : merge_wordcount_with_edge_features
  @author: Cristina Muntean (cristina.muntean@isti.cnr.it)
  @date: 9/21/16
  -----------------------------
  
  
  We now have two options for merging:
  1. give a list of files to merge - 2 files?
  2. give a name of a city and a file (which shouldn't be merged) and merge the other 9
  
  
  '''
  import argparse
  import codecs
  import os
  import sys
3714f1d3   Cristina Muntean   minor
19
  from graph.Node import Node
286e4ed5   Cristina Muntean   merging functiona...
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  
  # Command-line interface of the merge script. The option values end up on the
  # parsed namespace as `files`, `o`, `c` and `out`.
  parser = argparse.ArgumentParser(description='Merge some files.')
  parser.add_argument('files', metavar='ListOfFiles', nargs='+',
                      help='a list of files to be merged')
  parser.add_argument('--o', '-option', choices=['whitelist', 'blacklist'],
                      help='whitelist: merge all the given files; '
                           'blacklist: merge the other files found next to '
                           'the single given file, leaving out the city '
                           'named with --c (used when no option is given)')
  parser.add_argument('--c', '-city', nargs='?',
                      help='name of the city to leave out of a blacklist merge')
  parser.add_argument('--out', '-outputFilename',
                      help='file the merged nodes are written to')
  
  
  def updateNode(existing_node, temp_node):
      """
      Fold the counters of temp_node into existing_node.

      Every counter read from temp_node is handed to the matching increment
      method of existing_node, in a fixed order; temp_node is only read.
      :param existing_node: the node that receives the increments
      :param temp_node: the node whose counters are read
      :return: existing_node, after all increment methods have been called
      """
      # (increment method on existing_node, counter attribute on temp_node)
      counter_pairs = (
          ('incrementNode', 'nodeCount'),
          ('incrementEdge', 'edgeCount'),
          ('incMention', 'mentionCount'),
          ('incReply', 'replyCount'),
          ('incRT', 'RTCount'),
          ('incInnerRT', 'innerRTCount'),
          ('incOuterRT', 'outerRTCount'),
          ('incQuote', 'quoteCount'),
          ('incInnerQuote', 'innerQuoteCount'),
          ('incOuterQuote', 'outerQuoteCount'),
      )
      for method_name, counter_name in counter_pairs:
          increment = getattr(existing_node, method_name)
          increment(getattr(temp_node, counter_name))
      return existing_node
  
  
  def mergeWhitelist(fileList):
      """
      Merge the nodes read from every file in fileList into one dictionary.

      Each line of each file is parsed with Node.parseString. The first node
      seen for a description is stored as is; every later node with the same
      description is folded into the stored one with updateNode.
      :param fileList: list of paths of the UTF-8 files to be merged
      :return: a dictionary mapping each node description to its merged node
               (soon to be dumped to a file)
      """

      tokenDict = dict()
      for cityFilename in fileList:
          # the with-block closes each input file as soon as it has been read,
          # so merging many files does not pile up open handles
          with codecs.open(cityFilename, "r", "utf-8") as inputFile:
              for line in inputFile:
                  node = Node.parseString(line)
                  if node.description in tokenDict:
                      # already known: add the new counts to the stored node
                      updateNode(tokenDict[node.description], node)
                  else:
                      tokenDict[node.description] = node
      return tokenDict
  
  
  def mergeBlackList(city, cityFile):
      """
      Given a city and a file with the features for that city, merge the files
      of the same type that sit in the same directory and belong to other
      cities.

      The file type is the base name of cityFile with the city name removed.
      A directory entry is merged when its name contains that file type and
      does not contain the city name.

      :param city: a string with the name of the city NOT to be included in the merge
      :param cityFile: the file of features for that city
      :return: a dictionary mapping each node description to its merged node
      """
      tokenDict = dict()

      fileDir = os.path.dirname(os.path.realpath(cityFile))
      filetype = os.path.basename(cityFile).replace(city, "")
      for filename in os.listdir(fileDir):
          if city in filename or filetype not in filename:
              continue
          # os.listdir returns bare names: join them with the directory so the
          # file is found whatever the current working directory is
          filePath = os.path.join(fileDir, filename)
          # the with-block closes each input file as soon as it has been read
          with codecs.open(filePath, "r", "utf-8") as inputFile:
              for line in inputFile:
                  node = Node.parseString(line)
                  if node.description in tokenDict:
                      # already known: add the new counts to the stored node
                      updateNode(tokenDict[node.description], node)
                  else:
                      tokenDict[node.description] = node
      return tokenDict
  
  
  def main():
      """
      Parse the command line, run the requested merge and write the result.

      With the option "whitelist" all the given files are merged. In every
      other case exactly one file and a city are expected, and the merge is
      delegated to mergeBlackList. The merged nodes are written to the output
      file one per line, using the text returned by tabPrint().
      Exits with a non-zero status when the arguments are not usable.
      """
      args = parser.parse_args()
      print(args)

      # validate everything before touching the output file, so a usage error
      # neither creates nor truncates it
      if args.out is None:
          parser.error("an output file is required (--out)")

      if args.o == "whitelist":
          new_node_dict = mergeWhitelist(args.files)
      else:
          if args.c is None:
              print("You need to give the name of the seed city (we merge the remaining 9 cities) using -c")
              sys.exit(1)

          if len(args.files) != 1:
              print("You need to indicate the file ( of the city in -c ) which to exclude from the merge")
              sys.exit(1)

          new_node_dict = mergeBlackList(args.c, args.files[0])

      # the with-block closes the output file even if writing fails
      with codecs.open(args.out, "w", "utf-8") as outputWriter:
          for node in new_node_dict.values():
              # unicode template: under Python 2 a byte-string template would
              # force unicode text through ASCII (assumes tabPrint() returns
              # text - TODO confirm against graph.Node)
              outputWriter.write(u'{}\n'.format(node.tabPrint()))
286e4ed5   Cristina Muntean   merging functiona...
124
125
126
  
  # Run the merge only when this file is executed as a script, not on import.
  if __name__ == '__main__':
      main()