Commit beb513b5b11b00d80056923730ce773709ed1fb9

Authored by Cristina Muntean
1 parent 47b06752

some new files

prepare_plot_3D.py 0 → 100644
  1 +"""
  2 +local-twitter
  3 +
  4 +We read 2 distributions from 2 different files
  5 +the first 1 - we take k top topics - so we read the file up to and index
  6 +then we search for the ngram in the second file and take the frequencies
  7 +we do a scatterplot
  8 +
  9 +
  10 +@author: cristina muntean
  11 +@date: 28/06/16
  12 +"""
  13 +import argparse
  14 +import codecs
  15 +import logging
  16 +from collections import defaultdict
  17 +
# Module-level stopword list: one word per line (CRLF-separated), decoded as UTF-8.
# Use a with-block so the file handle is closed instead of leaked (the original
# left the handle open for the life of the process).
with open('./resources/stop-word-list.txt', 'r') as _stopwords_file:
    stopwords = _stopwords_file.read().decode('utf-8').split('\r\n')
  19 +
  20 +
# Command-line interface: mixes two frequency files to discover "local" topics.
# Flag names are unchanged; only garbled help/description text is repaired.
parser = argparse.ArgumentParser(description='Mix two files with frequencies to discover local topics')
parser.add_argument('--input1', '-inputFile1', help='the first input file, usually the locality seeding file for tokens, '
                                                    'often SORTED per desired column')
parser.add_argument('--input2', '-inputFile2', help='the file against which we compare the input1')
parser.add_argument('--output', '-outputFile', help='the file in which we save the two distributions of values for each token')
parser.add_argument('--col', '-columnNumber', type=int, help='the value corresponding to which column in the feature file; '
                                                             'index starts at 1')
parser.add_argument('--k', '-topKtokensInInput1', type=int, default=-1, help='negative value if we want to look at the whole file; '
                                                                             'positive if we want to stop before; default -1.')
parser.add_argument('--coef', '-localityCoeficient', type=float, help='the difference between the two frequencies so as '
                                                                      'to be local e.g A = 0.25 * B')
  33 +
def readFromFileWordcount(filename):
    """Read a "token<TAB>count" file into a list of (token, count) pairs.

    The file is assumed to be sorted by count in descending order, so the
    maximum of the distribution is taken from the first well-formed line only.
    Lines that do not split into exactly two tab-separated fields are skipped.

    :param filename: path to a UTF-8 encoded, tab-separated input file
    :return: (list of (token, int count) tuples, float maximum count;
              0.0 when the first line is malformed)
    """
    max_in_distrib = 0
    docList = []
    # with-block guarantees the handle is closed (the original leaked it)
    with codecs.open(filename, "r", "utf-8") as f:
        first_line = f.readline()

        # read first line to get the max of the distribution
        if len(first_line.split("\t")) == 2:
            word, counter = first_line.replace("\n", "").split("\t")
            docList.append((word, int(counter)))
            max_in_distrib = int(counter)

        # read remaining lines
        for line in f:
            if len(line.split("\t")) == 2:
                word, counter = line.replace("\n", "").split("\t")
                docList.append((word, int(counter)))
    return docList, float(max_in_distrib)
  52 +
  53 +
  54 +def readFromFileMultipleEdges(filename, columnNumber):
  55 + '''
  56 + 124 like 1938 1261 1212 205 933 66 54 53 9 1
  57 +
  58 + Meaning: self.id, self.description, self.nodeCount, self.edgeCount, self.mentionCount, self.replyCount,
  59 + self.RTCount, self.innerRTCount, self.outerRTCount, self.quoteCount, self.innerQuoteCount, self.outerQuoteCount
  60 +
  61 +
  62 + :param filename:
  63 + :param columnNumber:
  64 + :return:
  65 + '''
  66 + docList = list()
  67 + f = codecs.open(filename, "r", "utf-8")
  68 + first_line = f.readline()
  69 +
  70 + # we assume the first line of the file is correct and we save the number of columns from there
  71 + rowDataCols = first_line.replace("\n", "").split("\t")
  72 + numCols = len(rowDataCols)
  73 +
  74 + # read first line to get max
  75 + word = rowDataCols[1]
  76 + counter = int(rowDataCols[columnNumber-1]) # the index starts at 0 so column 9 corresponds to 8 in the array index
  77 + if word not in stopwords and counter > 0:
  78 + docList.append(tuple([word, counter]))
  79 + max_in_distrib = counter
  80 +
  81 + # read other lines
  82 + for line in f:
  83 + if len(line.split("\t")) == numCols:
  84 + rowDataCols = line.replace("\n", "").split("\t")
  85 + word = rowDataCols[1]
  86 + counter = int(rowDataCols[columnNumber - 1])
  87 + ### we cand make a value check - min 1 !!!
  88 + if word not in stopwords and counter > 0:
  89 + docList.append(tuple([word, counter]))
  90 + if counter > max_in_distrib:
  91 + max_in_distrib = counter
  92 + else:
  93 + print "Num columns out of range", line
  94 + return docList, float(max_in_distrib)
  95 +
  96 +
  97 +if __name__ == '__main__':
  98 +
  99 + logger = logging.getLogger("prepare_plot_scatter_2_distrib.py")
  100 + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s;%(levelname)s;%(message)s")
  101 +
  102 + args = parser.parse_args()
  103 + print args
  104 +
  105 + inputFile1 = args.input1
  106 + inputFile2 = args.input2
  107 + columnNumber = args.col
  108 + k = args.k
  109 + localCoef = args.coef
  110 + dataFile = codecs.open(args.output, "w", "utf8")
  111 +
  112 + # read distributions!
  113 + a, a_max = readFromFileMultipleEdges(inputFile1, columnNumber) # sorted
  114 + b, b_max = readFromFileMultipleEdges(inputFile2, columnNumber) ### we can improve this and keep only the ones
  115 + # present in a list
  116 + print len(a), len(b)
  117 + print "Maxes: ", a_max, b_max
  118 +
  119 + # check for k
  120 + if k < 0:
  121 + k = len(a)
  122 +
  123 + # make b a default dict as we search for elements from a
  124 + bDict = {rows[0]: int(rows[1]) for rows in b}
  125 + bDict = defaultdict(int, bDict)
  126 +
  127 + # initialize the lists of elements selected
  128 + X = list()
  129 + Y = list()
  130 +
  131 + # populate the lists with the subset of k values
  132 + # now we normalize
  133 + local_topics = []
  134 + for word, counter in a[:k]:
  135 + X.append(int(counter)/a_max)
  136 + Y.append(int(bDict[word])/b_max)
  137 + if int(bDict[word]) < localCoef * float(counter):
  138 + local_topics.append(word)
  139 + labels = [row[0] for row in a[:k]]
  140 +
  141 + print unicode(", ".join(local_topics)).decode('utf8')
  142 + print len(X), len(Y), len(labels)
  143 +
  144 + # write files for plots
  145 + for (x,y,label) in zip(X,Y,labels):
  146 + dataFile.write("{}\t{}\t{}\n".format(str(x),str(y),label))
  147 + dataFile.close()
  148 +
  149 +
... ...
scripts/compute-new-features.sh 0 → 100644
#!/bin/bash

INPUTFILE=$1
# BUGFIX: was `basename $FILENAME`, but $FILENAME is never set — the input
# path arrives in $INPUTFILE. Takes the city name and aggregate type,
# e.g. "seattle_3of3".
OUTPUT_NAME=`basename "$INPUTFILE" | cut -d'.' -f1`

# Print column 2 and the ratio $8/($9+$8+1) for each line of the input file
# (the original awk call was given no input and would hang reading stdin).
awk '{ print $2, ($8)/($9+$8+1) }' "$INPUTFILE"
0 8 \ No newline at end of file
... ...
simple_plot_scatter_2_distrib.py
... ... @@ -61,9 +61,32 @@ def scatter_plot(X,Y, labels, plotname):
61 61 # Plot diagonal line (45 degrees)
62 62 plt.plot(np.arange(0.0, 1.0, 0.01), np.arange(0.0, 1.0, 0.01))
63 63  
64   - # for i, xy in enumerate(zip(X, Y)): # <--
65   - # # ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='data') # <--
66   - # ax.annotate(labels[i], xy=xy, textcoords='data') # <--
  64 + # # Plot label
  65 + # plt.annotate(
  66 + # label,
  67 + # xy=(x, y), xytext=(-20, 20),
  68 + # textcoords='offset points', ha='right', va='bottom',
  69 + # bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
  70 + # arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0')
  71 +
  72 + for i, xy in enumerate(zip(X, Y)): # <--
  73 + # ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='data') # <--
  74 + if "#bospoli" in labels[i]: # #bospoli, #marinameets, #5daysfor_flipside
  75 + # plt.annotate(labels[i], #xy=xy,
  76 + # # xytext=(-20, 20), textcoords='offset points', ha='right', va='bottom',
  77 + # # bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
  78 + # # arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0')) # <--
  79 + #
  80 + # # xycoords='data',
  81 + # # xytext=(0.05, 0.05), textcoords='offset points',
  82 + # # arrowprops=dict(arrowstyle="->",facecolor='black', # shrink=0.05,
  83 + # # connectionstyle="arc")
  84 + #
  85 + # # xytext=(0.05, 0.05), arrowprops=dict(facecolor='black', shrink=0.05),
  86 + #
  87 + # )
  88 +
  89 + plt.plot(xy[0], xy[1], linestyle='None', marker='$\lambda$', color="k", markersize=10)
67 90  
68 91 # plt.xlim(-1.5, 1.5)
69 92 # plt.xticks(())
... ... @@ -75,7 +98,7 @@ def scatter_plot(X,Y, labels, plotname):
75 98  
76 99 plt.tight_layout()
77 100 plt.savefig(plotname)
78   - #plt.show()
  101 + plt.show()
79 102  
80 103  
81 104 if __name__ == '__main__':
... ...