Commit 8c61b7c34844934fc3c577edb1ce5dfdea7a5a12

Authored by Cristina Muntean
1 parent 769fad15

some unicode fix when reading the file

merge_wordcount_with_edge_features.py
@@ -110,7 +110,7 @@ def mergeBlackList(city, cityFile, minfreq): @@ -110,7 +110,7 @@ def mergeBlackList(city, cityFile, minfreq):
110 inputFile = codecs.open(fileDir+"/"+filename, "r", "utf-8") 110 inputFile = codecs.open(fileDir+"/"+filename, "r", "utf-8")
111 for line in inputFile: 111 for line in inputFile:
112 lines += 1 112 lines += 1
113 - node = Node.parseString(line) 113 + node = Node.parseString(unicode(line))
114 if node.nodeCount > minfreq: #!!! this restricts a lot the dataset 114 if node.nodeCount > minfreq: #!!! this restricts a lot the dataset
115 if node.description in tokenDict: 115 if node.description in tokenDict:
116 # update node 116 # update node
prepare_plot_scatter_2_distrib.py
@@ -13,8 +13,6 @@ we do a scatterplot @@ -13,8 +13,6 @@ we do a scatterplot
13 import argparse 13 import argparse
14 import codecs 14 import codecs
15 import logging 15 import logging
16 -import os  
17 -import sys  
18 from collections import defaultdict 16 from collections import defaultdict
19 17
20 stopwords = open('./resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n') 18 stopwords = open('./resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n')
resources/stop-word-list.txt
@@ -342,6 +342,7 @@ you're @@ -342,6 +342,7 @@ you're
342 > 342 >
343 i'll 343 i'll
344 .... 344 ....
  345 +...
345 < 346 <
346 !!!! 347 !!!!
347 just 348 just