Commit 8c61b7c34844934fc3c577edb1ce5dfdea7a5a12

Authored by Cristina Muntean
1 parent 769fad15

some unicode fix when reading the file

merge_wordcount_with_edge_features.py
... ... @@ -110,7 +110,7 @@ def mergeBlackList(city, cityFile, minfreq):
110 110 inputFile = codecs.open(fileDir+"/"+filename, "r", "utf-8")
111 111 for line in inputFile:
112 112 lines += 1
113   - node = Node.parseString(line)
  113 + node = Node.parseString(unicode(line))
114 114 if node.nodeCount > minfreq: #!!! this restricts a lot the dataset
115 115 if node.description in tokenDict:
116 116 # update node
... ...
prepare_plot_scatter_2_distrib.py
... ... @@ -13,8 +13,6 @@ we do a scatterplot
13 13 import argparse
14 14 import codecs
15 15 import logging
16   -import os
17   -import sys
18 16 from collections import defaultdict
19 17  
20 18 stopwords = open('./resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n')
... ...
resources/stop-word-list.txt
... ... @@ -342,6 +342,7 @@ you're
342 342 >
343 343 i'll
344 344 ....
  345 +...
345 346 <
346 347 !!!!
347 348 just
... ...