From 8c61b7c34844934fc3c577edb1ce5dfdea7a5a12 Mon Sep 17 00:00:00 2001 From: Cristina Muntean Date: Tue, 4 Oct 2016 10:09:34 +0200 Subject: [PATCH] some unicode fix when reading the file --- merge_wordcount_with_edge_features.py | 2 +- prepare_plot_scatter_2_distrib.py | 2 -- resources/stop-word-list.txt | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/merge_wordcount_with_edge_features.py b/merge_wordcount_with_edge_features.py index 169d77b..b111b97 100644 --- a/merge_wordcount_with_edge_features.py +++ b/merge_wordcount_with_edge_features.py @@ -110,7 +110,7 @@ def mergeBlackList(city, cityFile, minfreq): inputFile = codecs.open(fileDir+"/"+filename, "r", "utf-8") for line in inputFile: lines += 1 - node = Node.parseString(line) + node = Node.parseString(unicode(line)) if node.nodeCount > minfreq: #!!! this restricts a lot the dataset if node.description in tokenDict: # update node diff --git a/prepare_plot_scatter_2_distrib.py b/prepare_plot_scatter_2_distrib.py index 21eb04d..52f7a49 100644 --- a/prepare_plot_scatter_2_distrib.py +++ b/prepare_plot_scatter_2_distrib.py @@ -13,8 +13,6 @@ we do a scatterplot import argparse import codecs import logging -import os -import sys from collections import defaultdict stopwords = open('./resources/stop-word-list.txt', 'r').read().decode('utf-8').split('\r\n') diff --git a/resources/stop-word-list.txt b/resources/stop-word-list.txt index c80beee..f15fcd8 100755 --- a/resources/stop-word-list.txt +++ b/resources/stop-word-list.txt @@ -342,6 +342,7 @@ you're > i'll .... +... < !!!! just -- libgit2 0.21.4