Commit a23faae1d8750bdd9769219d4b6b8c675b4d168d

Authored by Cristina Muntean
1 parent 2a605d23

put all scripts in a folder; added small description

aggregate-wc-with-edge.sh renamed to scripts/aggregate-wc-with-edge.sh
1 1 #!/bin/bash
2 2  
  3 +# This script merges the features of all 10 cities except the one city given as input,
  4 +# i.e. given 1 city as input, it merges the features of the remaining 9 cities.
  5 +
3 6 CITY=$1
4 7 COMMAND="time python merge_wordcount_with_edge_features.py"
5 8 OPTION="blacklist"
... ...
gen-features-aggregates.sh renamed to scripts/gen-features-aggregates.sh
1 1 #!/bin/bash
2 2  
  3 +# this script generates features for batches of days for tweets per city.
  4 +# e.g. seattle_2of3.tsv
  5 +
3 6 CORES=16
4 7 INPUT_DIR=/data/muntean/filter-10-cities-november-tweets
5 8 OUTPUT_DIR=/data/muntean/edge-features-10-cities-november
... ...
gen-features.sh renamed to scripts/gen-features.sh
1 1 #!/bin/bash
2 2  
  3 +# this script generates features for daily tweets per city.
  4 +# e.g. seattle_20151128.tsv
  5 +
3 6 CORES=16
4 7 INPUT_DIR=/data/muntean/filter-10-cities-november-tweets
5 8 OUTPUT_DIR=/data/muntean/edge-features-10-cities-november
6 9 COMMAND="time python wordcount_from_json_list_with_edge_features.py"
7 10  
8   -for LINE in `ls $INPUT_DIR/*2015111*`
  11 +for LINE in `ls $INPUT_DIR/*201511*`
9 12 do
10 13 #echo $LINE
11 14 OUTPUT_NAME=`basename $LINE | cut -d'.' -f1`
... ...
gen-plot-data.sh renamed to scripts/gen-plot-data.sh
  1 +#!/bin/bash
  2 +
1 3 python prepare_plot_scatter_2_distrib.py /data/muntean/filter-10-cities-tweets/wordcount_boston_1_week.json /data/muntean/filter-10-cities-tweets/wordcount_10_cities_1_week.json 1000 /data/muntean/filter-10-cities-tweets/plot-data/boston_vs_all_top1000.tsv
2 4 python prepare_plot_scatter_2_distrib.py /data/muntean/filter-10-cities-tweets/wordcount_chicago_1_week.json /data/muntean/filter-10-cities-tweets/wordcount_10_cities_1_week.json 1000 /data/muntean/filter-10-cities-tweets/plot-data/chicago_vs_all_top1000.tsv
3 5 python prepare_plot_scatter_2_distrib.py /data/muntean/filter-10-cities-tweets/wordcount_dallas_1_week.json /data/muntean/filter-10-cities-tweets/wordcount_10_cities_1_week.json 1000 /data/muntean/filter-10-cities-tweets/plot-data/dallas_vs_all_top1000.tsv
... ...
run-11.sh renamed to scripts/run-11.sh
1 1 #!/bin/bash
2 2  
3   -#set -e
  3 +# This script processes regular JSON tweet files in parallel, keeping only the tweets coming from one of
  4 +# the 10 selected cities, while also tokenizing the text.
4 5  
5 6 CORES=16
6 7 INPUT_DIR=/data/muntean/english-tweets
7 8 OUTPUT_DIR=/data/muntean/filter-10-cities-november-tweets
8 9 COMMAND="time python filter_tweets_by_city.py"
9 10  
10   -for LINE in `ls $INPUT_DIR/english-tweets-2015112*`
  11 +for LINE in `ls $INPUT_DIR/english-tweets-201511*`
11 12 do
12   - #echo $LINE
13   - #echo $COMMAND
14   - #echo $OUTPUT_DIR
15 13 NUMBER=$(echo $LINE | tr -dc '0-9')
16 14 #echo $NUMBER
17 15 sem -j $CORES $COMMAND $LINE $OUTPUT_DIR 2>&1 > $OUTPUT_DIR/stats-$NUMBER-FTBC.tsv
18 16 done
19 17 sem --wait
20 18  
21   -#RET_CODE=${PIPESTATUS[0]}
22   - # if ${RET_CODE} -ne 0; then
23   - # printf "\e[0;31m%s\e[0m\n" "FAILED .... Error Code: ${RET_CODE}."
24   - # exit ${RET_CODE}
25   - # fi
26   -
27 19 exit 0
... ...
sort-on-feat-aggregates.sh renamed to scripts/sort-on-feat-aggregates.sh
1 1 #!/bin/bash
2 2  
  3 +# This script sorts every aggregated feature file in INPUT_DIR (batches of 10 days for each city) in descending numeric order on the column selected by COLUMN
  4 +
3 5 #columns:
4 6 #1 id
5 7 #2 description
... ... @@ -20,7 +22,7 @@ COLUMN=4
20 22  
21 23 for LINE in `ls $INPUT_DIR/*`
22 24 do
23   - #echo $LINE
  25 + #echo $LINE
24 26 OUTPUT_NAME=`basename $LINE | cut -d'.' -f1`
25 27 #echo $OUTPUT_NAME
26 28 sort -t$'\t' -n -r -k$COLUMN $LINE > $INPUT_DIR/${OUTPUT_NAME}-sorted-col-${COLUMN}.tsv
... ...
sort-on-feat.sh renamed to scripts/sort-on-feat.sh
1 1 #!/bin/bash
2 2  
  3 +# This script sorts every simple feature file in INPUT_DIR (the features for a daily city file) in descending numeric order on the column selected by COLUMN
  4 +
3 5 #columns:
4 6 #1 id
5 7 #2 description
... ... @@ -16,16 +18,13 @@
16 18  
17 19 CORES=16
18 20 INPUT_DIR=/data/muntean/edge-features-10-cities-november
19   -#OUTPUT_DIR=/data/muntean/filter-10-cities-november-tweets
20   -#COMMAND="time python filter_tweets_by_city.py"
21 21 COLUMN=4
22 22  
23 23 for LINE in `ls $INPUT_DIR/*`
24 24 do
25   - #echo $LINE
  25 + #echo $LINE
26 26 OUTPUT_NAME=`basename $LINE | cut -d'.' -f1`
27 27 #echo $OUTPUT_NAME
28 28 sort -t$'\t' -n -r -k$COLUMN $LINE > $INPUT_DIR/${OUTPUT_NAME}-sorted-col-${COLUMN}.tsv
29 29 done
30   -#sem --wait
31 30 exit 0
... ...