Commit 0710465eb5161f5aaf093e45649850367f3ef8b7

Authored by Diego Ceccarelli
0 parents

init commit

.gitignore 0 → 100644
  1 +++ a/.gitignore
  1 +*.class
  2 +
  3 +# Package Files #
  4 +*.war
  5 +*.ear
  6 +*.svn
  7 +.settings
  8 +.classpath
  9 +.project
  10 +data
  11 +target
  12 +libs
  13 +.DS_Store
... ...
logback.xml 0 → 100644
  1 +++ a/logback.xml
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +
  3 +<!-- For assistance related to logback-translator or configuration -->
  4 +<!-- files in general, please contact the logback user mailing list -->
  5 +<!-- at http://www.qos.ch/mailman/listinfo/logback-user -->
  6 +<!-- -->
  7 +<!-- For professional support please see -->
  8 +<!-- http://www.qos.ch/shop/products/professionalSupport -->
  9 +<!-- -->
  10 +<configuration>
  11 + <appender name="A1" class="ch.qos.logback.core.ConsoleAppender">
  12 + <encoder>
  13 + <pattern>%date %-4r [%t] %-5p %c - %m%n</pattern>
  14 + </encoder>
  15 + </appender>
  16 +
  17 +
  18 + <root level="${log:-INFO}">
  19 + <appender-ref ref="A1" />
  20 + </root>
  21 +
  22 +
  23 +
  24 + <logger name="it.cnr.isti.hpc.property.ProjectProperties" level="ERROR" />
  25 +
  26 +
  27 +
  28 +
  29 +
  30 +</configuration>
... ...
pom.xml 0 → 100644
  1 +++ a/pom.xml
  1 +<?xml version="1.0"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  3 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  4 + <modelVersion>4.0.0</modelVersion>
  5 + <groupId>it.cnr.isti.hpc</groupId>
  6 + <artifactId>twitter-trends</artifactId>
  7 + <version>0.0.1-SNAPSHOT</version>
  8 + <properties>
  9 + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  10 + </properties>
  11 + <packaging>jar</packaging>
  12 + <name>twitter-trends</name>
  13 + <url>http://hpc.isti.cnr.it</url>
  14 + <dependencies>
  15 + <dependency>
  16 + <groupId>junit</groupId>
  17 + <artifactId>junit</artifactId>
  18 + <version>4.11</version>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>it.cnr.isti.hpc</groupId>
  22 + <artifactId>hpc-utils</artifactId>
  23 + <version>0.0.6</version>
  24 + </dependency>
  25 +
  26 + <dependency>
  27 + <groupId>org.twitter4j</groupId>
  28 + <artifactId>twitter4j-stream</artifactId>
  29 + <version>3.0.5</version>
  30 + </dependency>
  31 +
  32 + <dependency>
  33 + <groupId>com.moparisthebest</groupId>
  34 + <artifactId>junidecode</artifactId>
  35 + <version>0.1.1</version>
  36 + </dependency>
  37 +
  38 +
  39 + </dependencies>
  40 + <build>
  41 + <plugins>
  42 + <plugin>
  43 + <groupId>org.apache.maven.plugins</groupId>
  44 + <artifactId>maven-compiler-plugin</artifactId>
  45 + <version>2.3.2</version>
  46 + <configuration>
  47 + <source>1.5</source>
  48 + <target>1.5</target>
  49 + </configuration>
  50 + </plugin>
  51 + <plugin>
  52 + <artifactId>maven-assembly-plugin</artifactId>
  53 + <configuration>
  54 + <descriptorRefs>
  55 + <descriptorRef>jar-with-dependencies</descriptorRef>
  56 + </descriptorRefs>
  57 + </configuration>
  58 + </plugin>
  59 + </plugins>
  60 + </build>
  61 + <repositories>
  62 + <repository>
  63 + <id>dropbox</id>
  64 + <url>https://dl.dropboxusercontent.com/u/4663256/mvn-repository/</url>
  65 + </repository>
  66 + </repositories>
  67 +</project>
... ...
project.properties 0 → 100644
  1 +++ a/project.properties
  1 +secure.db=secure
  2 +eng.resource.service=http://localhost:8080/secure/rest/eng/getResourceById
  3 +db=secure.sqlite
  4 +dexter.rest.url=http://node1.novello.isti.cnr.it:9898/dexter-webapp/api/rest
  5 +hpc.enrich.service=http://localhost:8080/secure/rest/enrich/enrich
  6 +lucene.index=./resources/lucene
  7 +category.threshold=0.2f
  8 +keyword.to.category=resources/keywords/keyword-to-category-2.tsv
  9 +hpc.rest.service=http://localhost:8080/secure/rest
  10 +trends.twitter.core=./resources/trends/trends-new-5.json
  11 +resiltec.trend.service=yonose
0 12 \ No newline at end of file
... ...
scripts/config.sh 0 → 100644
  1 +++ a/scripts/config.sh
#!/usr/bin/env bash
#
# Shared settings for the scripts in this directory; meant to be sourced
# (e.g. `source ./scripts/config.sh`), not executed.

# Must match <version> in pom.xml: it is part of the jar file name below.
VERSION="0.0.1-SNAPSHOT"

# Maximum JVM heap size.
XMX="-Xmx2000m"

# Root log level, passed to the JVM as -Dlog (logback.xml reads ${log:-INFO}).
LOG=INFO
##LOG=DEBUG

# Passed to the JVM as -Dlogat.
LOGAT=1000

# Exit status used by the scripts when called with the wrong number of arguments.
E_BADARGS=65

# Jar produced by the maven-assembly-plugin "jar-with-dependencies" descriptor
# for artifact twitter-trends (see pom.xml).
JAR="./target/twitter-trends-$VERSION-jar-with-dependencies.jar"

# Base java command line; append the fully qualified main class to run.
JAVA="java $XMX -Dlogat=$LOGAT -Dlog=$LOG -cp .:$JAR"

# Package holding the command line entry points, e.g. "$JAVA $CLI.TwitterFileListener".
# NOTE(review): replaces the FIXME placeholder with the package of the classes
# that define main(); confirm this is the intended prefix.
CLI=it.cnr.isti.hpc.trends.listener

export LC_ALL=C
... ...
scripts/example.sh 0 → 100644
  1 +++ a/scripts/example.sh
#!/usr/bin/env bash
#
# Example script: checks its argument count and echoes the two parameters.

# Load the shared settings from the directory this script lives in, so the
# script works regardless of the caller's working directory.
source "$(dirname "${BASH_SOURCE[0]}")/config.sh"

EXPECTED_ARGS=2

if [ "$#" -ne "$EXPECTED_ARGS" ]
then
    # Quote $0 so that a path containing spaces does not break basename.
    echo "Usage: $(basename "$0") input1 input2"
    exit "$E_BADARGS"
fi

echo "hello world param $1, param $2"
... ...
src/main/java/it/cnr/isti/hpc/trends/Bucket.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/Bucket.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends;
  17 +
  18 +import it.cnr.isti.hpc.trends.util.CleanAndTokenizeTweet;
  19 +
  20 +import java.util.ArrayList;
  21 +import java.util.Collection;
  22 +import java.util.Collections;
  23 +import java.util.Date;
  24 +import java.util.HashMap;
  25 +import java.util.List;
  26 +import java.util.Map;
  27 +
  28 +import twitter4j.Status;
  29 +
  30 +/**
  31 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  32 + *
  33 + * Created on Feb 4, 2015
  34 + */
  35 +public class Bucket {
  36 +
  37 + private final Map<Long, Status> tweets = new HashMap<Long, Status>();
  38 + private final Map<String, List<Long>> keywords = new HashMap<String, List<Long>>();
  39 + private final long bucketInterval;
  40 + private final long start;
  41 + private long end;
  42 +
  43 + private void addKeyword(String keyword, long id) {
  44 + if (!keywords.containsKey(keyword)) {
  45 + keywords.put(keyword, new ArrayList<Long>());
  46 + }
  47 + keywords.get(keyword).add(id);
  48 + }
  49 +
  50 + public Bucket(Status firstTweet, long bucketInterval) {
  51 + this.bucketInterval = bucketInterval;
  52 + this.start = firstTweet.getCreatedAt().getTime();
  53 + }
  54 +
  55 + public Bucket(Status status) {
  56 + this(status, 60 * 1000 * 30);
  57 + }
  58 +
  59 + public Integer getKeywordFrequency(String keyword) {
  60 + if (keywords.containsKey(keyword))
  61 + return keywords.get(keyword).size();
  62 + return 0;
  63 + }
  64 +
  65 + private void index(Status status) {
  66 + Collection<String> keywords = CleanAndTokenizeTweet
  67 + .cleanTweetTextRawTokenizer(status.getText());
  68 + long id = status.getId();
  69 + for (String key : keywords) {
  70 + addKeyword(key, id);
  71 + }
  72 + tweets.put(id, status);
  73 + }
  74 +
  75 + public boolean add(Status status) {
  76 + long time = status.getCreatedAt().getTime();
  77 + if (time - start > bucketInterval)
  78 + return false;
  79 + index(status);
  80 + end = status.getCreatedAt().getTime();
  81 + return true;
  82 +
  83 + }
  84 +
  85 + public Collection<String> getKeywords() {
  86 + return keywords.keySet();
  87 + }
  88 +
  89 + public List<Status> getKeywordTweets(String keyword) {
  90 + List<Long> tweetIds = keywords.get(keyword);
  91 + if (tweetIds == null)
  92 + return Collections.emptyList();
  93 + List<Status> tweetList = new ArrayList<Status>();
  94 + if (tweetIds.size() > 12)
  95 + tweetIds = tweetIds.subList(0, 12);
  96 + // ROTFL
  97 + for (Long id : tweetIds) {
  98 + tweetList.add(tweets.get(id));
  99 + }
  100 + return tweetList;
  101 + }
  102 +
  103 + public Date getTime() {
  104 + return new Date(end);
  105 +
  106 + }
  107 +
  108 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/Keywords.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/Keywords.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends;
  17 +
  18 +/**
  19 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  20 + *
  21 + * Created on Feb 4, 2015
  22 + */
  23 +public class Keywords {
  24 +
  25 + public String[] getKeywords() {
  26 +
  27 + return "pisa,pontedera,attacco,ultras,casapound,terrorismo,elezioni,presidente,renzi"
  28 + .split(",");
  29 + }
  30 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/RecentBuckets.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/RecentBuckets.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends;
  17 +
  18 +import java.util.ArrayDeque;
  19 +import java.util.ArrayList;
  20 +import java.util.List;
  21 +import java.util.Queue;
  22 +
  23 +import org.slf4j.Logger;
  24 +import org.slf4j.LoggerFactory;
  25 +
  26 +/**
  27 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  28 + *
  29 + * Created on Feb 4, 2015
  30 + */
  31 +public class RecentBuckets {
  32 +
  33 + private static final Logger logger = LoggerFactory
  34 + .getLogger(RecentBuckets.class);
  35 + Queue<Bucket> buckets = new ArrayDeque<Bucket>();
  36 + Bucket current = null;
  37 +
  38 + private final int size = 10;
  39 +
  40 + public RecentBuckets() {
  41 +
  42 + }
  43 +
  44 + public Bucket getCurrentBucket() {
  45 + return current;
  46 + }
  47 +
  48 + public void add(Bucket b) {
  49 + buckets.add(b);
  50 + current = b;
  51 + if (buckets.size() > size) {
  52 + buckets.remove();
  53 + }
  54 + }
  55 +
  56 + public List<Integer> getKeywordFrequencies(String keyword) {
  57 + List<Integer> frequencies = new ArrayList<Integer>();
  58 + StringBuilder sb = new StringBuilder();
  59 + sb.append("[").append(keyword).append("]:");
  60 + for (Bucket b : buckets) {
  61 + frequencies.add(b.getKeywordFrequency(keyword));
  62 + sb.append("[").append(b.getKeywordFrequency(keyword)).append("]");
  63 + }
  64 + // System.out.println(sb.toString());
  65 + return frequencies;
  66 + }
  67 +
  68 + public int getSize() {
  69 + return buckets.size();
  70 + }
  71 +
  72 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/TrendDetector.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/TrendDetector.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends;
  17 +
  18 +import it.cnr.isti.hpc.trends.output.Trend;
  19 +
  20 +import java.util.ArrayList;
  21 +import java.util.List;
  22 +
  23 +import org.slf4j.Logger;
  24 +import org.slf4j.LoggerFactory;
  25 +
  26 +import twitter4j.Status;
  27 +
  28 +/**
  29 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  30 + *
  31 + * Created on Feb 5, 2015
  32 + */
  33 +public class TrendDetector {
  34 +
  35 + private static final Logger logger = LoggerFactory
  36 + .getLogger(TrendDetector.class);
  37 +
  38 + private int minFreq = 3;
  39 + private long bucketInterval = 1000 * 60;
  40 +
  41 + Bucket currentBucket;
  42 + RecentBuckets bucketHistory = new RecentBuckets();
  43 +
  44 + public TrendDetector setMinFreq(int minFreq) {
  45 + this.minFreq = minFreq;
  46 + return this;
  47 + }
  48 +
  49 + public TrendDetector setBucketInterval(int intervalInMinutes) {
  50 + this.bucketInterval = intervalInMinutes * 1000 * 60;
  51 + return this;
  52 + }
  53 +
  54 + public void collect(Status status) {
  55 + if (currentBucket == null) {
  56 + currentBucket = new Bucket(status, bucketInterval);
  57 + return;
  58 + }
  59 + if (!currentBucket.add(status)) {
  60 + // bucket is full
  61 + bucketHistory.add(currentBucket);
  62 + // compute trend;
  63 + List<Trend> trends = updateTrends(bucketHistory);
  64 + for (Trend trend : trends) {
  65 + System.out.println(trend.asString());
  66 +
  67 + }
  68 + currentBucket = new Bucket(status, bucketInterval);
  69 + }
  70 + }
  71 +
  72 + public double getZscore(RecentBuckets rb, String keyword) {
  73 + List<Integer> frequencies = rb.getKeywordFrequencies(keyword);
  74 +
  75 + if (frequencies.size() == 1) {
  76 + // only the last bucket
  77 + logger.warn("no previous bucket, skipping");
  78 + return 0;
  79 + }
  80 + frequencies.remove(frequencies.size() - 1);
  81 + double sum = 0.0, score;
  82 + for (double freq : frequencies) {
  83 + sum += freq;
  84 + }
  85 + double mean = sum / rb.getSize();
  86 + double temp = 0;
  87 + for (double freq : frequencies) {
  88 + temp += (mean - freq) * (mean - freq);
  89 + }
  90 + double variance = temp / rb.getSize();
  91 + if (variance == 0) {
  92 + score = Double.MAX_VALUE;
  93 + } else {
  94 +
  95 + int baseFreq = rb.getCurrentBucket().getKeywordFrequency(keyword);
  96 + score = (baseFreq - mean) / variance;
  97 + }
  98 + return score;
  99 + }
  100 +
  101 + public List<Trend> updateTrends(RecentBuckets rb) {
  102 + System.out.println("----- update trends -----");
  103 + List<Trend> trends = new ArrayList<Trend>();
  104 + Bucket current = rb.getCurrentBucket();
  105 + for (String keyword : current.getKeywords()) {
  106 + if (isTrend(rb, keyword)) {
  107 + Trend trend = new Trend(keyword,
  108 + current.getKeywordFrequency(keyword));
  109 + trend.setTweets(current.getKeywordTweets(keyword));
  110 + trends.add(trend);
  111 + trend.setAtTime(current.getTime());
  112 + }
  113 +
  114 + }
  115 + return trends;
  116 + }
  117 +
  118 + // if ((Double.compare(zscore, 2.5d) >= 0) && (baseFreq > 10)) {
  119 + // // burstyMap.put(kw.getName(), kw);
  120 + // // cli.writeLineInOutput(plainTrendFormatting(kw));
  121 + // // cli.writeLineInOutput(gson.toJson(kw));
  122 + // sendTrends(keywords2TrendTransform(kw));
  123 + // cli.writeLineInOutput(gson.toJson(keywords2TrendTransform(kw)));
  124 + //
  125 + // }
  126 +
  127 + private boolean isTrend(RecentBuckets rb, String keyword) {
  128 + double zscore = getZscore(rb, keyword);
  129 + if (zscore > 2.5d) {
  130 + int baseFreq = rb.getCurrentBucket().getKeywordFrequency(keyword);
  131 + if (baseFreq > minFreq) {
  132 + return true;
  133 + }
  134 + }
  135 + return false;
  136 + }
  137 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/listener/AbstractTwitterListener.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/listener/AbstractTwitterListener.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.listener;
  17 +
  18 +import it.cnr.isti.hpc.trends.TrendDetector;
  19 +
  20 +import java.util.ArrayList;
  21 +import java.util.List;
  22 +
  23 +import javax.activity.InvalidActivityException;
  24 +
  25 +import twitter4j.Status;
  26 +
  27 +/**
  28 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  29 + *
  30 + * Created on Feb 5, 2015
  31 + */
  32 +public abstract class AbstractTwitterListener implements TwitterListener {
  33 +
  34 + protected final List<TrendDetector> detectors = new ArrayList<TrendDetector>();
  35 +
  36 + public void addDetector(TrendDetector td) {
  37 + detectors.add(td);
  38 + }
  39 +
  40 + public void run() throws InvalidActivityException {
  41 + if (detectors.isEmpty()) {
  42 + throw new InvalidActivityException("no trend detectors set");
  43 + }
  44 + }
  45 +
  46 + public void collect(Status s) {
  47 + for (TrendDetector t : detectors) {
  48 + t.collect(s);
  49 + }
  50 + }
  51 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/listener/TwitterAPIListener.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/listener/TwitterAPIListener.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.listener;
  17 +
  18 +import it.cnr.isti.hpc.property.ProjectProperties;
  19 +import it.cnr.isti.hpc.trends.Bucket;
  20 +import it.cnr.isti.hpc.trends.Keywords;
  21 +import it.cnr.isti.hpc.trends.TrendDetector;
  22 +
  23 +import javax.activity.InvalidActivityException;
  24 +
  25 +import twitter4j.FilterQuery;
  26 +import twitter4j.StallWarning;
  27 +import twitter4j.Status;
  28 +import twitter4j.StatusDeletionNotice;
  29 +import twitter4j.StatusListener;
  30 +import twitter4j.TwitterStream;
  31 +import twitter4j.TwitterStreamFactory;
  32 +import twitter4j.conf.ConfigurationBuilder;
  33 +
  34 +/**
  35 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  36 + *
  37 + * Created on Feb 4, 2015
  38 + */
  39 +public class TwitterAPIListener extends AbstractTwitterListener {
  40 +
  41 + private static ProjectProperties properties = new ProjectProperties(
  42 + Bucket.class);
  43 +
  44 + public TwitterAPIListener(TrendDetector td) {
  45 + addDetector(td);
  46 + }
  47 +
  48 + @Override
  49 + public void run() throws InvalidActivityException {
  50 + super.run();
  51 + ConfigurationBuilder configurationBuilder = new ConfigurationBuilder();
  52 + configurationBuilder
  53 + .setJSONStoreEnabled(false)
  54 + .setOAuthConsumerKey(properties.get("oauth.consumerKey"))
  55 + .setOAuthConsumerSecret(properties.get("oauth.consumerSecret"))
  56 + .setOAuthAccessToken(properties.get("oauth.accessToken"))
  57 + .setOAuthAccessTokenSecret(
  58 + properties.get("oauth.accessTokenSecret"));
  59 + TwitterStream twitterStream = new TwitterStreamFactory(
  60 + configurationBuilder.build()).getInstance();
  61 + FilterQuery tweetFilterQuery = new FilterQuery(); // See
  62 +
  63 + tweetFilterQuery.language(new String[] { "it" });
  64 + Keywords k = new Keywords();
  65 + tweetFilterQuery.track(k.getKeywords());
  66 +
  67 + twitterStream.addListener(new StatusListener() {
  68 + public void onStatus(Status status) {
  69 + collect(status);
  70 +
  71 + }
  72 +
  73 + public void onException(Exception arg0) {
  74 + // TODO Auto-generated method stub
  75 +
  76 + }
  77 +
  78 + public void onDeletionNotice(StatusDeletionNotice arg0) {
  79 + // TODO Auto-generated method stub
  80 +
  81 + }
  82 +
  83 + public void onScrubGeo(long arg0, long arg1) {
  84 + // TODO Auto-generated method stub
  85 +
  86 + }
  87 +
  88 + public void onStallWarning(StallWarning arg0) {
  89 + // TODO Auto-generated method stub
  90 +
  91 + }
  92 +
  93 + public void onTrackLimitationNotice(int arg0) {
  94 + // TODO Auto-generated method stub
  95 +
  96 + }
  97 +
  98 + });
  99 + twitterStream.filter(tweetFilterQuery);
  100 + }
  101 +
  102 + public static void main(String[] args) throws InvalidActivityException {
  103 + TwitterAPIListener listener = new TwitterAPIListener(
  104 + new TrendDetector().setMinFreq(15).setBucketInterval(30));
  105 +
  106 + listener.run();
  107 +
  108 + }
  109 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/listener/TwitterFileListener.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/listener/TwitterFileListener.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.listener;
  17 +
  18 +import it.cnr.isti.hpc.io.reader.Filter;
  19 +import it.cnr.isti.hpc.io.reader.RecordReader;
  20 +import it.cnr.isti.hpc.property.ProjectProperties;
  21 +import it.cnr.isti.hpc.trends.Bucket;
  22 +import it.cnr.isti.hpc.trends.TrendDetector;
  23 +import it.cnr.isti.hpc.trends.util.StatusJSONImpl;
  24 +
  25 +import java.io.File;
  26 +
  27 +import javax.activity.InvalidActivityException;
  28 +
  29 +import org.slf4j.Logger;
  30 +import org.slf4j.LoggerFactory;
  31 +
  32 +import twitter4j.Status;
  33 +
  34 +import com.google.gson.Gson;
  35 +
  36 +/**
  37 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  38 + *
  39 + * Created on Feb 4, 2015
  40 + */
  41 +public class TwitterFileListener extends AbstractTwitterListener {
  42 +
  43 + /**
  44 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  45 + *
  46 + * Created on Feb 5, 2015
  47 + */
  48 + public class ItalianFilter implements Filter<StatusJSONImpl> {
  49 +
  50 + public boolean isFilter(StatusJSONImpl tweet) {
  51 +
  52 + return ((tweet.getIsoLanguageCode() == null) || (!tweet
  53 + .getIsoLanguageCode().equals("it")));
  54 + }
  55 + }
  56 +
  57 + private static final Logger logger = LoggerFactory
  58 + .getLogger(TwitterFileListener.class);
  59 +
  60 + private static ProjectProperties properties = new ProjectProperties(
  61 + Bucket.class);
  62 +
  63 + private File target;
  64 +
  65 + public TwitterFileListener(TrendDetector td) {
  66 + addDetector(td);
  67 + }
  68 +
  69 + public void setTarget(File target) {
  70 + this.target = target;
  71 + }
  72 +
  73 + @Override
  74 + public void run() throws InvalidActivityException {
  75 + super.run();
  76 + if (!target.exists()) {
  77 + throw new InvalidActivityException(target.getAbsolutePath()
  78 + + " does not exist");
  79 + }
  80 + if (target.isFile()) {
  81 + addAll(target);
  82 + }
  83 + if (target.isDirectory()) {
  84 + for (File f : target.listFiles()) {
  85 + if (f.isFile()) {
  86 + addAll(f);
  87 + }
  88 + }
  89 + }
  90 + }
  91 +
  92 + private void addAll(File f) throws InvalidActivityException {
  93 + if (!f.exists())
  94 + throw new InvalidActivityException(f.getAbsolutePath()
  95 + + " does not exist");
  96 +
  97 + RecordReader<StatusJSONImpl> reader = new RecordReader<StatusJSONImpl>(
  98 + f.getAbsolutePath(), StatusJSONImpl.class)
  99 + .filter(new ItalianFilter());
  100 + Gson gson = new Gson();
  101 + for (Status s : reader) {
  102 +
  103 + collect(s);
  104 + }
  105 +
  106 + }
  107 +
  108 + public static void main(String[] args) throws InvalidActivityException {
  109 + TwitterFileListener listener = new TwitterFileListener(
  110 + new TrendDetector().setMinFreq(15).setBucketInterval(30));
  111 + listener.setTarget(new File("/tmp/italian-tweets-20150201.json.gz"));
  112 + listener.run();
  113 +
  114 + }
  115 +
  116 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/listener/TwitterListener.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/listener/TwitterListener.java
  1 +/**
  2 + * Copyright 2015 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.listener;
  17 +
  18 +import it.cnr.isti.hpc.trends.TrendDetector;
  19 +
  20 +import javax.activity.InvalidActivityException;
  21 +
  22 +/**
  23 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  24 + *
  25 + * Created on Feb 5, 2015
  26 + */
  27 +public interface TwitterListener {
  28 +
  29 + public abstract void addDetector(TrendDetector td);
  30 +
  31 + public abstract void run() throws InvalidActivityException;
  32 +
  33 +}
0 34 \ No newline at end of file
... ...
src/main/java/it/cnr/isti/hpc/trends/output/Classified.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/output/Classified.java
  1 +/**
  2 + *
  3 + */
  4 +package it.cnr.isti.hpc.trends.output;
  5 +
/**
 * Bean describing a classification. A new instance is named
 * "trend detection" and points to the Trend_Detection ontology URI; id and
 * description are left at their defaults.
 *
 * @author cris
 */
public class Classified {
    private int id;
    private String name;
    private String uri;
    private String description;

    /**
     * Creates the trend detection classification.
     */
    public Classified() {
        this.name = "trend detection";
        this.uri = "http://secure.eng.it/ontologySecure/microEvents.owl#Trend_Detection";
    }

    /** @return the id, 0 unless set */
    public int getId() {
        return this.id;
    }

    /** @param id the id to set */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the name */
    public String getName() {
        return this.name;
    }

    /** @param name the name to set */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the URI */
    public String getUri() {
        return this.uri;
    }

    /** @param uri the URI to set */
    public void setUri(String uri) {
        this.uri = uri;
    }

    /** @return the description, {@code null} unless set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the description to set */
    public void setDescription(String description) {
        this.description = description;
    }

    /**
     * Does nothing.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
    }

}
... ...
src/main/java/it/cnr/isti/hpc/trends/output/Entity.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/output/Entity.java
  1 +/**
  2 + *
  3 + */
  4 +package it.cnr.isti.hpc.trends.output;
  5 +
  6 +import java.util.ArrayList;
  7 +import java.util.Date;
  8 +import java.util.List;
  9 +
  10 +/**
  11 + * @author cris
  12 + *
  13 + */
  14 +public class Entity {
  15 + private int id;
  16 + private Date createDate;
  17 + private Date lastUpdate;
  18 + private List<EntityTag> entityTag;
  19 +
  20 + /**
  21 + *
  22 + */
  23 + public Entity() {
  24 + // TODO Auto-generated constructor stub
  25 + entityTag = new ArrayList<EntityTag>();
  26 + }
  27 +
  28 + public int getId() {
  29 + return id;
  30 + }
  31 +
  32 + public void setId(int id) {
  33 + this.id = id;
  34 + }
  35 +
  36 + public Date getCreateDate() {
  37 + return createDate;
  38 + }
  39 +
  40 + public void setCreateDate(Date createDate) {
  41 + this.createDate = createDate;
  42 + }
  43 +
  44 + public Date getLastUpdate() {
  45 + return lastUpdate;
  46 + }
  47 +
  48 + public void setLastUpdate(Date lastUpdate) {
  49 + this.lastUpdate = lastUpdate;
  50 + }
  51 +
  52 + public List<EntityTag> getEntityTag() {
  53 + return entityTag;
  54 + }
  55 +
  56 + public void setEntityTag(List<EntityTag> entityTag) {
  57 + this.entityTag = entityTag;
  58 + }
  59 +
  60 + /**
  61 + * @param args
  62 + */
  63 + public static void main(String[] args) {
  64 + // TODO Auto-generated method stub
  65 +
  66 + }
  67 +
  68 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/output/EntityTag.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/output/EntityTag.java
  1 +package it.cnr.isti.hpc.trends.output;
  2 +
  3 +import java.util.Date;
  4 +
  5 +public class EntityTag {
  6 +
  7 + private int id;
  8 + private String name;
  9 + private Date createDate;
  10 + private Date lastUpdate;
  11 +
  12 + public EntityTag(){
  13 +
  14 + }
  15 +
  16 + public EntityTag(String tag){
  17 + name = tag;
  18 + }
  19 +
  20 + public int getId() {
  21 + return id;
  22 + }
  23 + public void setId(int id) {
  24 + this.id = id;
  25 + }
  26 + public String getName() {
  27 + return name;
  28 + }
  29 + public void setName(String name) {
  30 + this.name = name;
  31 + }
  32 + public Date getCreateDate() {
  33 + return createDate;
  34 + }
  35 + public void setCreateDate(Date createDate) {
  36 + this.createDate = createDate;
  37 + }
  38 + public Date getLastUpdate() {
  39 + return lastUpdate;
  40 + }
  41 + public void setLastUpdate(Date lastUpdate) {
  42 + this.lastUpdate = lastUpdate;
  43 + }
  44 +
  45 +
  46 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/output/EventTag.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/output/EventTag.java
  1 +/**
  2 + *
  3 + */
  4 +package it.cnr.isti.hpc.trends.output;
  5 +
  6 +import java.util.Date;
  7 +
  8 +/**
  9 + * @author cris
  10 + *
  11 + */
  12 +public class EventTag {
  13 + private int id;
  14 + private String name;
  15 + private String tagCount;
  16 + private Date createDate;
  17 + private Date lastUpdate;
  18 +
  19 +
  20 +
  21 + public int getId() {
  22 + return id;
  23 + }
  24 +
  25 + public void setId(int id) {
  26 + this.id = id;
  27 + }
  28 +
  29 + public String getName() {
  30 + return name;
  31 + }
  32 +
  33 + public void setName(String name) {
  34 + this.name = name;
  35 + }
  36 +
  37 + public String getTagCount() {
  38 + return tagCount;
  39 + }
  40 +
  41 + public void setTagCount(String tagCount) {
  42 + this.tagCount = tagCount;
  43 + }
  44 +
  45 + public Date getCreateDate() {
  46 + return createDate;
  47 + }
  48 +
  49 + public void setCreateDate(Date createDate) {
  50 + this.createDate = createDate;
  51 + }
  52 +
  53 + public Date getLastUpdate() {
  54 + return lastUpdate;
  55 + }
  56 +
  57 + public void setLastUpdate(Date lastUpdate) {
  58 + this.lastUpdate = lastUpdate;
  59 + }
  60 +
  61 + /**
  62 + *
  63 + */
  64 + public EventTag() {
  65 + // TODO Auto-generated constructor stub
  66 + }
  67 +
  68 + /**
  69 + * @param args
  70 + */
  71 + public static void main(String[] args) {
  72 + // TODO Auto-generated method stub
  73 +
  74 + }
  75 +
  76 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/output/Trend.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/output/Trend.java
  1 +/**
  2 + * Cristina Muntean Dec 9, 2014
  3 + * twitter-core
  4 + */
  5 +package it.cnr.isti.hpc.trends.output;
  6 +
  7 +import java.util.ArrayList;
  8 +import java.util.Collections;
  9 +import java.util.Date;
  10 +import java.util.HashMap;
  11 +import java.util.List;
  12 +
  13 +import twitter4j.Status;
  14 +
  15 +public class Trend {
  16 +
  17 + private String id;
  18 + private Date atTime;
  19 + private Date circa;
  20 + private String description;
  21 + private String longitude;
  22 + private String latitude;
  23 + private String route;
  24 + private String street_number;
  25 + private String locality;
  26 + private String administrative_area_level_2;
  27 + private String administrative_area_level_1;
  28 + private String country;
  29 + private String posta_code;
  30 + private String severity;
  31 + private Boolean credibility = true;
  32 + private List<String> related;
  33 + private final List<HashMap<String, List<Status>>> eventResource;
  34 + // a list of resources, represented by tweets
  35 + // (containing a list of tweets)
  36 + private List<Entity> entities;
  37 + private List<EventTag> eventTag;
  38 + private List<Classified> classified = new ArrayList<Classified>();
  39 +
  40 + // public String concatTweets() {
  41 + // String tweets = "";
  42 + // List<JsonObject> listOfTweets = eventResource.get(0).get("Tweets");
  43 + // for (JsonObject t : listOfTweets) {
  44 + // tweets = tweets.concat(t.get("text") + " ");
  45 + // }
  46 + // return tweets.trim();
  47 + // }
  48 +
  49 + public String getId() {
  50 + return id;
  51 + }
  52 +
  53 + public void setId(String id) {
  54 + this.id = id;
  55 + }
  56 +
  57 + public Date getAtTime() {
  58 + return atTime;
  59 + }
  60 +
  61 + public void setAtTime(Date atTime) {
  62 + this.atTime = atTime;
  63 + }
  64 +
  65 + public Date getCirca() {
  66 + return circa;
  67 + }
  68 +
  69 + public void setCirca(Date circa) {
  70 + this.circa = circa;
  71 + }
  72 +
  73 + public String getDescription() {
  74 + return description;
  75 + }
  76 +
  77 + public void setDescription(String description) {
  78 + this.description = description;
  79 + }
  80 +
  81 + public String getLongitude() {
  82 + return longitude;
  83 + }
  84 +
  85 + public void setLongitude(String longitude) {
  86 + this.longitude = longitude;
  87 + }
  88 +
  89 + public String getLatitude() {
  90 + return latitude;
  91 + }
  92 +
  93 + public void setLatitude(String latitude) {
  94 + this.latitude = latitude;
  95 + }
  96 +
  97 + public String getRoute() {
  98 + return route;
  99 + }
  100 +
  101 + public void setRoute(String route) {
  102 + this.route = route;
  103 + }
  104 +
  105 + public String getStreet_number() {
  106 + return street_number;
  107 + }
  108 +
  109 + public void setStreet_number(String street_number) {
  110 + this.street_number = street_number;
  111 + }
  112 +
  113 + public String getLocality() {
  114 + return locality;
  115 + }
  116 +
  117 + public void setLocality(String locality) {
  118 + this.locality = locality;
  119 + }
  120 +
  121 + public String getAdministrative_area_level_2() {
  122 + return administrative_area_level_2;
  123 + }
  124 +
  125 + public void setAdministrative_area_level_2(
  126 + String administrative_area_level_2) {
  127 + this.administrative_area_level_2 = administrative_area_level_2;
  128 + }
  129 +
  130 + public String getAdministrative_area_level_1() {
  131 + return administrative_area_level_1;
  132 + }
  133 +
  134 + public void setAdministrative_area_level_1(
  135 + String administrative_area_level_1) {
  136 + this.administrative_area_level_1 = administrative_area_level_1;
  137 + }
  138 +
  139 + public String getCountry() {
  140 + return country;
  141 + }
  142 +
  143 + public void setCountry(String country) {
  144 + this.country = country;
  145 + }
  146 +
  147 + public String getPosta_code() {
  148 + return posta_code;
  149 + }
  150 +
  151 + public void setPosta_code(String posta_code) {
  152 + this.posta_code = posta_code;
  153 + }
  154 +
  155 + public String getSeverity() {
  156 + return severity;
  157 + }
  158 +
  159 + public void setSeverity(String severity) {
  160 + this.severity = severity;
  161 + }
  162 +
  163 + public Boolean getCredibility() {
  164 + return credibility;
  165 + }
  166 +
  167 + public void setCredibility(Boolean credibility) {
  168 + this.credibility = credibility;
  169 + }
  170 +
  171 + public List<String> getRelated() {
  172 + return related;
  173 + }
  174 +
  175 + public void setRelated(List<String> related) {
  176 + this.related = related;
  177 + }
  178 +
  179 + public void setTweets(List<Status> tweets) {
  180 + HashMap<String, List<Status>> t = new HashMap<String, List<Status>>();
  181 + t.put("Tweets", tweets);
  182 +
  183 + eventResource.add(t);
  184 + }
  185 +
  186 + public String asString() {
  187 + StringBuilder sb = new StringBuilder();
  188 + sb.append("------------------------------------\n");
  189 + sb.append("trend: ").append(description).append('\n');
  190 + sb.append("freq: ").append(eventTag.get(0).getTagCount()).append('\n');
  191 + sb.append("date: ").append(atTime).append('\n');
  192 +
  193 + sb.append("tweets:\n");
  194 + for (Status status : getTweets()) {
  195 + sb.append("\t")
  196 + .append(status.getText().replaceAll("[\n\r\t]", " "))
  197 + .append('\n');
  198 + }
  199 + sb.append("\n");
  200 + return sb.toString();
  201 +
  202 + }
  203 +
  204 + public List<Status> getTweets() {
  205 + if (eventResource.isEmpty())
  206 + return Collections.emptyList();
  207 + return eventResource.get(0).get("Tweets");
  208 + }
  209 +
  210 + public List<Entity> getEntities() {
  211 + return entities;
  212 + }
  213 +
  214 + public void setEntities(List<Entity> entities) {
  215 + this.entities = entities;
  216 + }
  217 +
  218 + public List<EventTag> getEventTag() {
  219 + return eventTag;
  220 + }
  221 +
  222 + public void setEventTag(List<EventTag> eventTag) {
  223 + this.eventTag = eventTag;
  224 + }
  225 +
  226 + public List<Classified> getClassified() {
  227 + return classified;
  228 + }
  229 +
  230 + public void setClassified(List<Classified> classified) {
  231 + this.classified = classified;
  232 + }
  233 +
  234 + public Trend(String description, int frequency) {
  235 + this.description = description;
  236 +
  237 + credibility = true;
  238 + related = new ArrayList<String>();
  239 + eventResource = new ArrayList<HashMap<String, List<Status>>>();
  240 + entities = new ArrayList<Entity>();
  241 + eventTag = new ArrayList<EventTag>();
  242 + EventTag tag = new EventTag();
  243 + tag.setName(description);
  244 + tag.setTagCount(String.valueOf(frequency));
  245 + eventTag.add(tag);
  246 + classified = new ArrayList<Classified>();
  247 +
  248 + }
  249 +
  250 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/secure/Category.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/secure/Category.java
  1 +/**
  2 + * Copyright 2014 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.secure;
  17 +
  18 +import java.util.HashSet;
  19 +import java.util.LinkedList;
  20 +import java.util.List;
  21 +import java.util.Set;
  22 +
  23 +/**
  24 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  25 + *
  26 + * Created on Apr 1, 2014
  27 + */
  28 +public class Category implements Comparable<Category> {
  29 + String name;
  30 + String uri;
  31 + double score;
  32 + String description;
  33 + transient List<String> debug;
  34 + transient Set<String> mentions = null;
  35 +
  36 + @Override
  37 + public Category clone() {
  38 + Category c = new Category(name, score);
  39 + c.setUri(uri);
  40 + return c;
  41 + }
  42 +
  43 + public Category(String clazz, double score) {
  44 + super();
  45 + this.name = clazz;
  46 + this.score = score;
  47 + debug = new LinkedList<String>();
  48 +
  49 + }
  50 +
  51 + public void addMention(String s) {
  52 + if (mentions == null)
  53 + mentions = new HashSet<String>();
  54 + mentions.add(s);
  55 + }
  56 +
  57 + public Set<String> getMentions() {
  58 + return mentions;
  59 + }
  60 +
  61 + public void setMentions(Set<String> mentions) {
  62 + this.mentions = mentions;
  63 + }
  64 +
  65 + public void addDebug(String d) {
  66 + debug.add(d);
  67 + }
  68 +
  69 + @Override
  70 + public String toString() {
  71 + return "Tag [clazz=" + name + ", score=" + score + "]";
  72 + }
  73 +
  74 + public String getClazz() {
  75 + return name;
  76 + }
  77 +
  78 + public void setClazz(String clazz) {
  79 + this.name = clazz;
  80 + }
  81 +
  82 + public double getScore() {
  83 + return score;
  84 + }
  85 +
  86 + public void addScore(double score) {
  87 + this.score += score;
  88 + }
  89 +
  90 + public void setScore(double score) {
  91 + this.score = score;
  92 + }
  93 +
  94 + public int compareTo(Category o) {
  95 + if (score > o.score)
  96 + return -1;
  97 + if (score < o.score)
  98 + return 1;
  99 + return 0;
  100 + }
  101 +
  102 + @Override
  103 + public int hashCode() {
  104 + final int prime = 31;
  105 + int result = 1;
  106 + result = prime * result
  107 + + ((description == null) ? 0 : description.hashCode());
  108 + result = prime * result + ((name == null) ? 0 : name.hashCode());
  109 + return result;
  110 + }
  111 +
  112 + @Override
  113 + public boolean equals(Object obj) {
  114 + if (this == obj)
  115 + return true;
  116 + if (obj == null)
  117 + return false;
  118 + if (getClass() != obj.getClass())
  119 + return false;
  120 + Category other = (Category) obj;
  121 + if (description == null) {
  122 + if (other.description != null)
  123 + return false;
  124 + } else if (!description.equals(other.description))
  125 + return false;
  126 + if (name == null) {
  127 + if (other.name != null)
  128 + return false;
  129 + } else if (!name.equals(other.name))
  130 + return false;
  131 + return true;
  132 + }
  133 +
  134 + public String getName() {
  135 + return name;
  136 + }
  137 +
  138 + public void setName(String name) {
  139 + this.name = name;
  140 + }
  141 +
  142 + public String getUri() {
  143 + return uri;
  144 + }
  145 +
  146 + public void setUri(String uri) {
  147 + this.uri = uri;
  148 + }
  149 +
  150 + public void incrementScore() {
  151 + score += 1;
  152 +
  153 + }
  154 +
  155 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/secure/KeywordClassifier.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/secure/KeywordClassifier.java
  1 +/**
  2 + * Copyright 2014 Diego Ceccarelli
  3 + *
  4 + * Licensed under the Apache License, Version 2.0 (the "License");
  5 + * you may not use this file except in compliance with the License.
  6 + * You may obtain a copy of the License at
  7 + *
  8 + * http://www.apache.org/licenses/LICENSE-2.0
  9 + *
  10 + * Unless required by applicable law or agreed to in writing, software
  11 + * distributed under the License is distributed on an "AS IS" BASIS,
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing permissions and
  14 + * limitations under the License.
  15 + */
  16 +package it.cnr.isti.hpc.trends.secure;
  17 +
  18 +import it.cnr.isti.hpc.io.IOUtils;
  19 +import it.cnr.isti.hpc.property.ProjectProperties;
  20 +
  21 +import java.util.ArrayList;
  22 +import java.util.HashMap;
  23 +import java.util.List;
  24 +import java.util.Map;
  25 +import java.util.Scanner;
  26 +import java.util.StringTokenizer;
  27 +
  28 +import org.slf4j.Logger;
  29 +import org.slf4j.LoggerFactory;
  30 +
  31 +/**
  32 + * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
  33 + *
  34 + * Created on Oct 9, 2014
  35 + */
  36 +public class KeywordClassifier {
  37 +
  38 + public Map<String, Category> keyword2category;
  39 + static ProjectProperties properties = new ProjectProperties(
  40 + KeywordClassifier.class);
  41 +
  42 + private static final Logger logger = LoggerFactory
  43 + .getLogger(KeywordClassifier.class);
  44 +
  45 + public KeywordClassifier() {
  46 + keyword2category = new HashMap<String, Category>();
  47 + String keywordFile = properties.get("keyword.to.category");
  48 + String data = IOUtils.getFileAsString(keywordFile);
  49 + Scanner scanner = new Scanner(data);
  50 + while (scanner.hasNext()) {
  51 + String keyword = scanner.next();
  52 + keyword = keyword.toLowerCase().replace('_', ' ');
  53 + String uri = "";
  54 + if (scanner.hasNext()) {
  55 + uri = scanner.next();
  56 + logger.info("[{}] -> [{}] ", keyword, uri);
  57 + Category c = new Category(uri.replace('_', ' '), 1);
  58 + c.setUri("http://www.secure.it/category/" + uri);
  59 + keyword2category.put(keyword, c);
  60 +
  61 + }
  62 + }
  63 +
  64 + }
  65 +
  66 + public List<Category> classify(String text) {
  67 + List<Category> categories = new ArrayList<Category>();
  68 + text = text.toLowerCase();
  69 +
  70 + StringTokenizer tokenizer = new StringTokenizer(text);
  71 + while (tokenizer.hasMoreTokens()) {
  72 + String token = tokenizer.nextToken();
  73 +
  74 + for (Map.Entry<String, Category> entry : keyword2category
  75 + .entrySet()) {
  76 + if (token.startsWith(entry.getKey())) {
  77 + if (categories.contains(entry.getValue())) {
  78 + int i = categories.indexOf(entry.getValue());
  79 + categories.get(i).incrementScore();
  80 + } else {
  81 + categories.add(entry.getValue().clone());
  82 + }
  83 + }
  84 + }
  85 + }
  86 + return categories;
  87 + }
  88 +
  89 + public static void main(String[] args) {
  90 + new KeywordClassifier();
  91 + }
  92 +
  93 + private static KeywordClassifier instance = new KeywordClassifier();
  94 +
  95 + public static KeywordClassifier getInstance() {
  96 + return instance;
  97 + }
  98 +
  99 +}
... ...
src/main/java/it/cnr/isti/hpc/trends/util/CleanAndTokenizeTweet.java 0 → 100644
  1 +++ a/src/main/java/it/cnr/isti/hpc/trends/util/CleanAndTokenizeTweet.java
  1 +/**
  2 + *
  3 + */
  4 +package it.cnr.isti.hpc.trends.util;
  5 +
  6 +import java.util.ArrayList;
  7 +import java.util.Arrays;
  8 +import java.util.Collection;
  9 +import java.util.HashSet;
  10 +import java.util.List;
  11 +import java.util.Set;
  12 +
  13 +import net.sf.junidecode.Junidecode;
  14 +
  15 +/**
  16 + * @author cris
  17 + *
  18 + */
  19 +public class CleanAndTokenizeTweet {
  20 +
  21 + private final static Set<String> stopItalian = new HashSet<String>(
  22 + Arrays.asList(new String[] { "adesso", "ai", "al", "alla", "allo",
  23 + "allora", "altre", "altri", "altro", "anche", "ancora",
  24 + "avere", "aveva", "avevano", "ben", "buono", "che", "chi",
  25 + "cinque", "comprare", "con", "consecutivi", "consecutivo",
  26 + "cosa", "cui", "da", "del", "della", "dello", "dentro",
  27 + "deve", "devo", "di", "doppio", "due", "e", "ecco", "fare",
  28 + "fine", "fino", "fra", "gente", "giu", "ha", "hai",
  29 + "hanno", "ho", "il", "indietro", "invece", "io", "la",
  30 + "lavoro", "le", "lei", "lo", "loro", "lui", "lungo", "ma",
  31 + "me", "meglio", "molta", "molti", "molto", "nei", "nella",
  32 + "no", "noi", "nome", "nostro", "nove", "nuovi", "nuovo",
  33 + "o", "oltre", "ora", "otto", "peggio", "pero", "persone",
  34 + "piu", "poco", "primo", "promesso", "qua", "quarto",
  35 + "quasi", "quattro", "quello", "questo", "qui", "quindi",
  36 + "quinto", "rispetto", "sara", "secondo", "sei", "sembra",
  37 + "sembrava", "senza", "sette", "sia", "siamo", "siete",
  38 + "solo", "sono", "sopra", "soprattutto", "sotto", "stati",
  39 + "stato", "stesso", "su", "subito", "sul", "sulla", "tanto",
  40 + "te", "tempo", "terzo", "tra", "tre", "triplo", "ultimo",
  41 + "un", "una", "uno", "va", "vai", "voi", "volte", "vostro",
  42 + "per", "ora", "dei", "della", "ero", "suo", "dal", "piu",
  43 + "sta", "non" }));
  44 +
  45 + /**
  46 + *
  47 + */
  48 + public CleanAndTokenizeTweet() {
  49 + // TODO Auto-generated constructor stub
  50 + }
  51 +
  52 + public static Collection<String> cleanTweetTextRawTokenizer(String tweetText) {
  53 +
  54 + // the replace all is to remove puntuation (breaks URLs)
  55 + // List<String> words = new Text(tweetText.replaceAll("\\p{P}",
  56 + // "").toLowerCase()).getTerms();
  57 + // List<String> words = new
  58 + // Text(tweetText.replaceAll("([a-z]+)[?:!.,;]*",
  59 + // "$1").toLowerCase()).getTerms();
  60 +
  61 + // List<String> words = Twokenize.tokenize(tweetText);
  62 + List<String> words = new ArrayList<String>();
  63 + words = Twokenize.tokenizeRawTweetText(tweetText);
  64 + Set<String> cleanwords = new HashSet<String>();
  65 +
  66 + if (words.size() > 0) {
  67 + for (String s : words) {
  68 + s = s.toLowerCase();
  69 + s = Junidecode.unidecode(s);
  70 + if (!containsAlpha(s))
  71 + continue;
  72 + if (stopItalian.contains(s)) {
  73 + continue;
  74 + } else if (s.length() <= 2) {
  75 + continue;
  76 + } else if (s.startsWith("@")) {
  77 + continue;
  78 + } else if (s.contains("http")) {
  79 + continue;
  80 + }
  81 +
  82 + cleanwords.add(s);
  83 + }
  84 + }
  85 +
  86 + return cleanwords;
  87 + }
  88 +
  89 + private static boolean containsAlpha(String s) {
  90 + boolean atleastOneAlpha = s.matches(".*[a-zA-Z]+.*");
  91 + return atleastOneAlpha;
  92 + }
  93 +
  94 + // public static List<String> cleanTweetTextTokenizer(String tweetText) {
  95 + //
  96 + // List<String> words = new ArrayList<String>();
  97 + // words = Twokenize.tokenize(tweetText);
  98 + // List<String> cleanwords = new ArrayList<String>();
  99 + //
  100 + // if (words.size() > 0) {
  101 + // for (String s : words) {
  102 + // if (stopItalian.contains(s)) {
  103 + // continue;
  104 + // } else if (s.length() < 2) {
  105 + // continue;
  106 + // } else if (s.startsWith("@")) {
  107 + // continue;
  108 + // }
  109 + // cleanwords.add(s.toLowerCase());
  110 + // }
  111 + // }
  112 + // return cleanwords;
  113 + // }
  114 +
  115 + // public static List<String> cleanTweetTextRegex(String tweetText) {
  116 + //
  117 + // // the replace all is to remove punctuation (breaks URLs)
  118 + // List<String> words = new Text(tweetText.replaceAll("\\p{P}", "")
  119 + // .toLowerCase()).getTerms();
  120 + // // List<String> words = new
  121 + // // Text(tweetText.replaceAll("([a-z]+)[?:!.,;]*",
  122 + // // "$1").toLowerCase()).getTerms();
  123 + //
  124 + // List<String> cleanwords = new ArrayList<String>();
  125 + //
  126 + // if (words.size() > 0) {
  127 + // for (String s : words) {
  128 + // if (stopItalian.contains(s)) {
  129 + // continue;
  130 + // } else if (s.length() < 2) {
  131 + // continue;
  132 + // } else if (s.startsWith("@")) {
  133 + // continue;
  134 + // }
  135 + // cleanwords.add(s.toLowerCase());
  136 + // }
  137 + // }
  138 + // return cleanwords;
  139 + // }
  140 +
  141 + // public static List<String> cleanTweetTextNGrams(String tweetText) {
  142 + //
  143 + // List<String> words = new ArrayList<String>();
  144 + // // unigrams
  145 + // words = Twokenize.tokenizeRawTweetText(tweetText);
  146 + //
  147 + // // bigrams
  148 + // Reader reader = new StringReader(tweetText);
  149 + // TokenStream tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT,
  150 + // reader);
  151 + // tokenizer = new ShingleFilter(tokenizer, 2, 3);
  152 + // CharTermAttribute charTermAttribute = tokenizer
  153 + // .addAttribute(CharTermAttribute.class);
  154 + //
  155 + // try {
  156 + // while (tokenizer.incrementToken()) {
  157 + // String token = charTermAttribute.toString();
  158 + // if (StringUtils.split(token, " ").length > 1) {
  159 + // words.add(token); // there will be unfiltered TODO
  160 + // }
  161 + // }
  162 + // } catch (IOException e) {
  163 + // // TODO Auto-generated catch block
  164 + // e.printStackTrace();
  165 + // }
  166 + //
  167 + // List<String> cleanwords = new ArrayList<String>();
  168 + // if (words.size() > 0) {