Commit fe6b68ce0b25a7b7d1d4951d2cd767291ecf22a6

Authored by Diego Ceccarelli
1 parent 2c68f679

merged external links (by Mike Huffman)

src/main/java/it/cnr/isti/hpc/wikipedia/article/Article.java
... ... @@ -4,7 +4,7 @@
4 4 * Licensed under the Apache License, Version 2.0 (the "License");
5 5 * you may not use this file except in compliance with the License.
6 6 * You may obtain a copy of the License at
7   - *
  7 + *
8 8 * http://www.apache.org/licenses/LICENSE-2.0
9 9 *
10 10 * Unless required by applicable law or agreed to in writing, software
... ... @@ -23,7 +23,7 @@ import com.google.gson.Gson;
23 23  
24 24 /**
25 25 * Article represents an article in the Wikipedia dump.
26   - *
  26 + *
27 27 * @author Diego Ceccarelli, diego.ceccarelli@isti.cnr.it created on 19/nov/2011
28 28 */
29 29 public class Article {
... ... @@ -57,6 +57,7 @@ public class Article {
57 57 private List<Link> images;
58 58 protected List<List<String>> lists;
59 59 private List<Link> links;
  60 + private List<Link> externalLinks;
60 61 protected String redirect;
61 62 private List<String> sections;
62 63 private List<String> paragraphs;
... ... @@ -131,7 +132,7 @@ public class Article {
131 132  
132 133 /**
133 134 * the redirect without the anchor, e.g., da_vinci#life -> da_vinci
134   - *
  135 + *
135 136 * @return the redirect without the anchor
136 137 */
137 138 public String getRedirectNoAnchor() {
... ... @@ -386,6 +387,10 @@ public class Article {
386 387 for (Link l : getLinks())
387 388 sb.append("\t").append(l).append("\n");
388 389  
  390 + sb.append("EXTERNALLINKS:\n");
  391 + for (Link l : getExternalLinks())
  392 + sb.append("\t").append(l).append("\n");
  393 +
389 394 sb.append("CATEGORIES:\n");
390 395 for (Link l : getCategories())
391 396 sb.append("\t").append(l).append("\n");
... ... @@ -397,7 +402,7 @@ public class Article {
397 402  
398 403 /**
399 404 * Removes the TEMPLATE text from the raw text of the article.
400   - *
  405 + *
401 406 * @param text
402 407 * @return the 'cleaned' text
403 408 */
... ... @@ -451,10 +456,20 @@ public class Article {
451 456 return links;
452 457 }
453 458  
  459 + public List<Link> getExternalLinks() {
  460 + if (externalLinks == null)
  461 + return Collections.emptyList();
  462 + return externalLinks;
  463 + }
  464 +
454 465 public void setLinks(List<Link> links) {
455 466 this.links = links;
456 467 }
457 468  
  469 + public void setExternalLinks(List<Link> links) {
  470 + this.externalLinks = links;
  471 + }
  472 +
458 473 public void addCategory(Link category) {
459 474 if (this.getCategories() == null)
460 475 categories = new ArrayList<Link>();
... ...
src/main/java/it/cnr/isti/hpc/wikipedia/parser/ArticleParser.java
... ... @@ -4,7 +4,7 @@
4 4 * Licensed under the Apache License, Version 2.0 (the "License");
5 5 * you may not use this file except in compliance with the License.
6 6 * You may obtain a copy of the License at
7   - *
  7 + *
8 8 * http://www.apache.org/licenses/LICENSE-2.0
9 9 *
10 10 * Unless required by applicable law or agreed to in writing, software
... ... @@ -44,11 +44,11 @@ import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
44 44 /**
45 45 * Generates a MediaWiki parser for a given language (it expects to find a
46 46 * locale file in <tt>src/main/resources/</tt>).
47   - *
  47 + *
48 48 * @see Locale
49   - *
  49 + *
50 50 * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
51   - *
  51 + *
52 52 * Created on Feb 14, 2013
53 53 */
54 54 public class ArticleParser {
... ... @@ -325,6 +325,7 @@ public class ArticleParser {
325 325 private void setLinks(Article article, ParsedPage page) {
326 326  
327 327 List<Link> links = new ArrayList<Link>(10);
  328 + List<Link> elinks = new ArrayList<Link>(10);
328 329  
329 330 for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) {
330 331 if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) {
... ... @@ -332,8 +333,14 @@ public class ArticleParser {
332 333 links.add(new Link(t.getTarget(), t.getText()));
333 334  
334 335 }
  336 + if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {
  337 +
  338 + elinks.add(new Link(t.getTarget(), t.getText()));
  339 +
  340 + }
335 341 }
336 342 article.setLinks(links);
  343 + article.setExternalLinks(elinks);
337 344 }
338 345  
339 346 private void setTemplates(Article article, ParsedPage page) {
... ... @@ -355,7 +362,7 @@ public class ArticleParser {
355 362 }
356 363  
357 364 /**
358   - *
  365 + *
359 366 * @param templateParameters
360 367 */
361 368 private void parseTemplatesSchema(Article article,
... ...