Commit fe6b68ce0b25a7b7d1d4951d2cd767291ecf22a6
1 parent: 2c68f679
merged external links (by Mike Huffman)
Showing 2 changed files with 31 additions and 9 deletions
Show diff stats
src/main/java/it/cnr/isti/hpc/wikipedia/article/Article.java
... | ... | @@ -4,7 +4,7 @@ |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
6 | 6 | * You may obtain a copy of the License at |
7 | - * | |
7 | + * | |
8 | 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | 9 | * |
10 | 10 | * Unless required by applicable law or agreed to in writing, software |
... | ... | @@ -23,7 +23,7 @@ import com.google.gson.Gson; |
23 | 23 | |
24 | 24 | /** |
25 | 25 | * Article represents an article in the Wikipedia dump. |
26 | - * | |
26 | + * | |
27 | 27 | * @author Diego Ceccarelli, diego.ceccarelli@isti.cnr.it created on 19/nov/2011 |
28 | 28 | */ |
29 | 29 | public class Article { |
... | ... | @@ -57,6 +57,7 @@ public class Article { |
57 | 57 | private List<Link> images; |
58 | 58 | protected List<List<String>> lists; |
59 | 59 | private List<Link> links; |
60 | + private List<Link> externalLinks; | |
60 | 61 | protected String redirect; |
61 | 62 | private List<String> sections; |
62 | 63 | private List<String> paragraphs; |
... | ... | @@ -131,7 +132,7 @@ public class Article { |
131 | 132 | |
132 | 133 | /** |
133 | 134 | * the redirect without the anchor, e.g., da_vinci#life -> da_vinci |
134 | - * | |
135 | + * | |
135 | 136 | * @return the redirect without the anchor |
136 | 137 | */ |
137 | 138 | public String getRedirectNoAnchor() { |
... | ... | @@ -386,6 +387,10 @@ public class Article { |
386 | 387 | for (Link l : getLinks()) |
387 | 388 | sb.append("\t").append(l).append("\n"); |
388 | 389 | |
390 | + sb.append("EXTERNALLINKS:\n"); | |
391 | + for (Link l : getExternalLinks()) | |
392 | + sb.append("\t").append(l).append("\n"); | |
393 | + | |
389 | 394 | sb.append("CATEGORIES:\n"); |
390 | 395 | for (Link l : getCategories()) |
391 | 396 | sb.append("\t").append(l).append("\n"); |
... | ... | @@ -397,7 +402,7 @@ public class Article { |
397 | 402 | |
398 | 403 | /** |
399 | 404 | * Removes the TEMPLATE text from the raw text of the article. |
400 | - * | |
405 | + * | |
401 | 406 | * @param text |
402 | 407 | * @return the 'cleaned' text |
403 | 408 | */ |
... | ... | @@ -451,10 +456,20 @@ public class Article { |
451 | 456 | return links; |
452 | 457 | } |
453 | 458 | |
459 | + public List<Link> getExternalLinks() { | |
460 | + if (externalLinks == null) | |
461 | + return Collections.emptyList(); | |
462 | + return externalLinks; | |
463 | + } | |
464 | + | |
454 | 465 | public void setLinks(List<Link> links) { |
455 | 466 | this.links = links; |
456 | 467 | } |
457 | 468 | |
469 | + public void setExternalLinks(List<Link> links) { | |
470 | + this.externalLinks = links; | |
471 | + } | |
472 | + | |
458 | 473 | public void addCategory(Link category) { |
459 | 474 | if (this.getCategories() == null) |
460 | 475 | categories = new ArrayList<Link>(); | ... | ... |
src/main/java/it/cnr/isti/hpc/wikipedia/parser/ArticleParser.java
... | ... | @@ -4,7 +4,7 @@ |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
6 | 6 | * You may obtain a copy of the License at |
7 | - * | |
7 | + * | |
8 | 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | 9 | * |
10 | 10 | * Unless required by applicable law or agreed to in writing, software |
... | ... | @@ -44,11 +44,11 @@ import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; |
44 | 44 | /** |
45 | 45 | * Generates a Mediawiki parser given a language, (it will expect to find a |
46 | 46 | * locale file in <tt>src/main/resources/</tt>). |
47 | - * | |
47 | + * | |
48 | 48 | * @see Locale |
49 | - * | |
49 | + * | |
50 | 50 | * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it> |
51 | - * | |
51 | + * | |
52 | 52 | * Created on Feb 14, 2013 |
53 | 53 | */ |
54 | 54 | public class ArticleParser { |
... | ... | @@ -325,6 +325,7 @@ public class ArticleParser { |
325 | 325 | private void setLinks(Article article, ParsedPage page) { |
326 | 326 | |
327 | 327 | List<Link> links = new ArrayList<Link>(10); |
328 | + List<Link> elinks = new ArrayList<Link>(10); | |
328 | 329 | |
329 | 330 | for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) { |
330 | 331 | if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) { |
... | ... | @@ -332,8 +333,14 @@ public class ArticleParser { |
332 | 333 | links.add(new Link(t.getTarget(), t.getText())); |
333 | 334 | |
334 | 335 | } |
336 | + if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) { | |
337 | + | |
338 | + elinks.add(new Link(t.getTarget(), t.getText())); | |
339 | + | |
340 | + } | |
335 | 341 | } |
336 | 342 | article.setLinks(links); |
343 | + article.setExternalLinks(elinks); | |
337 | 344 | } |
338 | 345 | |
339 | 346 | private void setTemplates(Article article, ParsedPage page) { |
... | ... | @@ -355,7 +362,7 @@ public class ArticleParser { |
355 | 362 | } |
356 | 363 | |
357 | 364 | /** |
358 | - * | |
365 | + * | |
359 | 366 | * @param templateParameters |
360 | 367 | */ |
361 | 368 | private void parseTemplatesSchema(Article article, | ... | ... |