Commit 4b53bee23aaf770368b8264487f012561a82e16a

Authored by Diego Ceccarelli
2 parents 20bcb44a 986670ad

Merge branch 'master' of https://github.com/diegoceccarelli/json-wikipedia

CREDITS.txt 0 → 100644
  1 +## CREDITS.txt
  2 +
  3 +
  4 +### Contributors
  5 +
  6 +Mike Huffman - added the external links for each article;
  7 +Sara Montesi - http://www.saramontesi.it/ - json-wikipedia logo;
0 8 \ No newline at end of file
... ...
README.md
1 1 json-wikipedia ![json-wikipedia](https://dl.dropboxusercontent.com/u/4663256/tmp/json-wikipedia.png)
2 2 ==============
3 3  
4   - Json Wikipedia contains code to convert the Wikipedia XML dump in a [JSON][json] dump.
  4 + Json Wikipedia contains code to convert the Wikipedia XML dump into a [JSON][json] dump.
5 5  
6 6 #### Setup ####
7 7  
... ... @@ -19,7 +19,7 @@ or
19 19  
20 20 ./scripts/convert-xml-dump-to-json.sh [en|it] wikipedia-dump.xml.bz wikipedia-dump.json[.gz]
21 21  
22   -produces in `wikipedia-dump.json` the JSON version of the dump. Each line of the file contains an article
  22 +produces in `wikipedia-dump.json` the JSON version of the dump ([here you can find an example](https://dl.dropboxusercontent.com/u/4663256/tmp/json-wikipedia-sample.json)). Each line of the file contains an article
23 23 of the dump encoded in JSON. Each JSON line can be deserialized into an [Article](http://sassicaia.isti.cnr.it/javadocs/json-wikipedia/it/cnr/isti/hpc/wikipedia/article/Article.html) object,
24 24 which represents an
25 25 _enriched_ version of the wikitext page. The Article object contains:
... ... @@ -34,13 +34,14 @@ _enriched_ version of the wikitext page. The Article object contains:
34 34 * a list of tables that appear in the article ;
35 35 * a list of lists that appear in the article ;
36 36 * a list of internal links that appear in the article;
  37 + * a list of external links that appear in the article;
37 38 * if the article is a redirect, the pointed article;
38 39 * a list of section titles in the article;
39   - * the text of the article, divided in paragraphs;
  40 + * the text of the article, divided in paragraphs (PLAIN, no wikitext);
40 41 * the categories and the templates of the articles;
41 42 * the list of attributes found in the templates;
42 43 * a list of terms highlighted in the article;
43   - * if present, the infobox.
  44 + * if present, the infobox.
44 45  
45 46 #### Usage ####
46 47  
... ...
src/main/java/it/cnr/isti/hpc/wikipedia/article/Article.java
... ... @@ -4,7 +4,7 @@
4 4 * Licensed under the Apache License, Version 2.0 (the "License");
5 5 * you may not use this file except in compliance with the License.
6 6 * You may obtain a copy of the License at
7   - *
  7 + *
8 8 * http://www.apache.org/licenses/LICENSE-2.0
9 9 *
10 10 * Unless required by applicable law or agreed to in writing, software
... ... @@ -23,7 +23,7 @@ import com.google.gson.Gson;
23 23  
24 24 /**
25 25 * Article represents an article in the Wikipedia dump.
26   - *
  26 + *
27 27 * @author Diego Ceccarelli, diego.ceccarelli@isti.cnr.it created on 19/nov/2011
28 28 */
29 29 public class Article {
... ... @@ -57,6 +57,7 @@ public class Article {
57 57 private List<Link> images;
58 58 protected List<List<String>> lists;
59 59 private List<Link> links;
  60 + private List<Link> externalLinks;
60 61 protected String redirect;
61 62 private List<String> sections;
62 63 private List<String> paragraphs;
... ... @@ -131,7 +132,7 @@ public class Article {
131 132  
132 133 /**
133 134 * the redirect without the anchor, e.g., da_vinci#life -> da_vinci
134   - *
  135 + *
135 136 * @return the redirect without the anchor
136 137 */
137 138 public String getRedirectNoAnchor() {
... ... @@ -386,6 +387,10 @@ public class Article {
386 387 for (Link l : getLinks())
387 388 sb.append("\t").append(l).append("\n");
388 389  
  390 + sb.append("EXTERNALLINKS:\n");
  391 + for (Link l : getExternalLinks())
  392 + sb.append("\t").append(l).append("\n");
  393 +
389 394 sb.append("CATEGORIES:\n");
390 395 for (Link l : getCategories())
391 396 sb.append("\t").append(l).append("\n");
... ... @@ -397,7 +402,7 @@ public class Article {
397 402  
398 403 /**
399 404 * Removes the TEMPLATE text from the raw text of the article.
400   - *
  405 + *
401 406 * @param text
402 407 * @return the 'cleaned' text
403 408 */
... ... @@ -451,10 +456,20 @@ public class Article {
451 456 return links;
452 457 }
453 458  
  459 + public List<Link> getExternalLinks() {
  460 + if (externalLinks == null)
  461 + return Collections.emptyList();
  462 + return externalLinks;
  463 + }
  464 +
454 465 public void setLinks(List<Link> links) {
455 466 this.links = links;
456 467 }
457 468  
  469 + public void setExternalLinks(List<Link> links) {
  470 + this.externalLinks = links;
  471 + }
  472 +
458 473 public void addCategory(Link category) {
459 474 if (this.getCategories() == null)
460 475 categories = new ArrayList<Link>();
... ...
src/main/java/it/cnr/isti/hpc/wikipedia/parser/ArticleParser.java
... ... @@ -4,7 +4,7 @@
4 4 * Licensed under the Apache License, Version 2.0 (the "License");
5 5 * you may not use this file except in compliance with the License.
6 6 * You may obtain a copy of the License at
7   - *
  7 + *
8 8 * http://www.apache.org/licenses/LICENSE-2.0
9 9 *
10 10 * Unless required by applicable law or agreed to in writing, software
... ... @@ -44,11 +44,11 @@ import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
44 44 /**
45 45 * Generates a Mediawiki parser given a language, (it will expect to find a
46 46 * locale file in <tt>src/main/resources/</tt>).
47   - *
  47 + *
48 48 * @see Locale
49   - *
  49 + *
50 50 * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
51   - *
  51 + *
52 52 * Created on Feb 14, 2013
53 53 */
54 54 public class ArticleParser {
... ... @@ -325,6 +325,7 @@ public class ArticleParser {
325 325 private void setLinks(Article article, ParsedPage page) {
326 326  
327 327 List<Link> links = new ArrayList<Link>(10);
  328 + List<Link> elinks = new ArrayList<Link>(10);
328 329  
329 330 for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) {
330 331 if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) {
... ... @@ -332,8 +333,14 @@ public class ArticleParser {
332 333 links.add(new Link(t.getTarget(), t.getText()));
333 334  
334 335 }
  336 + if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {
  337 +
  338 + elinks.add(new Link(t.getTarget(), t.getText()));
  339 +
  340 + }
335 341 }
336 342 article.setLinks(links);
  343 + article.setExternalLinks(elinks);
337 344 }
338 345  
339 346 private void setTemplates(Article article, ParsedPage page) {
... ... @@ -355,7 +362,7 @@ public class ArticleParser {
355 362 }
356 363  
357 364 /**
358   - *
  365 + *
359 366 * @param templateParameters
360 367 */
361 368 private void parseTemplatesSchema(Article article,
... ...