Commit 4b53bee23aaf770368b8264487f012561a82e16a
Merge branch 'master' of https://github.com/diegoceccarelli/json-wikipedia
Showing
4 changed files
with
43 additions
and
13 deletions
Show diff stats
README.md
1 | 1 | json-wikipedia  |
2 | 2 | ============== |
3 | 3 | |
4 | - Json Wikipedia contains code to convert the Wikipedia XML dump in a [JSON][json] dump. | |
4 | + Json Wikipedia contains code to convert the Wikipedia XML dump into a [JSON][json] dump. | |
5 | 5 | |
6 | 6 | #### Setup #### |
7 | 7 | |
... | ... | @@ -19,7 +19,7 @@ or |
19 | 19 | |
20 | 20 | ./scripts/convert-xml-dump-to-json.sh [en|it] wikipedia-dump.xml.bz wikipedia-dump.json[.gz] |
21 | 21 | |
22 | -produces in `wikipedia-dump.json` the JSON version of the dump. Each line of the file contains an article | |
22 | +produces in `wikipedia-dump.json` the JSON version of the dump ([here you can find an example](https://dl.dropboxusercontent.com/u/4663256/tmp/json-wikipedia-sample.json)). Each line of the file contains an article | |
23 | 23 | of the dump encoded in JSON. Each JSON line can be deserialized into an [Article](http://sassicaia.isti.cnr.it/javadocs/json-wikipedia/it/cnr/isti/hpc/wikipedia/article/Article.html) object, |
24 | 24 | which represents an |
25 | 25 | _enriched_ version of the wikitext page. The Article object contains: |
... | ... | @@ -34,13 +34,14 @@ _enriched_ version of the wikitext page. The Article object contains: |
34 | 34 | * a list of tables that appear in the article ; |
35 | 35 | * a list of lists that appear in the article ; |
36 | 36 | * a list of internal links that appear in the article; |
37 | + * a list of external links that appear in the article; | |
37 | 38 | * if the article is a redirect, the pointed article; |
38 | 39 | * a list of section titles in the article; |
39 | - * the text of the article, divided in paragraphs; | |
40 | + * the text of the article, divided in paragraphs (PLAIN, no wikitext); | |
40 | 41 | * the categories and the templates of the articles; |
41 | 42 | * the list of attributes found in the templates; |
42 | 43 | * a list of terms highlighted in the article; |
43 | - * if present, the infobox. | |
44 | + * if present, the infobox. | |
44 | 45 | |
45 | 46 | #### Usage #### |
46 | 47 | ... | ... |
src/main/java/it/cnr/isti/hpc/wikipedia/article/Article.java
... | ... | @@ -4,7 +4,7 @@ |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
6 | 6 | * You may obtain a copy of the License at |
7 | - * | |
7 | + * | |
8 | 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | 9 | * |
10 | 10 | * Unless required by applicable law or agreed to in writing, software |
... | ... | @@ -23,7 +23,7 @@ import com.google.gson.Gson; |
23 | 23 | |
24 | 24 | /** |
25 | 25 | * Article represents an article in the Wikipedia dump. |
26 | - * | |
26 | + * | |
27 | 27 | * @author Diego Ceccarelli, diego.ceccarelli@isti.cnr.it created on 19/nov/2011 |
28 | 28 | */ |
29 | 29 | public class Article { |
... | ... | @@ -57,6 +57,7 @@ public class Article { |
57 | 57 | private List<Link> images; |
58 | 58 | protected List<List<String>> lists; |
59 | 59 | private List<Link> links; |
60 | + private List<Link> externalLinks; | |
60 | 61 | protected String redirect; |
61 | 62 | private List<String> sections; |
62 | 63 | private List<String> paragraphs; |
... | ... | @@ -131,7 +132,7 @@ public class Article { |
131 | 132 | |
132 | 133 | /** |
133 | 134 | * the redirect without the anchor, e.g., da_vinci#life -> da_vinci |
134 | - * | |
135 | + * | |
135 | 136 | * @return the redirect without the anchor |
136 | 137 | */ |
137 | 138 | public String getRedirectNoAnchor() { |
... | ... | @@ -386,6 +387,10 @@ public class Article { |
386 | 387 | for (Link l : getLinks()) |
387 | 388 | sb.append("\t").append(l).append("\n"); |
388 | 389 | |
390 | + sb.append("EXTERNALLINKS:\n"); | |
391 | + for (Link l : getExternalLinks()) | |
392 | + sb.append("\t").append(l).append("\n"); | |
393 | + | |
389 | 394 | sb.append("CATEGORIES:\n"); |
390 | 395 | for (Link l : getCategories()) |
391 | 396 | sb.append("\t").append(l).append("\n"); |
... | ... | @@ -397,7 +402,7 @@ public class Article { |
397 | 402 | |
398 | 403 | /** |
399 | 404 | * Removes the TEMPLATE text from the raw text of the article. |
400 | - * | |
405 | + * | |
401 | 406 | * @param text |
402 | 407 | * @return the 'cleaned' text |
403 | 408 | */ |
... | ... | @@ -451,10 +456,20 @@ public class Article { |
451 | 456 | return links; |
452 | 457 | } |
453 | 458 | |
459 | + public List<Link> getExternalLinks() { | |
460 | + if (externalLinks == null) | |
461 | + return Collections.emptyList(); | |
462 | + return externalLinks; | |
463 | + } | |
464 | + | |
454 | 465 | public void setLinks(List<Link> links) { |
455 | 466 | this.links = links; |
456 | 467 | } |
457 | 468 | |
469 | + public void setExternalLinks(List<Link> links) { | |
470 | + this.externalLinks = links; | |
471 | + } | |
472 | + | |
458 | 473 | public void addCategory(Link category) { |
459 | 474 | if (this.getCategories() == null) |
460 | 475 | categories = new ArrayList<Link>(); | ... | ... |
src/main/java/it/cnr/isti/hpc/wikipedia/parser/ArticleParser.java
... | ... | @@ -4,7 +4,7 @@ |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
6 | 6 | * You may obtain a copy of the License at |
7 | - * | |
7 | + * | |
8 | 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | 9 | * |
10 | 10 | * Unless required by applicable law or agreed to in writing, software |
... | ... | @@ -44,11 +44,11 @@ import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; |
44 | 44 | /** |
45 | 45 | * Generates a MediaWiki parser given a language (it will expect to find a |
46 | 46 | * locale file in <tt>src/main/resources/</tt>). |
47 | - * | |
47 | + * | |
48 | 48 | * @see Locale |
49 | - * | |
49 | + * | |
50 | 50 | * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it> |
51 | - * | |
51 | + * | |
52 | 52 | * Created on Feb 14, 2013 |
53 | 53 | */ |
54 | 54 | public class ArticleParser { |
... | ... | @@ -325,6 +325,7 @@ public class ArticleParser { |
325 | 325 | private void setLinks(Article article, ParsedPage page) { |
326 | 326 | |
327 | 327 | List<Link> links = new ArrayList<Link>(10); |
328 | + List<Link> elinks = new ArrayList<Link>(10); | |
328 | 329 | |
329 | 330 | for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) { |
330 | 331 | if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) { |
... | ... | @@ -332,8 +333,14 @@ public class ArticleParser { |
332 | 333 | links.add(new Link(t.getTarget(), t.getText())); |
333 | 334 | |
334 | 335 | } |
336 | + if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) { | |
337 | + | |
338 | + elinks.add(new Link(t.getTarget(), t.getText())); | |
339 | + | |
340 | + } | |
335 | 341 | } |
336 | 342 | article.setLinks(links); |
343 | + article.setExternalLinks(elinks); | |
337 | 344 | } |
338 | 345 | |
339 | 346 | private void setTemplates(Article article, ParsedPage page) { |
... | ... | @@ -355,7 +362,7 @@ public class ArticleParser { |
355 | 362 | } |
356 | 363 | |
357 | 364 | /** |
358 | - * | |
365 | + * | |
359 | 366 | * @param templateParameters |
360 | 367 | */ |
361 | 368 | private void parseTemplatesSchema(Article article, | ... | ... |