WikiDump: Tool to get Wikipedia content

The WikiDump tool retrieves Wikipedia content for a list of article ids. The tool connects to http://en.wikipedia.org to get the text data.

For more information about getting Wikipedia content, see this post.

Here is the source code:


package com.devbypractice;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * Command-line tool that downloads the rendered content of Wikipedia articles,
 * identified by their numeric ids, and writes the text of their top-level
 * paragraphs to UTF-16 text files in an output directory.
 */
public class WikiDump
{
    /** Timeout, in milliseconds, applied to both connecting to and reading from the server. */
    private static final int TIMEOUT_MS = 30000;

    /**
     * Maximum number of UTF-16 chars written to one output file before the
     * next article starts a new file (1000000 chars ~= 2 MB).
     */
    private static final int FILE_SIZE_LIMIT = 1000000;

    /**
     * Url format used to get an online Wikipedia article by id.
     * HTTPS is used because Wikipedia redirects plain-HTTP requests to HTTPS
     * and HttpURLConnection does not follow redirects across protocols.
     */
    private static final String WIKI_URL_FORMAT =
            "https://en.wikipedia.org/w/index.php?action=render&curid=%d";

    /** Matches numeric reference markers such as [12] or [3]; compiled once and reused. */
    private static final Pattern REFERENCE_MARKER = Pattern.compile("\\[[0-9]+\\]");

    /**
     * Downloads the content found at the given url and returns it as a String
     * wrapped in an XML declaration and a single {@code <WikiContent>} root
     * element. Every line of the response is copied followed by a newline.
     *
     * @param sUrlSource Url of the content to download
     * @param sEncoding Name of the charset used to decode the response
     * @return the downloaded content wrapped in {@code <WikiContent>}
     * @throws Exception if the url is malformed, the encoding is unsupported,
     *         or the connection fails or times out
     */
    public static String dumpWikiXhtmlToString(String sUrlSource, String sEncoding)
            throws Exception
    {
        URL url = new URL(sUrlSource);
        URLConnection urlConn = url.openConnection();

        // Bound both phases of the request: without a read timeout a server
        // that accepts the connection and then stalls would block forever.
        urlConn.setConnectTimeout(TIMEOUT_MS);
        urlConn.setReadTimeout(TIMEOUT_MS);

        StringBuilder sb = new StringBuilder();

        // XML header
        sb.append("<?xml version=\"1.0\"?>");
        sb.append("<WikiContent>");

        // The raw stream is closed in finally so that it is released even when
        // the encoding is unsupported or reading fails half-way.
        InputStream stream = urlConn.getInputStream();
        try {
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(stream, sEncoding));

            // Copy response to buffer
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
                sb.append("\n");
            }
        }
        finally {
            stream.close();
        }

        // XML footer
        sb.append("</WikiContent>");

        return sb.toString();
    }

    /**
     * Loads article ids from a file holding one id per line. Leading and
     * trailing whitespace is ignored and blank lines are skipped.
     *
     * @param sFileName Path of the file to read
     * @return Article ids, in file order
     * @throws Exception if the file cannot be read, or a non-blank line is
     *         not a valid integer (NumberFormatException)
     */
    public static List<Integer> loadArticleIdsFromFile(String sFileName)
            throws Exception
    {
        List<Integer> articleIds = new ArrayList<Integer>();

        BufferedReader buffReader = new BufferedReader(new FileReader(sFileName));
        try {
            String sLine;
            while ((sLine = buffReader.readLine()) != null) {
                String sId = sLine.trim();

                // Tolerate empty lines, typically a trailing newline at end of file
                if (sId.length() == 0) {
                    continue;
                }
                articleIds.add(Integer.valueOf(sId));
            }
        }
        finally {
            // Release the file even when an id fails to parse
            buffReader.close();
        }

        return articleIds;
    }

    /**
     * Prints the expected command-line arguments to standard output.
     */
    public static void displayUsage()
    {
        System.out.println("Arguments:");
        System.out.println("<Article id source file path> <Output directory>");
        System.out.println("");
        System.out.println("Notes:");
        System.out.println("Article id file must have an id on each line. ");
        System.out.println("Output directory must exist.");
    }

    /**
     * Text cleanup. Numeric references such as [12] or [3] are deleted.
     *
     * @param text Text to clean
     * @return Cleaned text
     */
    public static String cleanup(String text)
    {
        return REFERENCE_MARKER.matcher(text).replaceAll("");
    }

    /**
     * Main entry. Expects two arguments: the path of the article id file and
     * the output directory. Prints the usage when the argument count is wrong.
     * Any unexpected failure is reported with a stack trace.
     *
     * @param args Arguments
     */
    public static void main(String[] args)
    {
        try {
            if (args.length != 2) {
                displayUsage();
                return;
            }

            // Get article ids from file
            List<Integer> articleIds = loadArticleIdsFromFile(args[0]);

            dumpArticles(articleIds, args[1]);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads every article and appends its paragraphs to the current output
     * file. A new output file, named after the id of the article being
     * processed, is started for the first article and again each time the
     * current file has grown past {@link #FILE_SIZE_LIMIT} chars. A failure on
     * one article is logged and the next article is processed.
     *
     * @param articleIds Ids of the articles to download
     * @param sDestinationDir Existing directory receiving the output files
     * @throws Exception if closing the last output file fails
     */
    private static void dumpArticles(List<Integer> articleIds, String sDestinationDir)
            throws Exception
    {
        // Current output file size, set to MAX + 1 for the file to be
        // created at first time
        int currentFileSize = FILE_SIZE_LIMIT + 1;

        OutputStreamWriter writer = null;

        try {
            for (Integer articleId : articleIds) {
                try {
                    if (currentFileSize > FILE_SIZE_LIMIT) {
                        if (writer != null) {
                            writer.close();
                            // Forget the closed writer so it is never reused
                            // if opening the next file fails
                            writer = null;
                        }

                        // File(parent, child) inserts the platform separator
                        File outFile = new File(sDestinationDir, articleId + ".txt");
                        writer = new OutputStreamWriter(
                                new FileOutputStream(outFile), "UTF-16");

                        currentFileSize = 0;
                    }

                    // Build article url and get article content
                    String sWikiUrl = String.format(WIKI_URL_FORMAT, articleId);
                    String sWiki = dumpWikiXhtmlToString(sWikiUrl, "UTF-8");

                    // Load Xhtml content
                    InputSource is = new InputSource(new StringReader(sWiki));
                    Document doc = DocumentBuilderFactory.newInstance()
                            .newDocumentBuilder()
                            .parse(is);

                    // Target desired wikipedia content: paragraphs directly
                    // under the wrapping root element
                    NodeList nl = XPathProcessor.getInstance().EvaluateMutli(
                            "/WikiContent/p", doc);

                    // Increase output file size count
                    currentFileSize += writeParagraphs(nl, writer, articleId);
                }
                catch (Exception e3) {
                    String sLog = "Article not get for id = " + articleId
                            + "\n";
                    System.out.println(sLog);

                    e3.printStackTrace();
                }
            }
        }
        finally {
            // writer is null when there was no article or no file could be opened
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Writes the cleaned text content of each node, followed by a blank line,
     * to the writer and flushes after each one. A failure on one paragraph is
     * logged and the remaining paragraphs are still processed.
     *
     * @param paragraphs Nodes whose text content is written
     * @param writer Destination writer
     * @param articleId Id of the article, used in log messages only
     * @return number of chars written, counting 2 chars of separator per paragraph
     */
    private static int writeParagraphs(NodeList paragraphs, OutputStreamWriter writer,
            Integer articleId)
    {
        int written = 0;

        for (int i = 0; i < paragraphs.getLength(); i++) {
            try {
                Node node = paragraphs.item(i);

                // Get text content and clean it
                String sContent = cleanup(node.getTextContent());

                // Write in output file
                writer.write(sContent);
                writer.write("\n\n");
                writer.flush();

                written += sContent.length() + 2;
            }
            catch (Exception e2) {
                String sLog = "Paragraph not get " + i
                        + " for id=" + articleId;
                System.out.println(sLog);

                // Report the cause, as done for a failed article
                e2.printStackTrace();
            }
        }

        return written;
    }

}

To compile the source, you will need an XPath query manager, available at this link.

A compiled version of WikiDump can be downloaded here.

Tags: ,

Leave a Reply