<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Dev By Practice</title>
	<atom:link href="http://www.devbypractice.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://www.devbypractice.com</link>
	<description>Tips, sources and references for developers</description>
	<lastBuildDate>Mon, 10 Jan 2011 21:31:58 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.9.2</generator>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>WikiDump: Tool to get Wikipedia content</title>
		<link>http://www.devbypractice.com/wikidump-tool-to-get-wikipedia-content/</link>
		<comments>http://www.devbypractice.com/wikidump-tool-to-get-wikipedia-content/#comments</comments>
		<pubDate>Wed, 30 Jun 2010 11:14:12 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[misc]]></category>
		<category><![CDATA[Java]]></category>
		<category><![CDATA[Wikipedia]]></category>

		<guid isPermaLink="false">http://www.devbypractice.com/?p=155</guid>
		<description><![CDATA[The WikiDump tool retrieves Wikipedia content for a list of article ids. It connects to http://en.wikipedia.org to get the text data.
For more information about getting Wikipedia content, see this post.
Here is the source code:


package com.devbypractice;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public [...]]]></description>
			<content:encoded><![CDATA[<p>The WikiDump tool retrieves Wikipedia content for a list of article ids. It connects to <a href="http://en.wikipedia.org">http://en.wikipedia.org</a> to get the text data.</p>
<p>For more information about getting Wikipedia content, see <a href="get-wikipedia-content/">this post</a>.</p>
<p>Here is the source code:</p>
<pre class="brush: java; title: ;">

package com.devbypractice;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class WikiDump
{
    /** Connect and read timeout for Wikipedia requests, in milliseconds (30 s). */
    private static final int TIMEOUT_MS = 30000;

    /**
     * Maximum number of characters written to one output file before a new
     * file is started. Output is UTF-16, so 1000000 chars is roughly 2 MB.
     */
    private static final int FILE_SIZE_LIMIT = 1000000;

    /** URL format used to fetch the rendered body of an article by its id. */
    private static final String WIKI_URL_FORMAT =
            &quot;http://en.wikipedia.org/w/index.php?action=render&amp;curid=%d&quot;;

    /**
     * Download the XHTML content behind a URL and return it as a String,
     * wrapped in an XML declaration and a single WikiContent root element.
     * Every line of the response is terminated with &quot;\n&quot;.
     *
     * @param sUrlSource Url Source
     * @param sEncoding Encoding used to decode the response
     * @return xhtml content wrapped in a WikiContent element
     * @throws Exception if the URL is malformed, the connection fails or
     *         times out, or the encoding is not supported
     */
    public static String dumpWikiXhtmlToString(String sUrlSource, String sEncoding)
            throws Exception
    {
        URLConnection urlConn = new URL(sUrlSource).openConnection();

        // Bound both connecting and reading, so that a stalled response
        // cannot block the whole batch forever.
        urlConn.setConnectTimeout(TIMEOUT_MS);
        urlConn.setReadTimeout(TIMEOUT_MS);

        StringBuilder sb = new StringBuilder();

        // XML header
        sb.append(&quot;&lt;?xml version=\&quot;1.0\&quot;?&gt;&quot;);
        sb.append(&quot;&lt;WikiContent&gt;&quot;);

        // Copy response to buffer; the reader is closed even if reading fails
        BufferedReader in = new BufferedReader(new InputStreamReader(
                urlConn.getInputStream(), sEncoding));
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
                sb.append(&quot;\n&quot;);
            }
        }
        finally {
            in.close();
        }

        // XML footer
        sb.append(&quot;&lt;/WikiContent&gt;&quot;);

        return sb.toString();
    }

    /**
     * Load article ids from a file. The file must have one id on each line;
     * surrounding whitespace and blank lines are ignored.
     *
     * @param sFileName Path of the article id file
     * @return Article ids, in file order
     * @throws Exception if the file cannot be read or a non-blank line is
     *         not a valid integer
     */
    public static List&lt;Integer&gt; loadArticleIdsFromFile(String sFileName)
            throws Exception
    {
        List&lt;Integer&gt; articleIds = new ArrayList&lt;Integer&gt;();

        BufferedReader buffReader = new BufferedReader(new FileReader(sFileName));
        try {
            String sLine;
            while ((sLine = buffReader.readLine()) != null) {
                sLine = sLine.trim();

                // A trailing newline or an empty line is not an error
                if (sLine.length() == 0) {
                    continue;
                }
                articleIds.add(Integer.valueOf(sLine));
            }
        }
        finally {
            buffReader.close();
        }

        return articleIds;
    }

    /**
     * Display usage
     */
    public static void displayUsage()
    {
        System.out.println(&quot;Arguments:&quot;);
        System.out.println(&quot;&lt;Article id source file path&gt; &lt;Output directory&gt;&quot;);
        System.out.println(&quot;&quot;);
        System.out.println(&quot;Notes:&quot;);
        System.out.println(&quot;Article id file must have an id on each line. &quot;);
        System.out.println(&quot;Output directory must exist.&quot;);
    }

    /**
     * Text cleanup. Numeric references such as [12] or [3] are deleted.
     *
     * @param text Text to clean
     * @return Cleaned text
     */
    public static String cleanup(String text)
    {
        return text.replaceAll(&quot;\\[[0-9]+\\]&quot;, &quot;&quot;);
    }

    /**
     * Fetch one article, extract the text of its top-level paragraphs and
     * append them to the writer, each followed by a blank line.
     * A paragraph that cannot be read or written is logged and skipped.
     *
     * @param articleId Wikipedia article id
     * @param writer Open writer of the current output file
     * @return Number of chars written, counting 2 per paragraph separator
     * @throws Exception if the article cannot be downloaded or parsed
     */
    private static int writeArticle(Integer articleId, OutputStreamWriter writer)
            throws Exception
    {
        // Build article url and get article content
        String sWikiUrl = String.format(WIKI_URL_FORMAT, articleId);
        String sWiki = dumpWikiXhtmlToString(sWikiUrl, &quot;UTF-8&quot;);

        // Load Xhtml content
        InputSource is = new InputSource(new StringReader(sWiki));
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse(is);

        // writer.write(&quot;========= &quot; + articleId + &quot; ==========\n&quot;);

        // Target desired wikipedia content: paragraphs directly under the root
        NodeList nl = XPathProcessor.getInstance().EvaluateMutli(
                &quot;/WikiContent/p&quot;, doc);

        int written = 0;
        for (int i = 0; i &lt; nl.getLength(); i++) {
            try {
                Node node = nl.item(i);
                String sContent = cleanup(node.getTextContent());

                // Write in output file
                writer.write(sContent);
                writer.write(&quot;\n\n&quot;);
                writer.flush();

                // Increase output file size count
                written += sContent.length() + 2;
            }
            catch (Exception e2) {
                String sLog = &quot;Paragraph not get &quot; + i
                        + &quot; for id=&quot; + articleId;
                System.out.println(sLog);
            }
        }
        return written;
    }

    /**
     * Main entry. Reads article ids from the file given as first argument
     * and writes the cleaned article text into UTF-16 files in the directory
     * given as second argument. A new output file, named after the id of the
     * first article it contains, is started whenever the current one has
     * grown past the size limit. An article that cannot be fetched is logged
     * and skipped.
     *
     * @param args Arguments: article id file path, output directory
     */
    public static void main(String[] args)
    {
        if (args.length != 2) {
            displayUsage();
            return;
        }

        OutputStreamWriter writer = null;

        try {
            // Get article ids from file
            List&lt;Integer&gt; articleIds = loadArticleIdsFromFile(args[0]);

            // Output dir
            String sDestinationDir = args[1];

            // Current output file size, set to MAX + 1 so that the file is
            // created for the first article
            int currentFileSize = FILE_SIZE_LIMIT + 1;

            for (Integer articleId : articleIds) {
                try {
                    if (currentFileSize &gt; FILE_SIZE_LIMIT) {
                        if (writer != null) {
                            writer.close();
                            writer = null;
                        }

                        // File(parent, child) joins with the platform
                        // separator instead of a hard-coded backslash.
                        java.io.File outFile = new java.io.File(
                                sDestinationDir, articleId + &quot;.txt&quot;);
                        writer = new OutputStreamWriter(
                                new FileOutputStream(outFile), &quot;UTF-16&quot;);

                        currentFileSize = 0;
                    }

                    currentFileSize += writeArticle(articleId, writer);
                }
                catch (Exception e3) {
                    String sLog = &quot;Article not get for id = &quot; + articleId
                            + &quot;\n&quot;;
                    System.out.println(sLog);

                    e3.printStackTrace();
                }
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            // The writer is still null when no article was processed
            if (writer != null) {
                try {
                    writer.close();
                }
                catch (Exception eClose) {
                    eClose.printStackTrace();
                }
            }
        }
    }

}
</pre>
<p>To compile the source, you will need an XPath query manager, available at <a href="java-xpath-cache-query/" target="_blank">this link</a>.</p>
<p>A compiled version of WikiDump can be downloaded <a href="http://www.devbypractice.com/wp-content/uploads/2010/06/WikiDump1.zip">here</a>.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.devbypractice.com/wikidump-tool-to-get-wikipedia-content/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

