The WikiDump tool retrieves Wikipedia content for a list of article ids. It connects to http://en.wikipedia.org to fetch the text data.
For more information about getting Wikipedia content, see this post.
Here is the source code:
package com.devbypractice;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
public class WikiDump
{
    /** Timeout applied both to connecting and to reading the response, in milliseconds. */
    private static final int TIMEOUT_MS = 30000;

    /** Matches bracketed numeric reference markers such as [12] or [3]. */
    private static final Pattern REFERENCE_MARKER = Pattern.compile("\\[[0-9]+\\]");

    /**
     * Download the content found at a URL and wrap it in a single
     * {@code <WikiContent>} root element, preceded by an XML declaration,
     * so that the result can be handed to an XML parser.
     *
     * @param sUrlSource URL to download
     * @param sEncoding  character encoding used to decode the response
     * @return the wrapped content; every response line is terminated by '\n'
     * @throws Exception if the URL is malformed, the encoding is unsupported,
     *                   or the connection/read fails or times out
     */
    public static String dumpWikiXhtmlToString(String sUrlSource, String sEncoding)
            throws Exception
    {
        URL url = new URL(sUrlSource);
        URLConnection urlConn = url.openConnection();

        // Bound both phases of the request: without a read timeout a server
        // that accepts the connection but never answers blocks forever.
        urlConn.setConnectTimeout(TIMEOUT_MS);
        urlConn.setReadTimeout(TIMEOUT_MS);

        StringBuilder sb = new StringBuilder();

        // XML header
        sb.append("<?xml version=\"1.0\"?>");
        sb.append("<WikiContent>");

        // Copy response to buffer; the stream is closed even when decoding
        // or reading fails.
        InputStream rawStream = urlConn.getInputStream();
        try {
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(rawStream, sEncoding));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
                sb.append("\n");
            }
        }
        finally {
            rawStream.close();
        }

        // XML footer
        sb.append("</WikiContent>");
        return sb.toString();
    }

    /**
     * Load article ids from a text file holding one id per line.
     * Surrounding whitespace is ignored and blank lines are skipped.
     *
     * @param sFileName path of the file to read
     * @return the article ids, in file order
     * @throws Exception if the file cannot be read, or a non-blank line is
     *                   not a valid integer ({@link NumberFormatException})
     */
    public static List<Integer> loadArticleIdsFromFile(String sFileName)
            throws Exception
    {
        List<Integer> articleIds = new ArrayList<Integer>();
        BufferedReader buffReader = new BufferedReader(new FileReader(sFileName));
        try {
            String sLine;
            while ((sLine = buffReader.readLine()) != null) {
                String sId = sLine.trim();
                if (sId.length() == 0) {
                    // Tolerate empty lines (typically a trailing newline).
                    continue;
                }
                articleIds.add(Integer.valueOf(sId));
            }
        }
        finally {
            buffReader.close();
        }
        return articleIds;
    }

    /**
     * Print the command-line usage to standard output.
     */
    public static void displayUsage()
    {
        System.out.println("Arguments:");
        System.out.println("<Article id source file path> <Output directory>");
        System.out.println("");
        System.out.println("Notes:");
        System.out.println("Article id file must have an id on each line. ");
        System.out.println("Output directory must exist.");
    }

    /**
     * Text cleanup. Numeric references such as [12] or [3] are deleted.
     *
     * @param text Text to clean
     * @return Cleaned text
     */
    public static String cleanup(String text)
    {
        return REFERENCE_MARKER.matcher(text).replaceAll("");
    }

    /**
     * Download one article, select its paragraphs and append their cleaned
     * text to the writer, each paragraph followed by an empty line.
     * A paragraph that cannot be written is reported and skipped.
     *
     * @param articleId article id, used in log messages
     * @param sWikiUrl  URL the article is downloaded from
     * @param writer    destination of the extracted text
     * @return number of chars written (paragraph text plus two newlines each)
     * @throws Exception if the article cannot be downloaded, parsed or queried
     */
    private static int appendArticle(Integer articleId, String sWikiUrl,
            OutputStreamWriter writer) throws Exception
    {
        // Get article content
        String sWiki = dumpWikiXhtmlToString(sWikiUrl, "UTF-8");

        // Load Xhtml content
        InputSource is = new InputSource(new StringReader(sWiki));
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse(is);

        // Target desired wikipedia content.
        // NOTE(review): XPathProcessor is defined outside this file; it is
        // presumed to evaluate the XPath expression against the document and
        // return the matching nodes -- confirm against its source.
        NodeList nl = XPathProcessor.getInstance().EvaluateMutli(
                "/WikiContent/p", doc);

        int written = 0;
        for (int i = 0; i < nl.getLength(); i++) {
            try {
                Node node = nl.item(i);
                String sContent = cleanup(node.getTextContent());

                // Write in output file
                writer.write(sContent);
                writer.write("\n\n");
                writer.flush();

                // Only count what was actually written
                written += sContent.length() + 2;
            }
            catch (Exception e2) {
                // Best effort: report the paragraph and carry on with the next
                String sLog = "Paragraph not get " + i
                        + " for id=" + articleId;
                System.out.println(sLog);
                e2.printStackTrace();
            }
        }
        return written;
    }

    /**
     * Main entry. Expects two arguments: the article id file and an existing
     * output directory. Article texts are appended to an output file named
     * after the first article it contains; a new file is started once the
     * current one exceeds the size limit.
     *
     * @param args Arguments
     */
    public static void main(String[] args)
    {
        if (args.length != 2) {
            displayUsage();
            return;
        }

        // UTF-16 chars count for output files
        // 1000000 chars ~= 2 MB
        final int fileSizeLimit = 1000000;

        // Url to get online Wikipedia article by id.
        // HTTPS is used because Wikipedia answers plain-http requests with a
        // redirect to https, which URLConnection does not follow.
        final String sWikiUrlFormat = "https://en.wikipedia.org/w/index.php?action=render&curid=%d";

        OutputStreamWriter writer = null;
        try {
            // Get article ids from file
            List<Integer> articleIds = loadArticleIdsFromFile(args[0]);

            // Output dir
            File destinationDir = new File(args[1]);

            // Current output file size,
            // set to MAX + 1 for the file to be created at first time
            int currentFileSize = fileSizeLimit + 1;

            for (Integer articleId : articleIds) {
                try {
                    if (currentFileSize > fileSizeLimit) {
                        if (writer != null) {
                            // Forget the writer before closing it so a failed
                            // close never leaves a stale, closed writer in use.
                            OutputStreamWriter previous = writer;
                            writer = null;
                            previous.close();
                        }
                        // File(parent, child) joins with the platform separator
                        File outFile = new File(destinationDir, articleId + ".txt");
                        writer = new OutputStreamWriter(
                                new FileOutputStream(outFile), "UTF-16");
                        currentFileSize = 0;
                    }

                    // Build article url and append the article to the output
                    String sWikiUrl = String.format(sWikiUrlFormat, articleId);
                    currentFileSize += appendArticle(articleId, sWikiUrl, writer);
                }
                catch (Exception e3) {
                    String sLog = "Article not get for id = " + articleId
                            + "\n";
                    System.out.println(sLog);
                    e3.printStackTrace();
                }
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            // writer is null when no output file was ever opened
            // (e.g. an empty id file), so guard the final close.
            if (writer != null) {
                try {
                    writer.close();
                }
                catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
To compile the source, you will need an XPath query manager, available at this link.
A compiled version of WikiDump can be downloaded here.