Last active
August 29, 2015 14:08
-
-
Save ClickerMonkey/f481b31c8898b55ff4f9 to your computer and use it in GitHub Desktop.
Gathers information about all articles on Wikipedia and builds a graph of all related articles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Streams a MediaWiki XML dump through a SAX parser and writes one line per
 * main-namespace ({@code ns == 0}) page to a text file.
 *
 * <p>Each output line has the form
 * {@code id>title>relatedCount>related1>related2...}, where the related entries
 * are the targets of the {@code [[...]]} links found in the page's wikitext.
 *
 * <p>Usage: {@code java Wikipedia [input-xml [output-file]]}. Both arguments
 * are optional and fall back to the built-in defaults.
 */
public class Wikipedia
{

    /** One page of the dump together with the titles it links to. */
    public static class Article implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Value of the {@code <id>} element that is a direct child of {@code <page>}. */
        public int id;

        /** Value of the page's {@code <title>} element. */
        public String title;

        /** Link targets extracted from the page's wikitext. */
        public Set<String> related = new HashSet<String>();
    }

    /** Element names the handler cares about; everything else is {@code unknown}. */
    private static enum NodeType {
        mediawiki, page, ns, id, revision, title, text, unknown
    }

    /** Input used when no first argument is given. */
    private static final String DEFAULT_INPUT =
            "C:\\Users\\pdiffenderfer\\Downloads\\enwiki-20140304-pages-articles-multistream.xml";

    /** Output used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "./wikipedia.txt";

    /** Number of buffered articles that triggers a write to the output file. */
    private static final int BATCH_SIZE = 10000;

    /** Maps an element name to its {@link NodeType}; unlisted names are treated as unknown. */
    private static final Map<String, NodeType> nodeMap = new HashMap<String, NodeType>();

    static {
        for (NodeType type : NodeType.values()) {
            if (type != NodeType.unknown) {
                nodeMap.put(type.name(), type);
            }
        }
    }

    /**
     * Parses the dump and writes the article graph.
     *
     * @param args optional {@code [input, output]}; the input is handed to
     *             {@code SAXParser.parse(String, DefaultHandler)} and is therefore a URI
     *             or a path the parser can resolve, the output is a file name
     * @throws Exception if the parser cannot be created, the input cannot be parsed,
     *                   or the output cannot be written
     */
    public static void main(String[] args) throws Exception
    {
        final String location = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        final String out = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // Create the parser before opening the output so a parser failure leaks nothing.
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser saxParser = factory.newSAXParser();

        // Expected layout:
        // mediawiki>page>
        //   ns=0
        //   title
        //   id
        //   revision>text
        final Stack<NodeType> stack = new Stack<NodeType>();
        final Map<String, Article> articles = new HashMap<String, Article>();
        final AtomicLong read = new AtomicLong();

        // Explicit UTF-8: the platform default charset would turn unmappable title
        // characters into '?'.
        final PrintStream stream = new PrintStream( out, "UTF-8" );

        try {
            DefaultHandler handler = new DefaultHandler() {

                /** Page currently being read, or null when outside any {@code <page>}. */
                private Article current;

                /** Wikitext of the current page. */
                private final StringBuilder text = new StringBuilder();

                /** Character data of the id/ns/title element currently open. */
                private final StringBuilder value = new StringBuilder();

                /** Namespace of the current page. */
                private int ns;

                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                    NodeType type = nodeMap.get( qName );
                    if (type == null) {
                        type = NodeType.unknown;
                    }
                    stack.push( type );

                    if (type == NodeType.page) {
                        // Start every page from a clean slate so nothing leaks over from
                        // the previous page (including pages that were skipped).
                        current = new Article();
                        text.setLength( 0 );
                        ns = 0;
                    } else if (type == NodeType.id || type == NodeType.ns || type == NodeType.title) {
                        value.setLength( 0 );
                    }
                }

                @Override
                public void characters(char ch[], int start, int length) throws SAXException {
                    if (current == null || stack.isEmpty()) {
                        return;
                    }
                    // The parser may split one element's text over several calls, so the
                    // data is accumulated here and only interpreted in endElement.
                    switch (stack.peek()) {
                        case id:
                        case ns:
                        case title:
                            value.append( ch, start, length );
                            break;
                        case text:
                            text.append( ch, start, length );
                            break;
                        default:
                            break;
                    }
                }

                @Override
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    NodeType popped = stack.pop();
                    if (current == null) {
                        return;
                    }
                    // After the pop the top of the stack is the parent element. Only direct
                    // children of <page> count; <id> also occurs deeper (e.g. in <revision>).
                    boolean pageChild = !stack.isEmpty() && stack.peek() == NodeType.page;

                    switch (popped) {
                        case title:
                            if (pageChild) {
                                current.title = value.toString();
                            }
                            break;
                        case id:
                            if (pageChild) {
                                current.id = Integer.parseInt( value.toString().trim() );
                            }
                            break;
                        case ns:
                            if (pageChild) {
                                ns = Integer.parseInt( value.toString().trim() );
                            }
                            break;
                        case page:
                            finishPage();
                            break;
                        default:
                            break;
                    }
                }

                /** Records the completed page if it is in namespace 0, then resets page state. */
                private void finishPage() {
                    if (ns == 0) {
                        // Links must be extracted before the batch is flushed, otherwise
                        // the article that completes a batch is written without them.
                        extractLinks( text, current.related );
                        articles.put( current.title, current );
                        read.incrementAndGet();

                        if (articles.size() >= BATCH_SIZE) {
                            flush( stream, articles );
                            System.out.println("Articles Read: " + read.get());
                        }
                    }
                    text.setLength( 0 );
                    current = null;
                }
            };

            saxParser.parse(location, handler);
            flush( stream, articles );

            // PrintStream never throws on write failures; surface them explicitly.
            if (stream.checkError()) {
                throw new IllegalStateException( "Failed writing to " + out );
            }
        } finally {
            stream.close();
        }

        System.out.println("Articles Read: " + read.get());
    }

    /** Writes every buffered article to {@code out} and empties the buffer. */
    private static void flush(PrintStream out, Map<String, Article> articles)
    {
        for (Article a : articles.values()) {
            print( out, a );
        }
        articles.clear();
    }

    /**
     * Adds the target of every {@code [[...]]} link in {@code text} to {@code related}.
     *
     * <p>Links whose content contains {@code ':'}, {@code '#'} or <code>'{'</code>, or
     * starts with {@code '/'}, are skipped. For {@code [[target|label]]} only the target
     * is kept. A {@code [[} without a closing {@code ]]} ends the scan and is not recorded.
     */
    private static void extractLinks(StringBuilder text, Set<String> related)
    {
        int start = text.indexOf( "[[" );
        while (start != -1) {
            int from = start + 2;
            int end = consumeUntil( text, "]]", from );
            if (end == text.length()) {
                // No closing brackets: not a link, and nothing after it can close either.
                break;
            }
            String r = text.substring( from, end );
            if (r.indexOf( ':' ) == -1 && r.indexOf( '#' ) == -1 && r.indexOf( "{" ) == -1 && r.indexOf( '/' ) != 0) {
                int pipe = r.indexOf( '|' );
                if (pipe != -1) {
                    r = r.substring( 0, pipe );
                }
                related.add( r );
            }
            start = text.indexOf( "[[", end );
        }
    }

    /** Writes one article as {@code id>title>count} followed by {@code >target} per link. */
    private static void print(PrintStream out, Article a)
    {
        out.format( "%d>%s>%d", a.id, a.title, a.related.size() );
        for (String x : a.related) {
            out.print( '>' );
            out.print( x );
        }
        out.println();
    }

    /**
     * Finds the first occurrence of {@code needle} in {@code haystack} at or after
     * {@code start}.
     *
     * @return the index of the match, or {@code haystack.length()} when there is none
     */
    private static int consumeUntil(StringBuilder haystack, String needle, int start)
    {
        int found = haystack.indexOf( needle, start );
        return found == -1 ? haystack.length() : found;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment