Parse a Wikipedia dump file and search for particular articles
#!/usr/bin/env perl
use 5.012;
use warnings;
use XML::LibXML::Reader;

# Stream the dump from STDIN with the pull parser so the whole file
# never has to be held in memory.
my $reader = XML::LibXML::Reader->new(IO => \*STDIN);
my $xpc    = XML::LibXML::XPathContext->new;
$xpc->registerNs('w', 'http://www.mediawiki.org/xml/export-0.10/');

STDOUT->autoflush;
binmode $_, ':utf8' for *STDOUT, *STDERR;

while ($reader->nextElement('page')) {
    my $node = $reader->copyCurrentNode(1);

    # Skip redirect pages; only full articles are of interest.
    next if $xpc->exists('w:redirect', $node);

    my $text = $xpc->find('w:revision/w:text', $node)
                   ->to_literal_delimited(' ');

    # Tweak this pattern to match the articles you are looking for.
    next unless $text =~ /complicated search/;

    say $xpc->findvalue('w:title', $node);
}
__END__
pv enwiki-latest-pages-articles.xml.bz2 | bzcat | ./parse-wikipedia.pl > matches.txt
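
To sanity-check the XPath expressions and the export-0.10 namespace registration without downloading a full dump, the same reader can be pointed at a small inline sample. This is only an illustrative sketch; the sample XML and page titles below are invented and not taken from a real dump.

#!/usr/bin/env perl
# Quick check of the XPath/namespace handling used above, run against a
# tiny inline <mediawiki> sample instead of the real dump (invented data).
use 5.012;
use warnings;
use XML::LibXML::Reader;

my $sample = <<'XML';
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
  <page>
    <title>Example article</title>
    <revision><text>Some wikitext mentioning a complicated search.</text></revision>
  </page>
  <page>
    <title>Example redirect</title>
    <redirect title="Example article"/>
    <revision><text>#REDIRECT [[Example article]]</text></revision>
  </page>
</mediawiki>
XML

my $reader = XML::LibXML::Reader->new(string => $sample);
my $xpc    = XML::LibXML::XPathContext->new;
$xpc->registerNs('w', 'http://www.mediawiki.org/xml/export-0.10/');

while ($reader->nextElement('page')) {
    my $node = $reader->copyCurrentNode(1);
    next if $xpc->exists('w:redirect', $node);      # redirect page is skipped
    say $xpc->findvalue('w:title', $node);          # prints "Example article" only
}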