Skip to content

Instantly share code, notes, and snippets.

@yoku0825
Last active August 29, 2015 14:15
Show Gist options
  • Save yoku0825/7ee787766999ad5c42f9 to your computer and use it in GitHub Desktop.
Save yoku0825/7ee787766999ad5c42f9 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use XML::Simple;
use DBI;
# Import a Wikipedia XML dump into the MySQL table wikipedia.articles.
#
# The dump is read line by line. Every <page>...</page> element is collected
# into a buffer, parsed on its own with XML::Simple, and its title, text and
# revision timestamp are inserted as one row. Only one page is held in memory
# at a time.
my $dump_file = "./jawiki-latest-pages-articles.xml";

my $xml = XML::Simple->new();

# Three-argument open with an explicit check, so a missing dump is reported
# immediately instead of silently producing an empty import. No I/O layer is
# added: the parser receives the same input as before.
open(my $in, "<", $dump_file)
    or die "Cannot open $dump_file: $!\n";

# Fail with the driver's own message when the connection cannot be made,
# rather than with a method call on an undefined handle further down.
my $conn = DBI->connect(
    "dbi:mysql:wikipedia;mysql_socket=/tmp/mysql.sock",
    "root", "", { mysql_enable_utf8 => 1 },
) or die "Cannot connect to MySQL: $DBI::errstr\n";

# An empty sql_mode switches off all server SQL modes for this session.
# NOTE(review): presumably needed so the dump's timestamp strings are
# accepted by the DATETIME column - confirm before removing.
$conn->do("SET sql_mode= ''");

# The statement text never changes, so prepare it once outside the loop.
my $insert = $conn->prepare(
    "INSERT INTO articles SET title = ?, " .
    " content = ?, " .
    " timestamp = ?"
) or die "Cannot prepare INSERT: " . $conn->errstr . "\n";

my $buff    = "";
my $in_page = 0;    # true while we are between <page> and </page>

while (my $row = <$in>) {
    if ($row =~ m{<page>}) {
        $buff    = "";
        $in_page = 1;
    }

    # Lines outside a page (dump header, closing root tag) are never parsed,
    # so there is no reason to accumulate them.
    next unless $in_page;

    $buff .= $row;
    next unless $row =~ m{</page>};
    $in_page = 0;

    my $ret      = $xml->XMLin($buff);
    my $revision = $ret->{revision};
    my $text     = $revision->{text};

    # XML::Simple returns a hash with a "content" key when <text> carries
    # attributes, but a plain string when it has none; dereferencing the
    # string form would die under "use strict".
    my $content = ref($text) eq 'HASH' ? $text->{content} : $text;

    # Store the article on a single line: real newlines become the two
    # characters backslash + n. An empty <text/> yields undef, which is
    # inserted as NULL without triggering an "uninitialized" warning.
    $content =~ s/\n/\\n/g if defined $content;

    # Insert failures are reported by DBI's default PrintError and do not
    # abort the import, matching the previous per-row behavior.
    $insert->execute($ret->{title}, $content, $revision->{timestamp});
}

close($in) or warn "Error while closing $dump_file: $!\n";
$conn->disconnect;
-- Schema for the Wikipedia import.
--
-- The character set is declared explicitly instead of relying on the server
-- default, so that non-Latin article text (this import targets the Japanese
-- Wikipedia dump) is stored intact whatever the server configuration is.
CREATE DATABASE wikipedia DEFAULT CHARACTER SET utf8mb4;

-- One row per <page> element of the dump.
--   seq       : auto-generated row number (SERIAL)
--   title     : page title
--   content   : revision text, newlines stored as a literal backslash + n
--   timestamp : revision timestamp taken from the dump
CREATE TABLE wikipedia.articles (
    seq       SERIAL,
    title     TEXT,
    content   LONGTEXT,
    timestamp DATETIME
) DEFAULT CHARACTER SET utf8mb4;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment