Created
June 8, 2015 23:08
-
-
Save yokawasa/f1cb68cd168f50dbf873 to your computer and use it in GitHub Desktop.
Generating JSON Data for Azure Search from Wikipedia Database Dump File
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use XML::Twig; | |
use Getopt::Std; | |
use Encode; | |
use JSON; | |
# Number of items collected before they are written out as one JSON file.
my $UPLOAD_THRESHOLD = 100;

# Command-line options: -c <inputxml> -o <outputpath>.
my %opts = ();
getopts('c:o:', \%opts) or die "Wrong Options!\n";
my $inputfile  = $opts{'c'};
my $outputpath = $opts{'o'};

# An omitted option leaves its value undef; test definedness first so the
# string comparison does not emit an "uninitialized value" warning under -w.
usage()
    if !defined($inputfile)
    || $inputfile eq ''
    || !defined($outputpath)
    || $outputpath eq '';

# State shared with the doc() handler and flush2json():
#   $tmpcounter  - items collected since the last flush
#   $itemcounter - total items seen so far (used as the item id)
#   $filecounter - index of the next output file (items-N.json)
#   $itemsarr    - array ref holding the items of the current batch
my ($tmpcounter, $itemcounter, $filecounter) = (0, 0, 0);
my $itemsarr = [];

# Direct method call instead of the indirect-object form "new XML::Twig(...)".
my $twig = XML::Twig->new(
    twig_handlers => { doc => \&doc },
);
$twig->parsefile($inputfile);

# Write out whatever is left in the final, partially filled batch.
if ($tmpcounter) {
    flush2json($itemsarr, "items-$filecounter.json");
}
# Twig handler called for each <doc> element.
#
# Builds one item (itemid, title, abstract, url) from the element's children,
# appends it to the current batch in $itemsarr, and writes the batch to
# "items-<filecounter>.json" via flush2json() once $UPLOAD_THRESHOLD items
# have accumulated. The twig is purged after every element.
#
# Arguments:
#   $twig - the twig object passed to the handler
#   $doc  - the <doc> element being handled
sub doc {
    my ($twig, $doc) = @_;

    # first_child_text() yields '' when the child is absent, instead of
    # dying on a method call against undef. All three text fields go through
    # utf8string() so the JSON encoder never sees a mix of character strings
    # and byte strings (the url was previously passed through unencoded).
    my $item = {
        itemid   => "$itemcounter",
        title    => utf8string($doc->first_child_text('title')),
        abstract => utf8string($doc->first_child_text('abstract')),
        url      => utf8string($doc->first_child_text('url')),
    };

    # Explicit dereference: push on a scalar reference is a compile-time
    # error on Perl 5.24 and later.
    push @{$itemsarr}, $item;
    $tmpcounter++;
    $itemcounter++;

    if ($tmpcounter % $UPLOAD_THRESHOLD == 0) {
        flush2json($itemsarr, "items-$filecounter.json");
        $tmpcounter = 0;
        $filecounter++;
        $itemsarr = [];
    }

    # Discard the elements parsed so far.
    $twig->purge;
}
# Serialise a batch of items as JSON and write it to a file under $outputpath.
#
# The output document has the form { "value": [ ...items... ] }.
#
# Arguments:
#   $iarr - array ref of items to write
#   $f    - file name (relative to $outputpath) to create or overwrite
#
# Returns 1 on success; dies if the file cannot be opened, written or closed.
sub flush2json {
    my ($iarr, $f) = @_;

    my $outarr = { value => $iarr };
    my $js     = JSON->new()->encode($outarr);

    # Three-argument open with a lexical handle: the path is never parsed
    # for a mode prefix, and the handle cannot clash with other barewords.
    my $path = "$outputpath/$f";
    open(my $fh, '>', $path) or die "can't open: $path : $!\n";
    print {$fh} $js or die "can't write: $path : $!\n";

    # Buffered write errors only surface at close, so check it as well.
    close($fh) or die "can't close: $path : $!\n";

    return 1;
}
# Print the command-line synopsis to STDERR and terminate with exit status 1.
sub usage {
    my $synopsis = "Usage: $0 -c <inputxml> -o <outputpath>\n";
    print {*STDERR} $synopsis;
    exit 1;
}
# Return the argument as UTF-8 encoded bytes.
#
# A string that carries Perl's internal UTF-8 flag is encoded with
# Encode::encode; any other value is handed back unchanged.
sub utf8string {
    my ($text) = @_;
    return utf8::is_utf8($text) ? encode('utf-8', $text) : $text;
}
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment