Skip to content

Instantly share code, notes, and snippets.

@christianchristensen
Created October 8, 2012 01:10
Show Gist options
  • Save christianchristensen/3850219 to your computer and use it in GitHub Desktop.
Save christianchristensen/3850219 to your computer and use it in GitHub Desktop.
Create an index output from Wikipedia_stuff.pdf
#!/usr/bin/env php
<?php
/**
 * Compact an index: read "word - page" lines from STDIN and write one
 * "word - page1,page2,..." line per distinct word to STDOUT.
 *
 * Words are written in the order they are first seen on input; the pages of
 * each word are de-duplicated and listed in natural (numeric-aware) order.
 */

/**
 * Split one input line into its word and its page.
 *
 * The split is made at the LAST " - " so that words which themselves contain
 * hyphens (e.g. "well-known - 12") keep their full spelling. Lines written
 * without spaces around the separator ("word-12") are still accepted by
 * falling back to the last bare "-".
 *
 * @param string $line Raw input line, with or without a trailing newline.
 * @return array|null  array($word, $page), or null when the line has no
 *                     separator or when the word or the page is empty.
 */
function compactindex_parse_line($line)
{
    $line = rtrim($line, "\r\n");

    $sepLength = 3;
    $sep = strrpos($line, " - ");
    if ($sep === false) {
        $sepLength = 1;
        $sep = strrpos($line, "-");
    }
    if ($sep === false) {
        return null;
    }

    $word = trim(substr($line, 0, $sep));
    $page = trim(substr($line, $sep + $sepLength));
    if ($word === '' || $page === '') {
        return null;
    }

    return array($word, $page);
}

$words = array();
// Compare against false explicitly: a final line holding just "0" is falsy
// and would otherwise end the loop early.
while (($line = fgets(STDIN)) !== false) {
    $entry = compactindex_parse_line($line);
    if ($entry === null) {
        continue;
    }
    list($word, $page) = $entry;
    // Keyed by page so that a repeated word/page pair is stored only once.
    $words[$word][$page] = $page;
}

foreach ($words as $word => $pages) {
    // Natural order gives 2,10,100 instead of the textual 10,100,2.
    natsort($pages);
    fwrite(STDOUT, $word . " - " . implode(",", $pages) . "\n");
}
#!/usr/bin/env php
<?php
/**
 * Emit one "word - page" line on STDOUT for every word read from STDIN.
 *
 * Usage: pdfindex.php -p PAGE < text
 * PAGE is written verbatim after each word; "--" is used when -p is absent
 * or empty.
 */

/**
 * Extract the normalized words of one line of text.
 *
 * The line is split on any run of whitespace, each token is lower-cased and
 * stripped of leading/trailing characters from $trimchars, and tokens that
 * end up empty are dropped.
 *
 * @param string $line      One line of input text.
 * @param string $trimchars Characters to strip from both ends of each word.
 * @return array            List of normalized, non-empty words in input order.
 */
function pdfindex_words($line, $trimchars)
{
    $found = array();
    // \s covers tabs and form feeds as well as spaces; pdftotext is documented
    // to insert form feeds at page breaks, and a plain explode(" ") would
    // leave them glued to a word.
    foreach (preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY) as $token) {
        $word = trim(strtolower($token), $trimchars);
        if ($word !== '') {
            $found[] = $word;
        }
    }
    return $found;
}

$args = getopt('p:');
// isset + !== '' rather than !empty(): "-p 0" must not fall back to "--".
$page = (isset($args['p']) && $args['p'] !== '') ? $args['p'] : '--';
$trimchars = ",().";

// Compare against false explicitly: a final line holding just "0" is falsy
// and would otherwise end the loop early.
while (($line = fgets(STDIN)) !== false) {
    foreach (pdfindex_words($line, $trimchars) as $word) {
        fwrite(STDOUT, "{$word} - {$page}\n");
    }
}
# Build index.txt: a "word - page,page,..." index of a PDF.
#
# Usage: sh <this script> [PDF] [PAGES]
#   PDF    input file               (default: Wikipedia_stuff.pdf)
#   PAGES  number of pages to index (default: 334)
#
# Requires pdftotext on PATH, plus ./pdfindex.php and ./compactindex.php in
# the current directory.

# Stop on the first failing command or unset variable, so a failed pdftotext
# cannot silently re-index the previous page's text.
set -eu

pdf=${1:-Wikipedia_stuff.pdf}
pages=${2:-334}

# Scratch files live in a private directory that is removed on every exit
# path, so an aborted run leaves no stale output behind for the next one.
workdir=$(mktemp -d)
trap 'rm -rf "$workdir"' EXIT

page_text="$workdir/page.txt"
raw_index="$workdir/output.txt"
: > "$raw_index"

i=1
while [ "$i" -le "$pages" ]; do
    # Convert one page at a time so each word can be tagged with its page.
    pdftotext -f "$i" -l "$i" "$pdf" "$page_text"
    ./pdfindex.php -p "$i" < "$page_text" >> "$raw_index"
    i=$((i + 1))
done

# Sorting groups identical "word - page" lines so uniq can drop repeats
# before the per-word compaction.
sort "$raw_index" | uniq | ./compactindex.php > index.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment