Skip to content

Instantly share code, notes, and snippets.

@xdbr
Created May 20, 2016 12:32
Show Gist options
  • Save xdbr/19429160a97f28b57c7d01274e13f16f to your computer and use it in GitHub Desktop.
Save xdbr/19429160a97f28b57c7d01274e13f16f to your computer and use it in GitHub Desktop.
Extract abstracts from the PDFs referenced by BibTeX entries, add the abstracts to those entries, and write the result to STDOUT [needs pdftotext]
#!/usr/bin/env perl
use strict;
use warnings;
use feature 'say';
use IO::File;
use Data::Dumper;
use BibTeX::Parser;
# Read the BibTeX file named by the first command-line argument (default:
# "papers.bib"). For every successfully parsed entry whose "file" field ends
# in ":PDF" and points at an existing file, run pdftotext on it, collect the
# paragraphs that start with "abstract", and store them in the entry's
# "abstract" field. Every entry is printed to STDOUT, modified or not.
my $filename = $ARGV[0] // "papers.bib";
my $fh = IO::File->new($filename) or die "Cannot open $filename: $!";
my $parser = BibTeX::Parser->new($fh);

while (my $entry = $parser->next) {
    if ($entry->parse_ok) {
        my $pdf = _pdf_path_from_file_field($entry->field('file'));
        if (defined $pdf and -e $pdf) {
            my $text     = _pdf_to_text($pdf);
            my $abstract = defined $text ? _extract_abstract($text) : '';

            # Only touch the entry when something non-blank was found.
            $entry->field(abstract => $abstract) if $abstract =~ /\S/;
        }
    }
    say $entry->to_string;
}

# Turn the value of a "file" field into a plain path.
# Returns the path with one leading ":" and the trailing ":PDF" marker
# removed, or undef when the field is missing or does not end in ":PDF".
sub _pdf_path_from_file_field {
    my ($field) = @_;
    return unless defined $field and $field =~ /:PDF$/i;

    my $path = $field;
    $path =~ s/^://;
    # The suffix is matched case-insensitively above, so it must be stripped
    # case-insensitively too; otherwise "...:pdf" would keep its suffix and
    # the file would never be found.
    $path =~ s/:PDF$//i;
    return $path;
}

# Run "pdftotext <path> -" and return everything it writes to STDOUT.
# Returns undef (after a warning) when the program cannot be started.
sub _pdf_to_text {
    my ($path) = @_;

    # List-form pipe open: the path is handed to pdftotext as a single
    # argument without going through a shell, so quotes, "$" or backticks in
    # the file name can neither break nor hijack the command.
    my $pid = open(my $pipe, '-|', 'pdftotext', $path, '-');
    if (!$pid) {
        warn "Cannot run pdftotext on $path: $!\n";
        return;
    }

    my $text = do { local $/; <$pipe> };    # slurp the whole output

    # Best effort: report a failing pdftotext but still use whatever it
    # printed before exiting.
    close $pipe
        or warn "pdftotext reported a problem for $path (status $?)\n";

    return $text // '';
}

# Pick the paragraphs (separated by one blank line) that begin with
# "abstract" (any case), flatten each onto a single line, and join them with
# " + + + + + ". Returns the empty string when no paragraph matches.
sub _extract_abstract {
    my ($text) = @_;

    my @paragraphs;
    for my $paragraph (split /\n\n/, $text) {
        next unless $paragraph =~ /^abstract/i;
        $paragraph =~ tr/\n/ /;
        push @paragraphs, $paragraph;
    }
    return join " + + + + + ", @paragraphs;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment