Skip to content

Instantly share code, notes, and snippets.

@dagolden
Created March 28, 2013 20:02
Show Gist options
  • Select an option

  • Save dagolden/5266334 to your computer and use it in GitHub Desktop.

Select an option

Save dagolden/5266334 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use 5.010;
use strict;
use warnings;
use DDS;
use DateTime;
use JSON::XS;
use File::Slurp qw/read_file write_file/;
use Parse::CPAN::Authors;
use Parse::CPAN::Packages;
use aliased 'CPAN::DistnameInfo';
use aliased 'CPAN::Testers::Common::Article';
use aliased 'CPAN::Testers::Report';
use CPAN::Testers::Common::Utils;
use ORDB::CPANUploads;
BEGIN { $ENV{SCHEMA_LOADER_BACKCOMPAT}=1 }
use Getopt::Lucid qw/:all/;
use autodie;
#--------------------------------------------------------------------------#
my $opt = Getopt::Lucid->getopt([
Param('mailrc')->default("/srv/cpan/authors/01mailrc.txt.gz"),
Param('packages')->default("/srv/cpan/modules/02packages.details.txt.gz"),
Param('override')->default("./override.txt"),
Param('backpan')->default("./backpan.txt"),
Param('cache')->default("./backpan_index_cache.json"),
]);
my $mailrc = $opt->get_mailrc;
my $packages = $opt->get_packages;
my $override_list = $opt->get_override;
my $index_source = $opt->get_backpan;
my $index_cache = $opt->get_cache;
my $suffix_re = qr{\.(?:tar\.(bz2|gz|Z)|t(?:gz|bz)|zip)\z};
#--------------------------------------------------------------------------#
say STDERR "Loading CPAN authors list...";
my $mrc = Parse::CPAN::Authors->new($mailrc);
say STDERR "Loading CPAN packages list...";
my $pkg = Parse::CPAN::Packages->new($packages);
say STDERR "Loading Manual Override list...";
my $overrides;
for my $line ( read_file($override_list) ) {
my ($distvname, $path) = split ' ', $line;
$overrides->{$distvname} = $path;
}
my $dv_index;
if ( -f $index_cache ) {
say STDERR "Loading BackCPAN index...";
$dv_index = decode_json( scalar read_file($index_cache) ) ;
}
else {
say STDERR "Indexing BackPAN releases...";
my @lines = read_file($index_source);
for my $line ( @lines ) {
chomp $line;
next unless $line =~ m{authors/id/(.)/\1./};
next unless $line =~ $suffix_re;
my $d = DistnameInfo->new($line);
next unless defined $d->dist && defined $d->version;
$dv_index->{$d->distvname}{$d->cpanid} = $line
}
write_file($index_cache, encode_json($dv_index) );
}
for my $k ( sort keys %{$dv_index} ) {
my @found = map { s{authors/id/./../}{}; $_ } _find_distfile($k, '');
say "$k @found" if @found;
}
exit;
#--------------------------------------------------------------------------#
# private sub so we can return to shortcut heuristics
sub _find_distfile {
my ($dvname) = @_;
return unless my $authors = $dv_index->{$dvname};
my @authors = keys %$authors;
# If only one of this dvname on backpan
if ( @authors == 1 ) {
return values %$authors; # just one
}
# If we have an explicit manual mapping, we should use it
for ( $overrides->{$dvname} ) {
return $_ if defined $_;
}
# If only one of the paths on backpan has a known author in the 01mailrc;
# Examples seem to be where an owner moved distributions to a new ID
# and deleted the old one, so we use the one that is current
if ( 1 == (my @valid_ids = grep { length $mrc->author($_)->name } @authors) ) {
return $authors->{$valid_ids[0]};
}
# If we're here, we have multiple candidates, can't tell from the report,
# and have multiple potentially valid authors. Now we need to start
# guessing based on what we know about the distribution, the package
# index files and so on.
# we want to return an ordered list from most likely to least
# If only candidate path is current in 02packages, then it's highly likely
# that the indexed distfile is what was tested; if multiple are, it's a
# toss up.
my %in_pkgs = map { $_ => 1 } grep { $pkg->distribution($_) } values %$authors;
if ( keys %in_pkgs ) {
return (keys(%in_pkgs), grep { ! $in_pkgs{$_} } values %$authors);
}
# If we're here, then none of the candidate paths are currently indexed.
# I could just assume the *oldest* dist, on the grounds that it was
# probably the "official" one; I could try to compare distfile
# timestamps with report email timestamps; I could assume *oldest*
# only for non-dev dist versions and assume *most recent* for dev
# dists; I could try to match to the CPAN id of the *current* maintainer
# of a similarly named distribution in the index
#
# All of these have potential problems, but assuming it's the same
# as the current maintainer is probably a best best -- it's consistent
# with how CPANPLUS does things
my $current_maint;
for my $p ( values %$authors ) {
my $info = DistnameInfo->new($p);
my $latest = $pkg->latest_distribution($info->dist);
next unless $latest;
if ( $latest->cpanid eq $info->cpanid ) {
$current_maint = $p;
last;
}
}
if ( $current_maint ) {
return ($current_maint, grep { $_ ne $current_maint } values %$authors);
}
# The author doesn't also maintain the current distribution. Bummer.
# Can we find who uploaded first?
my %dates;
for my $p ( values %$authors ) {
my $info = DistnameInfo->new($p);
my @list = ORDB::CPANUploads::Uploads->select(
'where author = ? and filename = ?', $info->cpanid, $info->filename
);
warn "Got " . scalar @list . " uploads for '$p'\n" if @list != 1;
next unless @list;
$dates{$p} = $list[0]->released;
}
if ( keys %dates ) {
return sort { $dates{$a} <=> $dates{$b} } keys %dates;
}
# wow -- we really have no clue. Give up!
return;
}
# vim: ts=2 sts=2 sw=2:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment