Created
March 28, 2013 20:02
-
-
Save dagolden/5266334 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use 5.010; | |
| use strict; | |
| use warnings; | |
| use DDS; | |
| use DateTime; | |
| use JSON::XS; | |
| use File::Slurp qw/read_file write_file/; | |
| use Parse::CPAN::Authors; | |
| use Parse::CPAN::Packages; | |
| use aliased 'CPAN::DistnameInfo'; | |
| use aliased 'CPAN::Testers::Common::Article'; | |
| use aliased 'CPAN::Testers::Report'; | |
| use CPAN::Testers::Common::Utils; | |
| use ORDB::CPANUploads; | |
| BEGIN { $ENV{SCHEMA_LOADER_BACKCOMPAT}=1 } | |
| use Getopt::Lucid qw/:all/; | |
| use autodie; | |
| #--------------------------------------------------------------------------# | |
| my $opt = Getopt::Lucid->getopt([ | |
| Param('mailrc')->default("/srv/cpan/authors/01mailrc.txt.gz"), | |
| Param('packages')->default("/srv/cpan/modules/02packages.details.txt.gz"), | |
| Param('override')->default("./override.txt"), | |
| Param('backpan')->default("./backpan.txt"), | |
| Param('cache')->default("./backpan_index_cache.json"), | |
| ]); | |
| my $mailrc = $opt->get_mailrc; | |
| my $packages = $opt->get_packages; | |
| my $override_list = $opt->get_override; | |
| my $index_source = $opt->get_backpan; | |
| my $index_cache = $opt->get_cache; | |
| my $suffix_re = qr{\.(?:tar\.(bz2|gz|Z)|t(?:gz|bz)|zip)\z}; | |
| #--------------------------------------------------------------------------# | |
| say STDERR "Loading CPAN authors list..."; | |
| my $mrc = Parse::CPAN::Authors->new($mailrc); | |
| say STDERR "Loading CPAN packages list..."; | |
| my $pkg = Parse::CPAN::Packages->new($packages); | |
| say STDERR "Loading Manual Override list..."; | |
| my $overrides; | |
| for my $line ( read_file($override_list) ) { | |
| my ($distvname, $path) = split ' ', $line; | |
| $overrides->{$distvname} = $path; | |
| } | |
| my $dv_index; | |
| if ( -f $index_cache ) { | |
| say STDERR "Loading BackCPAN index..."; | |
| $dv_index = decode_json( scalar read_file($index_cache) ) ; | |
| } | |
| else { | |
| say STDERR "Indexing BackPAN releases..."; | |
| my @lines = read_file($index_source); | |
| for my $line ( @lines ) { | |
| chomp $line; | |
| next unless $line =~ m{authors/id/(.)/\1./}; | |
| next unless $line =~ $suffix_re; | |
| my $d = DistnameInfo->new($line); | |
| next unless defined $d->dist && defined $d->version; | |
| $dv_index->{$d->distvname}{$d->cpanid} = $line | |
| } | |
| write_file($index_cache, encode_json($dv_index) ); | |
| } | |
| for my $k ( sort keys %{$dv_index} ) { | |
| my @found = map { s{authors/id/./../}{}; $_ } _find_distfile($k, ''); | |
| say "$k @found" if @found; | |
| } | |
| exit; | |
| #--------------------------------------------------------------------------# | |
| # private sub so we can return to shortcut heuristics | |
| sub _find_distfile { | |
| my ($dvname) = @_; | |
| return unless my $authors = $dv_index->{$dvname}; | |
| my @authors = keys %$authors; | |
| # If only one of this dvname on backpan | |
| if ( @authors == 1 ) { | |
| return values %$authors; # just one | |
| } | |
| # If we have an explicit manual mapping, we should use it | |
| for ( $overrides->{$dvname} ) { | |
| return $_ if defined $_; | |
| } | |
| # If only one of the paths on backpan has a known author in the 01mailrc; | |
| # Examples seem to be where an owner moved distributions to a new ID | |
| # and deleted the old one, so we use the one that is current | |
| if ( 1 == (my @valid_ids = grep { length $mrc->author($_)->name } @authors) ) { | |
| return $authors->{$valid_ids[0]}; | |
| } | |
| # If we're here, we have multiple candidates, can't tell from the report, | |
| # and have multiple potentially valid authors. Now we need to start | |
| # guessing based on what we know about the distribution, the package | |
| # index files and so on. | |
| # we want to return an ordered list from most likely to least | |
| # If only candidate path is current in 02packages, then it's highly likely | |
| # that the indexed distfile is what was tested; if multiple are, it's a | |
| # toss up. | |
| my %in_pkgs = map { $_ => 1 } grep { $pkg->distribution($_) } values %$authors; | |
| if ( keys %in_pkgs ) { | |
| return (keys(%in_pkgs), grep { ! $in_pkgs{$_} } values %$authors); | |
| } | |
| # If we're here, then none of the candidate paths are currently indexed. | |
| # I could just assume the *oldest* dist, on the grounds that it was | |
| # probably the "official" one; I could try to compare distfile | |
| # timestamps with report email timestamps; I could assume *oldest* | |
| # only for non-dev dist versions and assume *most recent* for dev | |
| # dists; I could try to match to the CPAN id of the *current* maintainer | |
| # of a similarly named distribution in the index | |
| # | |
| # All of these have potential problems, but assuming it's the same | |
| # as the current maintainer is probably a best best -- it's consistent | |
| # with how CPANPLUS does things | |
| my $current_maint; | |
| for my $p ( values %$authors ) { | |
| my $info = DistnameInfo->new($p); | |
| my $latest = $pkg->latest_distribution($info->dist); | |
| next unless $latest; | |
| if ( $latest->cpanid eq $info->cpanid ) { | |
| $current_maint = $p; | |
| last; | |
| } | |
| } | |
| if ( $current_maint ) { | |
| return ($current_maint, grep { $_ ne $current_maint } values %$authors); | |
| } | |
| # The author doesn't also maintain the current distribution. Bummer. | |
| # Can we find who uploaded first? | |
| my %dates; | |
| for my $p ( values %$authors ) { | |
| my $info = DistnameInfo->new($p); | |
| my @list = ORDB::CPANUploads::Uploads->select( | |
| 'where author = ? and filename = ?', $info->cpanid, $info->filename | |
| ); | |
| warn "Got " . scalar @list . " uploads for '$p'\n" if @list != 1; | |
| next unless @list; | |
| $dates{$p} = $list[0]->released; | |
| } | |
| if ( keys %dates ) { | |
| return sort { $dates{$a} <=> $dates{$b} } keys %dates; | |
| } | |
| # wow -- we really have no clue. Give up! | |
| return; | |
| } | |
| # vim: ts=2 sts=2 sw=2: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment