diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ac325f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/02packages.pause.txt +/02packages.metacpan.txt diff --git a/bin/qa/02packages b/bin/qa/02packages new file mode 100755 index 0000000..e7b6347 --- /dev/null +++ b/bin/qa/02packages @@ -0,0 +1,90 @@ +#!/usr/bin/env perl +use 5.020; +use utf8; +use open qw< :std :encoding(UTF-8) >; +use strict; +use warnings; +use experimental qw< postderef >; + +use MetaCPAN::Client; +use List::UtilsBy qw< uniq_by >; + +my $n = 0; +my $page_size = 5000; +my @lines; + +my $metacpan = MetaCPAN::Client->new; +my $files = $metacpan->all( + "files", + { + # This filter is based on the "find" method in + # MetaCPAN::Document::File::Set. + es_filter => { + and => [ + # The _file_ must be indexed (i.e. not an ignored file), + # authorized, and marked "latest"… + { term => { indexed => 1 } }, + { term => { authorized => 1 } }, + { term => { status => 'latest' } }, + + # …and have at least one _module_ which is also indexed and + # authorized. + { + nested => { + path => "module", + filter => { + and => [ + { term => { "module.indexed" => 1 } }, + { term => { "module.authorized" => 1 } }, + ] + } + } + }, + ], + }, + # Only about 5m is necessary on the ServerCentral internet connection, + # but it varies. Use way more time than we need to be safe. + scroller_time => '10m', + scroller_size => $page_size, + } +); + +warn "Starting scroll over \"latest\" files…\n"; + +while (my $file = $files->next) { + warn "Screened $n files\n" + if $n++ and $n % $page_size == 0; + + # This should be guaranteed by the ES query, but just in case, it doesn't + # hurt to repeat it here. + next unless $file->indexed + and $file->authorized + and $file->status eq 'latest'; + + my $archive = $file->download_url =~ s{.*/authors/id/}{}r; + my $modules = $file->module + or next; + + for my $module ($modules->@*) { + next unless $module->{indexed} and $module->{authorized}; + push @lines, join "\t", + $module->{name}, + sprintf("%.12f", $module->{version_numified} // 0), + $archive; + } +} + +# This mimics the header that PAUSE adds. +say <&2 + curl -fsSL https://cpan.metacpan.org/modules/02packages.details.txt.gz \ + | gunzip -c \ + | $bin/normalize-02packages \ + > $pause +fi + +if [[ ! -s $metacpan ]]; then + echo Generating 02packages from MetaCPAN… >&2 + $bin/02packages > 02packages.metacpan.txt +fi + +git diff --no-index --color-words $pause $metacpan diff --git a/bin/qa/normalize-02packages b/bin/qa/normalize-02packages new file mode 100755 index 0000000..679b92e --- /dev/null +++ b/bin/qa/normalize-02packages @@ -0,0 +1,37 @@ +#!/usr/bin/env perl +use 5.020; +use strict; +use warnings; +use utf8; +use open qw< :std :encoding(UTF-8) >; +use version; + +while (<>) { + # Skip the header + print, next if 1 .. 9; + + # Transform the data lines + my ($module, $version, $archive) = split ' ', $_, 3; + print join "\t", + $module, + numify_version($version), + $archive; +} + +sub numify_version { + my $version = shift; + + # undef → 0 + $version = 0 if $version eq "undef"; + + # Strip underscores, so version->parse doesn't barf. These generally don't + # make it into 02packages since underscores indicate a trial release, but + # they can when the release itself is not a trial release but the module + # version contains an underscore. + $version =~ s/_//g; + + # numify + $version = version->parse($version)->numify; + + return sprintf "%.12f", $version; +}