From 4532e5ec443431496885481f86131a3a38c2978b Mon Sep 17 00:00:00 2001
From: Thomas Sibley
Date: Thu, 16 Nov 2017 15:05:55 -0500
Subject: [PATCH 1/2] QA tools for comparison of packages in MetaCPAN and PAUSE

bin/qa/02packages outputs text in the style of PAUSE's
02packages.details.txt, but describes MetaCPAN's view of CPAN. A key
difference is that versions are _always_ numified and formatted as
decimals with 12 digits of precision (4 repeats of 3 digit fixed-width
values).

bin/qa/normalize-02packages transforms PAUSE's 02packages in order to
ease direct textual diffing of it with the output of bin/qa/02packages
above. Notably, this numifies version numbers in the same consistent
way, as PAUSE's indexing has changed over the years leading to a mix of
version formats in the file.

I'm using these tools to help identify classes of data mismatches
between MetaCPAN and PAUSE, which is useful for knowing what we have and
what we can rely on.
---
NOTE(review): this copy of the patch looks incomplete and should not be
applied as-is. In bin/qa/02packages the text jumps from "say <" directly to
";" and "use version;", the hunk header announces 90 added lines, and the
diffstat lists bin/qa/normalize-02packages, yet no "diff --git" header for
that file is present. The lost text is not reconstructed here; recover it
from the original commit. The From: header also carries no address.

 bin/qa/02packages           | 90 +++++++++++++++++++++++++++++++++++++
 bin/qa/normalize-02packages | 37 +++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100755 bin/qa/02packages
 create mode 100755 bin/qa/normalize-02packages

diff --git a/bin/qa/02packages b/bin/qa/02packages
new file mode 100755
index 0000000..e7b6347
--- /dev/null
+++ b/bin/qa/02packages
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+use 5.020;
+use utf8;
+use open qw< :std :encoding(UTF-8) >;
+use strict;
+use warnings;
+use experimental qw< postderef >;
+
+use MetaCPAN::Client;
+use List::UtilsBy qw< uniq_by >;
+
+my $n = 0;
+my $page_size = 5000;
+my @lines;
+
+my $metacpan = MetaCPAN::Client->new;
+my $files = $metacpan->all(
+    "files",
+    {
+        # This filter is based on the "find" method in
+        # MetaCPAN::Document::File::Set.
+        es_filter => {
+            and => [
+                # The _file_ must be indexed (i.e. not an ignored file),
+                # authorized, and marked "latest"…
+                { term => { indexed => 1 } },
+                { term => { authorized => 1 } },
+                { term => { status => 'latest' } },
+
+                # …and have at least one _module_ which is also indexed and
+                # authorized.
+                {
+                    nested => {
+                        path => "module",
+                        filter => {
+                            and => [
+                                { term => { "module.indexed" => 1 } },
+                                { term => { "module.authorized" => 1 } },
+                            ]
+                        }
+                    }
+                },
+            ],
+        },
+        # Only about 5m is necessary on the ServerCentral internet connection,
+        # but it varies. Use way more time than we need to be safe.
+        scroller_time => '10m',
+        scroller_size => $page_size,
+    }
+);
+
+warn "Starting scroll over \"latest\" files…\n";
+
+while (my $file = $files->next) {
+    warn "Screened $n files\n"
+        if $n++ and $n % $page_size == 0;
+
+    # This should be guaranteed by the ES query, but just in case, it doesn't
+    # hurt to repeat it here.
+    next unless $file->indexed
+            and $file->authorized
+            and $file->status eq 'latest';
+
+    my $archive = $file->download_url =~ s{.*/authors/id/}{}r;
+    my $modules = $file->module
+        or next;
+
+    for my $module ($modules->@*) {
+        next unless $module->{indexed} and $module->{authorized};
+        push @lines, join "\t",
+            $module->{name},
+            sprintf("%.12f", $module->{version_numified} // 0),
+            $archive;
+    }
+}
+
+# This mimics the header that PAUSE adds.
+say <;
+use version;
+
+while (<>) {
+    # Skip the header
+    print, next if 1 .. 9;
+
+    # Transform the data lines
+    my ($module, $version, $archive) = split ' ', $_, 3;
+    print join "\t",
+        $module,
+        numify_version($version),
+        $archive;
+}
+
+sub numify_version {
+    my $version = shift;
+
+    # undef → 0
+    $version = 0 if $version eq "undef";
+
+    # Strip underscores, so version->parse doesn't barf. These generally don't
+    # make it into 02packages since underscores indicate a trial release, but
+    # they can when the release itself is not a trial release but the module
+    # version contains an underscore.
+    $version =~ s/_//g;
+
+    # numify
+    $version = version->parse($version)->numify;
+
+    return sprintf "%.12f", $version;
+}

From 51a23b16d00fea02e5fcc78cd61ce055604c0ab2 Mon Sep 17 00:00:00 2001
From: Thomas Sibley
Date: Sat, 18 Nov 2017 07:28:45 -0800
Subject: [PATCH 2/2] Comparison script for 02packages

---
Review changes to bin/qa/compare-02packages relative to the original patch:
quote "$0" and "$bin", write the MetaCPAN output through $metacpan instead of
a repeated literal, and build each cached file via a temporary name so a
failed run cannot leave a truncated cache behind. The blob id on the "index"
line is carried over from the original and no longer matches the new contents.

 .gitignore                |  2 ++
 bin/qa/compare-02packages | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 bin/qa/compare-02packages

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ac325f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/02packages.pause.txt
+/02packages.metacpan.txt
diff --git a/bin/qa/compare-02packages b/bin/qa/compare-02packages
new file mode 100755
index 0000000..75a3dd4
--- /dev/null
+++ b/bin/qa/compare-02packages
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Diff PAUSE's 02packages against MetaCPAN's view of the same data.
+#
+# Both inputs are cached in the current directory and only rebuilt when the
+# cached file is missing or empty; delete a file to force a refresh.
+set -e -o pipefail
+
+bin=$(dirname "$0")
+pause=02packages.pause.txt
+metacpan=02packages.metacpan.txt
+
+# Each input is written to a temporary file and renamed only on success, so an
+# interrupted run cannot leave a truncated file that later passes the -s test.
+trap 'rm -f "$pause.tmp" "$metacpan.tmp"' EXIT
+
+if [[ ! -s $pause ]]; then
+    echo Downloading 02packages from PAUSE… >&2
+    curl -fsSL https://cpan.metacpan.org/modules/02packages.details.txt.gz \
+        | gunzip -c \
+        | "$bin/normalize-02packages" \
+        > "$pause.tmp"
+    mv "$pause.tmp" "$pause"
+fi
+
+if [[ ! -s $metacpan ]]; then
+    echo Generating 02packages from MetaCPAN… >&2
+    "$bin/02packages" > "$metacpan.tmp"
+    mv "$metacpan.tmp" "$metacpan"
+fi
+
+# git diff exits non-zero when the files differ; that becomes our exit status.
+git diff --no-index --color-words "$pause" "$metacpan"