5050#include "utils/lsyscache.h"
5151#include "utils/memutils.h"
5252#include "utils/pg_rusage.h"
53+ #include "utils/sampling.h"
5354#include "utils/sortsupport.h"
5455#include "utils/syscache.h"
5556#include "utils/timestamp.h"
5657#include "utils/tqual.h"
5758
5859
59- /* Data structure for Algorithm S from Knuth 3.4.2 */
60- typedef struct
61- {
62- BlockNumber N ; /* number of blocks, known in advance */
63- int n ; /* desired sample size */
64- BlockNumber t ; /* current block number */
65- int m ; /* blocks selected so far */
66- } BlockSamplerData ;
67-
68- typedef BlockSamplerData * BlockSampler ;
69-
7060/* Per-index data for ANALYZE */
7161typedef struct AnlIndexData
7262{
@@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
8979 VacuumParams * params , List * va_cols ,
9080 AcquireSampleRowsFunc acquirefunc , BlockNumber relpages ,
9181 bool inh , bool in_outer_xact , int elevel );
92- static void BlockSampler_Init (BlockSampler bs , BlockNumber nblocks ,
93- int samplesize );
94- static bool BlockSampler_HasMore (BlockSampler bs );
95- static BlockNumber BlockSampler_Next (BlockSampler bs );
9682static void compute_index_stats (Relation onerel , double totalrows ,
9783 AnlIndexData * indexdata , int nindexes ,
9884 HeapTuple * rows , int numrows ,
@@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
950936 return stats ;
951937}
952938
953- /*
954- * BlockSampler_Init -- prepare for random sampling of blocknumbers
955- *
956- * BlockSampler is used for stage one of our new two-stage tuple
957- * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
958- * "Large DB"). It selects a random sample of samplesize blocks out of
959- * the nblocks blocks in the table. If the table has less than
960- * samplesize blocks, all blocks are selected.
961- *
962- * Since we know the total number of blocks in advance, we can use the
963- * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
964- * algorithm.
965- */
966- static void
967- BlockSampler_Init (BlockSampler bs , BlockNumber nblocks , int samplesize )
968- {
969- bs -> N = nblocks ; /* measured table size */
970-
971- /*
972- * If we decide to reduce samplesize for tables that have less or not much
973- * more than samplesize blocks, here is the place to do it.
974- */
975- bs -> n = samplesize ;
976- bs -> t = 0 ; /* blocks scanned so far */
977- bs -> m = 0 ; /* blocks selected so far */
978- }
979-
980- static bool
981- BlockSampler_HasMore (BlockSampler bs )
982- {
983- return (bs -> t < bs -> N ) && (bs -> m < bs -> n );
984- }
985-
986- static BlockNumber
987- BlockSampler_Next (BlockSampler bs )
988- {
989- BlockNumber K = bs -> N - bs -> t ; /* remaining blocks */
990- int k = bs -> n - bs -> m ; /* blocks still to sample */
991- double p ; /* probability to skip block */
992- double V ; /* random */
993-
994- Assert (BlockSampler_HasMore (bs )); /* hence K > 0 and k > 0 */
995-
996- if ((BlockNumber ) k >= K )
997- {
998- /* need all the rest */
999- bs -> m ++ ;
1000- return bs -> t ++ ;
1001- }
1002-
1003- /*----------
1004- * It is not obvious that this code matches Knuth's Algorithm S.
1005- * Knuth says to skip the current block with probability 1 - k/K.
1006- * If we are to skip, we should advance t (hence decrease K), and
1007- * repeat the same probabilistic test for the next block. The naive
1008- * implementation thus requires an anl_random_fract() call for each block
1009- * number. But we can reduce this to one anl_random_fract() call per
1010- * selected block, by noting that each time the while-test succeeds,
1011- * we can reinterpret V as a uniform random number in the range 0 to p.
1012- * Therefore, instead of choosing a new V, we just adjust p to be
1013- * the appropriate fraction of its former value, and our next loop
1014- * makes the appropriate probabilistic test.
1015- *
1016- * We have initially K > k > 0. If the loop reduces K to equal k,
1017- * the next while-test must fail since p will become exactly zero
1018- * (we assume there will not be roundoff error in the division).
1019- * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
1020- * to be doubly sure about roundoff error.) Therefore K cannot become
1021- * less than k, which means that we cannot fail to select enough blocks.
1022- *----------
1023- */
1024- V = anl_random_fract ();
1025- p = 1.0 - (double ) k / (double ) K ;
1026- while (V < p )
1027- {
1028- /* skip */
1029- bs -> t ++ ;
1030- K -- ; /* keep K == N - t */
1031-
1032- /* adjust p to be new cutoff point in reduced range */
1033- p *= 1.0 - (double ) k / (double ) K ;
1034- }
1035-
1036- /* select */
1037- bs -> m ++ ;
1038- return bs -> t ++ ;
1039- }
1040-
1041939/*
1042940 * acquire_sample_rows -- acquire a random sample of rows from the table
1043941 *
@@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
1084982 BlockNumber totalblocks ;
1085983 TransactionId OldestXmin ;
1086984 BlockSamplerData bs ;
1087- double rstate ;
985+ ReservoirStateData rstate ;
1088986
1089987 Assert (targrows > 0 );
1090988
@@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
1094992 OldestXmin = GetOldestXmin (onerel , true);
1095993
1096994 /* Prepare for sampling block numbers */
1097- BlockSampler_Init (& bs , totalblocks , targrows );
995+ BlockSampler_Init (& bs , totalblocks , targrows , random () );
1098996 /* Prepare for sampling rows */
1099- rstate = anl_init_selection_state ( targrows );
997+ reservoir_init_selection_state ( & rstate , targrows );
1100998
1101999 /* Outer loop over blocks to sample */
11021000 while (BlockSampler_HasMore (& bs ))
@@ -1244,16 +1142,15 @@ acquire_sample_rows(Relation onerel, int elevel,
12441142 * t.
12451143 */
12461144 if (rowstoskip < 0 )
1247- rowstoskip = anl_get_next_S (samplerows , targrows ,
1248- & rstate );
1145+ rowstoskip = reservoir_get_next_S (& rstate , samplerows , targrows );
12491146
12501147 if (rowstoskip <= 0 )
12511148 {
12521149 /*
12531150 * Found a suitable tuple, so save it, replacing one
12541151 * old tuple at random
12551152 */
1256- int k = (int ) (targrows * anl_random_fract ());
1153+ int k = (int ) (targrows * sampler_random_fract ());
12571154
12581155 Assert (k >= 0 && k < targrows );
12591156 heap_freetuple (rows [k ]);
@@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
13121209 return numrows ;
13131210}
13141211
1315- /* Select a random value R uniformly distributed in (0 - 1) */
1316- double
1317- anl_random_fract (void )
1318- {
1319- return ((double ) random () + 1 ) / ((double ) MAX_RANDOM_VALUE + 2 );
1320- }
1321-
1322- /*
1323- * These two routines embody Algorithm Z from "Random sampling with a
1324- * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
1325- * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
1326- * of the count S of records to skip before processing another record.
1327- * It is computed primarily based on t, the number of records already read.
1328- * The only extra state needed between calls is W, a random state variable.
1329- *
1330- * anl_init_selection_state computes the initial W value.
1331- *
1332- * Given that we've already read t records (t >= n), anl_get_next_S
1333- * determines the number of records to skip before the next record is
1334- * processed.
1335- */
1336- double
1337- anl_init_selection_state (int n )
1338- {
1339- /* Initial value of W (for use when Algorithm Z is first applied) */
1340- return exp (- log (anl_random_fract ()) / n );
1341- }
1342-
1343- double
1344- anl_get_next_S (double t , int n , double * stateptr )
1345- {
1346- double S ;
1347-
1348- /* The magic constant here is T from Vitter's paper */
1349- if (t <= (22.0 * n ))
1350- {
1351- /* Process records using Algorithm X until t is large enough */
1352- double V ,
1353- quot ;
1354-
1355- V = anl_random_fract (); /* Generate V */
1356- S = 0 ;
1357- t += 1 ;
1358- /* Note: "num" in Vitter's code is always equal to t - n */
1359- quot = (t - (double ) n ) / t ;
1360- /* Find min S satisfying (4.1) */
1361- while (quot > V )
1362- {
1363- S += 1 ;
1364- t += 1 ;
1365- quot *= (t - (double ) n ) / t ;
1366- }
1367- }
1368- else
1369- {
1370- /* Now apply Algorithm Z */
1371- double W = * stateptr ;
1372- double term = t - (double ) n + 1 ;
1373-
1374- for (;;)
1375- {
1376- double numer ,
1377- numer_lim ,
1378- denom ;
1379- double U ,
1380- X ,
1381- lhs ,
1382- rhs ,
1383- y ,
1384- tmp ;
1385-
1386- /* Generate U and X */
1387- U = anl_random_fract ();
1388- X = t * (W - 1.0 );
1389- S = floor (X ); /* S is tentatively set to floor(X) */
1390- /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
1391- tmp = (t + 1 ) / term ;
1392- lhs = exp (log (((U * tmp * tmp ) * (term + S )) / (t + X )) / n );
1393- rhs = (((t + X ) / (term + S )) * term ) / t ;
1394- if (lhs <= rhs )
1395- {
1396- W = rhs / lhs ;
1397- break ;
1398- }
1399- /* Test if U <= f(S)/cg(X) */
1400- y = (((U * (t + 1 )) / term ) * (t + S + 1 )) / (t + X );
1401- if ((double ) n < S )
1402- {
1403- denom = t ;
1404- numer_lim = term + S ;
1405- }
1406- else
1407- {
1408- denom = t - (double ) n + S ;
1409- numer_lim = t + 1 ;
1410- }
1411- for (numer = t + S ; numer >= numer_lim ; numer -= 1 )
1412- {
1413- y *= numer / denom ;
1414- denom -= 1 ;
1415- }
1416- W = exp (- log (anl_random_fract ()) / n ); /* Generate W in advance */
1417- if (exp (log (y ) / n ) <= (t + X ) / t )
1418- break ;
1419- }
1420- * stateptr = W ;
1421- }
1422- return S ;
1423- }
1424-
14251212/*
14261213 * qsort comparator for sorting rows[] array
14271214 */
0 commit comments