|
333 | 333 | $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'"); |
334 | 334 | $node_standby3->start; |
335 | 335 | $node_primary3->wait_for_catchup($node_standby3); |
336 | | -my $senderpid = $node_primary3->safe_psql('postgres', |
337 | | - "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); |
338 | | - |
339 | | -# We've seen occasional cases where multiple walsender pids are active. An |
340 | | -# immediate shutdown may hide evidence of a locking bug. So if multiple |
341 | | -# walsenders are observed, shut down in fast mode, and collect some more |
342 | | -# information. |
343 | | -if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid")) |
| 336 | + |
| 337 | +my $senderpid; |
| 338 | + |
| 339 | +# We've seen occasional cases where multiple walsender pids are active. It |
| 340 | +# could be that we're just observing process shutdown being slow. To collect |
| 341 | +# more information, retry for a while, printing a bit of debugging information
| 342 | +# each iteration. For now report a test failure even if later iterations |
| 343 | +# succeed. |
| 344 | +my $i = 0; |
| 345 | +while (1) |
344 | 346 | { |
345 | 347 | my ($stdout, $stderr); |
| 348 | + |
| 349 | + $senderpid = $node_primary3->safe_psql('postgres', |
| 350 | + "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); |
| 351 | + |
| 352 | + last if like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"); |
| 353 | + |
| 354 | + # show information about all active connections |
346 | 355 | $node_primary3->psql('postgres', |
347 | 356 | "\\a\\t\nSELECT * FROM pg_stat_activity", |
348 | 357 | stdout => \$stdout, stderr => \$stderr); |
349 | 358 | diag $stdout, $stderr; |
350 | | - $node_primary3->stop('fast'); |
351 | | - $node_standby3->stop('fast'); |
352 | | - die "could not determine walsender pid, can't continue"; |
| 359 | + |
| 360 | + # unlikely that the problem would resolve after 15s, so give up at that point
| 361 | + if ($i++ == 150) |
| 362 | + { |
| 363 | + # An immediate shutdown may hide evidence of a locking bug. If |
| 364 | + # retrying didn't resolve the issue, shut down in fast mode. |
| 365 | + $node_primary3->stop('fast'); |
| 366 | + $node_standby3->stop('fast'); |
| 367 | + die "could not determine walsender pid, can't continue"; |
| 368 | + } |
| 369 | + |
| 370 | + usleep(100_000); |
353 | 371 | } |
354 | 372 |
|
355 | 373 | my $receiverpid = $node_standby3->safe_psql('postgres', |
|
0 commit comments