Skip to content

Commit 74f6e63

Browse files
committed
Fix sampling frequency
It had a serious bug - profile_period wasn't used in code at all Also, sampling frequency was a bit off - in 10 seconds of pg_sleep we lost about 1% of samples with default 10ms sampling interval
1 parent 91b163d commit 74f6e63

File tree

1 file changed

+67
-22
lines changed

1 file changed

+67
-22
lines changed

collector.c

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -299,17 +299,36 @@ make_profile_hash()
299299

300300
/*
301301
* Delta between two timestamps in milliseconds.
302+
* Also save microsecs to calculate rounding
302303
*/
303304
static int64
304-
millisecs_diff(TimestampTz tz1, TimestampTz tz2)
305+
millisecs_diff(TimestampTz tz1, TimestampTz tz2, int *microsecs)
305306
{
306-
long secs;
307-
int microsecs;
307+
long secs;
308308

309-
TimestampDifference(tz1, tz2, &secs, &microsecs);
309+
TimestampDifference(tz1, tz2, &secs, microsecs);
310310

311-
return secs * 1000 + microsecs / 1000;
311+
return secs * 1000 + *microsecs / 1000;
312+
}
312313

314+
/*
315+
* Calculate time (in milliseconds) between two samples considering leftovers
316+
*/
317+
static void
318+
calculate_elapsed(int64 *elapsed, int64 *leftovers, int period,
319+
TimestampTz start, TimestampTz end)
320+
{
321+
int microsecs = 0;
322+
323+
*elapsed += millisecs_diff(start, end, &microsecs);
324+
*leftovers += microsecs % 1000;
325+
326+
/* If leftovers are at least 1 microsecond */
327+
if (*leftovers >= 1000)
328+
{
329+
*elapsed += *leftovers / 1000;
330+
*leftovers %= 1000;
331+
}
313332
}
314333

315334
/*
@@ -323,8 +342,12 @@ pgws_collector_main(Datum main_arg)
323342
MemoryContext old_context,
324343
collector_context;
325344
TimestampTz current_ts,
326-
history_ts,
327-
profile_ts;
345+
previous_ts;
346+
/* in microsecs */
347+
int64 rounding_leftovers_history = 0,
348+
rounding_leftovers_profile = 0;
349+
int64 time_elapsed_history = 0,
350+
time_elapsed_profile = 0;
328351

329352
/*
330353
* Establish signal handlers.
@@ -359,16 +382,15 @@ pgws_collector_main(Datum main_arg)
359382
ereport(LOG, (errmsg("pg_wait_sampling collector started")));
360383

361384
/* Start counting time for history and profile samples */
362-
profile_ts = history_ts = GetCurrentTimestamp();
385+
previous_ts = GetCurrentTimestamp();
363386

364387
while (1)
365388
{
366389
int rc;
367390
shm_mq_handle *mqh;
368-
int64 history_diff,
369-
profile_diff;
370391
bool write_history,
371392
write_profile;
393+
int time_to_sleep;
372394

373395
/* We need an explicit call for at least ProcSignal notifications. */
374396
CHECK_FOR_INTERRUPTS();
@@ -382,30 +404,54 @@ pgws_collector_main(Datum main_arg)
382404
/* Calculate time to next sample for history or profile */
383405
current_ts = GetCurrentTimestamp();
384406

385-
history_diff = millisecs_diff(history_ts, current_ts);
386-
profile_diff = millisecs_diff(profile_ts, current_ts);
407+
calculate_elapsed(&time_elapsed_history, &rounding_leftovers_history,
408+
pgws_historyPeriod, previous_ts, current_ts);
409+
calculate_elapsed(&time_elapsed_profile, &rounding_leftovers_profile,
410+
pgws_profilePeriod, previous_ts, current_ts);
387411

388-
write_history = (history_diff >= (int64) pgws_historyPeriod);
389-
write_profile = (profile_diff >= (int64) pgws_profilePeriod);
412+
write_history = (time_elapsed_history >= (int64) pgws_historyPeriod);
413+
write_profile = (time_elapsed_profile >= (int64) pgws_profilePeriod);
390414

391415
if (write_history || write_profile)
392416
{
393417
probe_waits(&observations, profile_hash,
394418
write_history, write_profile, pgws_profilePid);
395-
419+
previous_ts = current_ts;
396420
if (write_history)
397421
{
398-
history_ts = current_ts;
399-
history_diff = 0;
422+
while (time_elapsed_history >= pgws_historyPeriod)
423+
{
424+
/*
425+
* Normally, this will happen only once, but if processor
426+
* is working overtime, time_elapsed could mount up.
427+
* We deliberately skip some samplings (well, we don't force
428+
* additional ones) in such case
429+
*/
430+
time_elapsed_history -= pgws_historyPeriod;
431+
}
400432
}
401-
402433
if (write_profile)
403434
{
404-
profile_ts = current_ts;
405-
profile_diff = 0;
435+
while (time_elapsed_profile >= pgws_profilePeriod)
436+
{
437+
/*
438+
* Normally, this will happen only once, but if processor
439+
* is working overtime, time_elapsed could mount up.
440+
* We deliberately skip some samplings (well, we don't force
441+
* additional ones) in such case
442+
*/
443+
time_elapsed_profile -= pgws_profilePeriod;
444+
}
406445
}
407446
}
408447

448+
/* Calculate how much time we have to sleep until any next sampling */
449+
time_to_sleep = Min(pgws_historyPeriod - (int) time_elapsed_history,
450+
pgws_profilePeriod - (int) time_elapsed_profile);
451+
452+
if (time_to_sleep < 0)
453+
time_to_sleep = 0;
454+
409455
/* Shutdown if requested */
410456
if (shutdown_requested)
411457
break;
@@ -415,8 +461,7 @@ pgws_collector_main(Datum main_arg)
415461
* shared memory.
416462
*/
417463
rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
418-
Min(pgws_historyPeriod - (int) history_diff,
419-
pgws_historyPeriod - (int) profile_diff), PG_WAIT_EXTENSION);
464+
time_to_sleep, PG_WAIT_EXTENSION);
420465

421466
if (rc & WL_POSTMASTER_DEATH)
422467
proc_exit(1);

0 commit comments

Comments
 (0)