@@ -420,6 +420,7 @@ static void TerminateChildren(int signal);
420420#define SignalChildren (sig ) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
421421
422422static int CountChildren (int target );
423+ static bool assign_backendlist_entry (RegisteredBgWorker * rw );
423424static void maybe_start_bgworker (void );
424425static bool CreateOptsFile (int argc , char * argv [], char * fullprogname );
425426static pid_t StartChildProcess (AuxProcType type );
@@ -5531,13 +5532,33 @@ bgworker_forkexec(int shmem_slot)
55315532 * Start a new bgworker.
55325533 * Starting time conditions must have been checked already.
55335534 *
5535+ * Returns true on success, false on failure.
5536+ * In either case, update the RegisteredBgWorker's state appropriately.
5537+ *
55345538 * This code is heavily based on autovacuum.c, q.v.
55355539 */
5536- static void
5540+ static bool
55375541do_start_bgworker (RegisteredBgWorker * rw )
55385542{
55395543 pid_t worker_pid ;
55405544
5545+ Assert (rw -> rw_pid == 0 );
5546+
5547+ /*
5548+ * Allocate and assign the Backend element. Note we must do this before
5549+ * forking, so that we can handle out of memory properly.
5550+ *
5551+ * Treat failure as though the worker had crashed. That way, the
5552+ * postmaster will wait a bit before attempting to start it again; if it
5553+ * tried again right away, most likely it'd find itself repeating the
5554+ * out-of-memory or fork failure condition.
5555+ */
5556+ if (!assign_backendlist_entry (rw ))
5557+ {
5558+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5559+ return false;
5560+ }
5561+
55415562 ereport (DEBUG1 ,
55425563 (errmsg ("starting background worker process \"%s\"" ,
55435564 rw -> rw_worker .bgw_name )));
@@ -5549,9 +5570,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
55495570#endif
55505571 {
55515572 case -1 :
5573+ /* in postmaster, fork failed ... */
55525574 ereport (LOG ,
55535575 (errmsg ("could not fork worker process: %m" )));
5554- return ;
5576+ /* undo what assign_backendlist_entry did */
5577+ ReleasePostmasterChildSlot (rw -> rw_child_slot );
5578+ rw -> rw_child_slot = 0 ;
5579+ free (rw -> rw_backend );
5580+ rw -> rw_backend = NULL ;
5581+ /* mark entry as crashed, so we'll try again later */
5582+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5583+ break ;
55555584
55565585#ifndef EXEC_BACKEND
55575586 case 0 :
@@ -5575,14 +5604,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
55755604 PostmasterContext = NULL ;
55765605
55775606 StartBackgroundWorker ();
5607+
5608+ exit (1 ); /* should not get here */
55785609 break ;
55795610#endif
55805611 default :
5612+ /* in postmaster, fork successful ... */
55815613 rw -> rw_pid = worker_pid ;
55825614 rw -> rw_backend -> pid = rw -> rw_pid ;
55835615 ReportBackgroundWorkerPID (rw );
5584- break ;
5616+ /* add new worker to lists of backends */
5617+ dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5618+ #ifdef EXEC_BACKEND
5619+ ShmemBackendArrayAdd (rw -> rw_backend );
5620+ #endif
5621+ return true;
55855622 }
5623+
5624+ return false;
55865625}
55875626
55885627/*
@@ -5629,6 +5668,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
56295668 * Allocate the Backend struct for a connected background worker, but don't
56305669 * add it to the list of backends just yet.
56315670 *
5671+ * On failure, return false without changing any worker state.
5672+ *
56325673 * Some info from the Backend is copied into the passed rw.
56335674 */
56345675static bool
@@ -5647,8 +5688,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
56475688 ereport (LOG ,
56485689 (errcode (ERRCODE_INTERNAL_ERROR ),
56495690 errmsg ("could not generate random cancel key" )));
5650-
5651- rw -> rw_crashed_at = GetCurrentTimestamp ();
56525691 return false;
56535692 }
56545693
@@ -5658,14 +5697,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
56585697 ereport (LOG ,
56595698 (errcode (ERRCODE_OUT_OF_MEMORY ),
56605699 errmsg ("out of memory" )));
5661-
5662- /*
5663- * The worker didn't really crash, but setting this nonzero makes
5664- * postmaster wait a bit before attempting to start it again; if it
5665- * tried again right away, most likely it'd find itself under the same
5666- * memory pressure.
5667- */
5668- rw -> rw_crashed_at = GetCurrentTimestamp ();
56695700 return false;
56705701 }
56715702
@@ -5687,20 +5718,31 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
56875718 * As a side effect, the bgworker control variables are set or reset whenever
56885719 * there are more workers to start after this one, and whenever the overall
56895720 * system state requires it.
5721+ *
5722+ * The reason we start at most one worker per call is to avoid consuming the
5723+ * postmaster's attention for too long when many such requests are pending.
5724+ * As long as StartWorkerNeeded is true, ServerLoop will not block and will
5725+ * call this function again after dealing with any other issues.
56905726 */
56915727static void
56925728maybe_start_bgworker (void )
56935729{
56945730 slist_mutable_iter iter ;
56955731 TimestampTz now = 0 ;
56965732
5733+ /*
5734+ * During crash recovery, we have no need to be called until the state
5735+ * transition out of recovery.
5736+ */
56975737 if (FatalError )
56985738 {
56995739 StartWorkerNeeded = false;
57005740 HaveCrashedWorker = false;
5701- return ; /* not yet */
5741+ return ;
57025742 }
57035743
5744+ /* Don't need to be called again unless we find a reason for it below */
5745+ StartWorkerNeeded = false;
57045746 HaveCrashedWorker = false;
57055747
57065748 slist_foreach_modify (iter , & BackgroundWorkerList )
@@ -5709,11 +5751,11 @@ maybe_start_bgworker(void)
57095751
57105752 rw = slist_container (RegisteredBgWorker , rw_lnode , iter .cur );
57115753
5712- /* already running? */
5754+ /* ignore if already running */
57135755 if (rw -> rw_pid != 0 )
57145756 continue ;
57155757
5716- /* marked for death? */
5758+ /* if marked for death, clean up and remove from list */
57175759 if (rw -> rw_terminate )
57185760 {
57195761 ForgetBackgroundWorker (& iter );
@@ -5735,48 +5777,50 @@ maybe_start_bgworker(void)
57355777 continue ;
57365778 }
57375779
5780+ /* read system time only when needed */
57385781 if (now == 0 )
57395782 now = GetCurrentTimestamp ();
57405783
57415784 if (!TimestampDifferenceExceeds (rw -> rw_crashed_at , now ,
57425785 rw -> rw_worker .bgw_restart_time * 1000 ))
57435786 {
5787+ /* Set flag to remember that we have workers to start later */
57445788 HaveCrashedWorker = true;
57455789 continue ;
57465790 }
57475791 }
57485792
57495793 if (bgworker_should_start_now (rw -> rw_worker .bgw_start_time ))
57505794 {
5751- /* reset crash time before calling assign_backendlist_entry */
5795+ /* reset crash time before trying to start worker */
57525796 rw -> rw_crashed_at = 0 ;
57535797
57545798 /*
5755- * Allocate and assign the Backend element. Note we must do this
5756- * before forking, so that we can handle out of memory properly.
5799+ * Try to start the worker.
5800+ *
5801+ * On failure, give up processing workers for now, but set
5802+ * StartWorkerNeeded so we'll come back here on the next iteration
5803+ * of ServerLoop to try again. (We don't want to wait, because
5804+ * there might be additional ready-to-run workers.) We could set
5805+ * HaveCrashedWorker as well, since this worker is now marked
5806+ * crashed, but there's no need because the next run of this
5807+ * function will do that.
57575808 */
5758- if (!assign_backendlist_entry (rw ))
5809+ if (!do_start_bgworker (rw ))
5810+ {
5811+ StartWorkerNeeded = true;
57595812 return ;
5760-
5761- do_start_bgworker (rw ); /* sets rw->rw_pid */
5762-
5763- dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5764- #ifdef EXEC_BACKEND
5765- ShmemBackendArrayAdd (rw -> rw_backend );
5766- #endif
5813+ }
57675814
57685815 /*
5769- * Have ServerLoop call us again. Note that there might not
5770- * actually *be* another runnable worker , but we don't care all
5771- * that much; we will find out the next time we run.
5816+ * Quit, but have ServerLoop call us again to look for additional
5817+ * ready-to-run workers. There might not be any , but we'll find
5818+ * out the next time we run.
57725819 */
57735820 StartWorkerNeeded = true;
57745821 return ;
57755822 }
57765823 }
5777-
5778- /* no runnable worker found */
5779- StartWorkerNeeded = false;
57805824}
57815825
57825826/*
0 commit comments