@@ -95,6 +95,13 @@ static uint32 pgfdw_we_get_result = 0;
9595 */
9696#define CONNECTION_CLEANUP_TIMEOUT 30000
9797
98+ /*
99+ * Milliseconds to wait before issuing another cancel request. This covers
100+ * the race condition where the remote session ignored our cancel request
101+ * because it arrived while idle.
102+ */
103+ #define RETRY_CANCEL_TIMEOUT 1000
104+
98105/* Macro for constructing abort command to be sent */
99106#define CONSTRUCT_ABORT_COMMAND (sql , entry , toplevel ) \
100107 do { \
@@ -135,6 +142,7 @@ static void pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel);
135142static bool pgfdw_cancel_query (PGconn * conn );
136143static bool pgfdw_cancel_query_begin (PGconn * conn , TimestampTz endtime );
137144static bool pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime ,
145+ TimestampTz retrycanceltime ,
138146 bool consume_input );
139147static bool pgfdw_exec_cleanup_query (PGconn * conn , const char * query ,
140148 bool ignore_errors );
@@ -144,6 +152,7 @@ static bool pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
144152 bool consume_input ,
145153 bool ignore_errors );
146154static bool pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime ,
155+ TimestampTz retrycanceltime ,
147156 PGresult * * result , bool * timed_out );
148157static void pgfdw_abort_cleanup (ConnCacheEntry * entry , bool toplevel );
149158static bool pgfdw_abort_cleanup_begin (ConnCacheEntry * entry , bool toplevel ,
@@ -1308,18 +1317,25 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
13081317static bool
13091318pgfdw_cancel_query (PGconn * conn )
13101319{
1320+ TimestampTz now = GetCurrentTimestamp ();
13111321 TimestampTz endtime ;
1322+ TimestampTz retrycanceltime ;
13121323
13131324 /*
13141325 * If it takes too long to cancel the query and discard the result, assume
13151326 * the connection is dead.
13161327 */
1317- endtime = TimestampTzPlusMilliseconds (GetCurrentTimestamp (),
1318- CONNECTION_CLEANUP_TIMEOUT );
1328+ endtime = TimestampTzPlusMilliseconds (now , CONNECTION_CLEANUP_TIMEOUT );
1329+
1330+ /*
1331+ * Also, lose patience and re-issue the cancel request after a little bit.
1332+ * (This serves to close some race conditions.)
1333+ */
1334+ retrycanceltime = TimestampTzPlusMilliseconds (now , RETRY_CANCEL_TIMEOUT );
13191335
13201336 if (!pgfdw_cancel_query_begin (conn , endtime ))
13211337 return false;
1322- return pgfdw_cancel_query_end (conn , endtime , false);
1338+ return pgfdw_cancel_query_end (conn , endtime , retrycanceltime , false);
13231339}
13241340
13251341/*
@@ -1345,9 +1361,10 @@ pgfdw_cancel_query_begin(PGconn *conn, TimestampTz endtime)
13451361}
13461362
13471363static bool
1348- pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime , bool consume_input )
1364+ pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime ,
1365+ TimestampTz retrycanceltime , bool consume_input )
13491366{
1350- PGresult * result = NULL ;
1367+ PGresult * result ;
13511368 bool timed_out ;
13521369
13531370 /*
@@ -1366,7 +1383,8 @@ pgfdw_cancel_query_end(PGconn *conn, TimestampTz endtime, bool consume_input)
13661383 }
13671384
13681385 /* Get and discard the result of the query. */
1369- if (pgfdw_get_cleanup_result (conn , endtime , & result , & timed_out ))
1386+ if (pgfdw_get_cleanup_result (conn , endtime , retrycanceltime ,
1387+ & result , & timed_out ))
13701388 {
13711389 if (timed_out )
13721390 ereport (WARNING ,
@@ -1439,7 +1457,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14391457 TimestampTz endtime , bool consume_input ,
14401458 bool ignore_errors )
14411459{
1442- PGresult * result = NULL ;
1460+ PGresult * result ;
14431461 bool timed_out ;
14441462
14451463 Assert (query != NULL );
@@ -1457,7 +1475,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14571475 }
14581476
14591477 /* Get the result of the query. */
1460- if (pgfdw_get_cleanup_result (conn , endtime , & result , & timed_out ))
1478+ if (pgfdw_get_cleanup_result (conn , endtime , endtime , & result , & timed_out ))
14611479 {
14621480 if (timed_out )
14631481 ereport (WARNING ,
@@ -1481,28 +1499,36 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14811499}
14821500
14831501/*
1484- * Get, during abort cleanup, the result of a query that is in progress. This
1485- * might be a query that is being interrupted by transaction abort, or it might
1486- * be a query that was initiated as part of transaction abort to get the remote
1487- * side back to the appropriate state.
1502+ * Get, during abort cleanup, the result of a query that is in progress.
1503+ * This might be a query that is being interrupted by a cancel request or by
1504+ * transaction abort, or it might be a query that was initiated as part of
1505+ * transaction abort to get the remote side back to the appropriate state.
1506+ *
1507+ * endtime is the time at which we should give up and assume the remote side
1508+ * is dead. retrycanceltime is the time at which we should issue a fresh
1509+ * cancel request (pass the same value as endtime if this is not wanted).
14881510 *
1489- * endtime is the time at which we should give up and assume the remote
1490- * side is dead. Returns true if the timeout expired or connection trouble
1491- * occurred, false otherwise. Sets *result except in case of a timeout.
1492- * Sets timed_out to true only when the timeout expired.
1511+ * Returns true if the timeout expired or connection trouble occurred,
1512+ * false otherwise. Sets *result except in case of a true result.
1513+ * Sets *timed_out to true only when the timeout expired.
14931514 */
14941515static bool
1495- pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime , PGresult * * result ,
1516+ pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime ,
1517+ TimestampTz retrycanceltime ,
1518+ PGresult * * result ,
14961519 bool * timed_out )
14971520{
14981521 volatile bool failed = false;
14991522 PGresult * volatile last_res = NULL ;
15001523
1524+ * result = NULL ;
15011525 * timed_out = false;
15021526
15031527 /* In what follows, do not leak any PGresults on an error. */
15041528 PG_TRY ();
15051529 {
1530+ int canceldelta = RETRY_CANCEL_TIMEOUT * 2 ;
1531+
15061532 for (;;)
15071533 {
15081534 PGresult * res ;
@@ -1513,8 +1539,33 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result,
15131539 TimestampTz now = GetCurrentTimestamp ();
15141540 long cur_timeout ;
15151541
1542+ /* If timeout has expired, give up. */
1543+ if (now >= endtime )
1544+ {
1545+ * timed_out = true;
1546+ failed = true;
1547+ goto exit ;
1548+ }
1549+
1550+ /* If we need to re-issue the cancel request, do that. */
1551+ if (now >= retrycanceltime )
1552+ {
1553+ /* We ignore failure to issue the repeated request. */
1554+ (void ) libpqsrv_cancel (conn , endtime );
1555+
1556+ /* Recompute "now" in case that took measurable time. */
1557+ now = GetCurrentTimestamp ();
1558+
1559+ /* Adjust re-cancel timeout in increasing steps. */
1560+ retrycanceltime = TimestampTzPlusMilliseconds (now ,
1561+ canceldelta );
1562+ canceldelta += canceldelta ;
1563+ }
1564+
15161565 /* If timeout has expired, give up, else get sleep time. */
1517- cur_timeout = TimestampDifferenceMilliseconds (now , endtime );
1566+ cur_timeout = TimestampDifferenceMilliseconds (now ,
1567+ Min (endtime ,
1568+ retrycanceltime ));
15181569 if (cur_timeout <= 0 )
15191570 {
15201571 * timed_out = true;
@@ -1835,7 +1886,9 @@ pgfdw_finish_abort_cleanup(List *pending_entries, List *cancel_requested,
18351886 foreach (lc , cancel_requested )
18361887 {
18371888 ConnCacheEntry * entry = (ConnCacheEntry * ) lfirst (lc );
1889+ TimestampTz now = GetCurrentTimestamp ();
18381890 TimestampTz endtime ;
1891+ TimestampTz retrycanceltime ;
18391892 char sql [100 ];
18401893
18411894 Assert (entry -> changing_xact_state );
@@ -1849,10 +1902,13 @@ pgfdw_finish_abort_cleanup(List *pending_entries, List *cancel_requested,
18491902 * remaining entries in the list, leading to slamming that entry's
18501903 * connection shut.
18511904 */
1852- endtime = TimestampTzPlusMilliseconds (GetCurrentTimestamp () ,
1905+ endtime = TimestampTzPlusMilliseconds (now ,
18531906 CONNECTION_CLEANUP_TIMEOUT );
1907+ retrycanceltime = TimestampTzPlusMilliseconds (now ,
1908+ RETRY_CANCEL_TIMEOUT );
18541909
1855- if (!pgfdw_cancel_query_end (entry -> conn , endtime , true))
1910+ if (!pgfdw_cancel_query_end (entry -> conn , endtime ,
1911+ retrycanceltime , true))
18561912 {
18571913 /* Unable to cancel running query */
18581914 pgfdw_reset_xact_state (entry , toplevel );
0 commit comments