@@ -95,6 +95,13 @@ static uint32 pgfdw_we_get_result = 0;
9595 */
9696#define CONNECTION_CLEANUP_TIMEOUT 30000
9797
98+ /*
99+ * Milliseconds to wait before issuing another cancel request. This covers
100+ * the race condition where the remote session ignored our cancel request
101+ * because it arrived while idle.
102+ */
103+ #define RETRY_CANCEL_TIMEOUT 1000
104+
98105/* Macro for constructing abort command to be sent */
99106#define CONSTRUCT_ABORT_COMMAND (sql , entry , toplevel ) \
100107 do { \
@@ -145,6 +152,7 @@ static void pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel);
145152static bool pgfdw_cancel_query (PGconn * conn );
146153static bool pgfdw_cancel_query_begin (PGconn * conn , TimestampTz endtime );
147154static bool pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime ,
155+ TimestampTz retrycanceltime ,
148156 bool consume_input );
149157static bool pgfdw_exec_cleanup_query (PGconn * conn , const char * query ,
150158 bool ignore_errors );
@@ -154,6 +162,7 @@ static bool pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
154162 bool consume_input ,
155163 bool ignore_errors );
156164static bool pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime ,
165+ TimestampTz retrycanceltime ,
157166 PGresult * * result , bool * timed_out );
158167static void pgfdw_abort_cleanup (ConnCacheEntry * entry , bool toplevel );
159168static bool pgfdw_abort_cleanup_begin (ConnCacheEntry * entry , bool toplevel ,
@@ -1322,18 +1331,25 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
13221331static bool
13231332pgfdw_cancel_query (PGconn * conn )
13241333{
1334+ TimestampTz now = GetCurrentTimestamp ();
13251335 TimestampTz endtime ;
1336+ TimestampTz retrycanceltime ;
13261337
13271338 /*
13281339 * If it takes too long to cancel the query and discard the result, assume
13291340 * the connection is dead.
13301341 */
1331- endtime = TimestampTzPlusMilliseconds (GetCurrentTimestamp (),
1332- CONNECTION_CLEANUP_TIMEOUT );
1342+ endtime = TimestampTzPlusMilliseconds (now , CONNECTION_CLEANUP_TIMEOUT );
1343+
1344+ /*
1345+ * Also, lose patience and re-issue the cancel request after a little bit.
1346+ * (This serves to close some race conditions.)
1347+ */
1348+ retrycanceltime = TimestampTzPlusMilliseconds (now , RETRY_CANCEL_TIMEOUT );
13331349
13341350 if (!pgfdw_cancel_query_begin (conn , endtime ))
13351351 return false;
1336- return pgfdw_cancel_query_end (conn , endtime , false);
1352+ return pgfdw_cancel_query_end (conn , endtime , retrycanceltime , false);
13371353}
13381354
13391355/*
@@ -1359,9 +1375,10 @@ pgfdw_cancel_query_begin(PGconn *conn, TimestampTz endtime)
13591375}
13601376
13611377static bool
1362- pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime , bool consume_input )
1378+ pgfdw_cancel_query_end (PGconn * conn , TimestampTz endtime ,
1379+ TimestampTz retrycanceltime , bool consume_input )
13631380{
1364- PGresult * result = NULL ;
1381+ PGresult * result ;
13651382 bool timed_out ;
13661383
13671384 /*
@@ -1380,7 +1397,8 @@ pgfdw_cancel_query_end(PGconn *conn, TimestampTz endtime, bool consume_input)
13801397 }
13811398
13821399 /* Get and discard the result of the query. */
1383- if (pgfdw_get_cleanup_result (conn , endtime , & result , & timed_out ))
1400+ if (pgfdw_get_cleanup_result (conn , endtime , retrycanceltime ,
1401+ & result , & timed_out ))
13841402 {
13851403 if (timed_out )
13861404 ereport (WARNING ,
@@ -1453,7 +1471,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14531471 TimestampTz endtime , bool consume_input ,
14541472 bool ignore_errors )
14551473{
1456- PGresult * result = NULL ;
1474+ PGresult * result ;
14571475 bool timed_out ;
14581476
14591477 Assert (query != NULL );
@@ -1471,7 +1489,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14711489 }
14721490
14731491 /* Get the result of the query. */
1474- if (pgfdw_get_cleanup_result (conn , endtime , & result , & timed_out ))
1492+ if (pgfdw_get_cleanup_result (conn , endtime , endtime , & result , & timed_out ))
14751493 {
14761494 if (timed_out )
14771495 ereport (WARNING ,
@@ -1495,28 +1513,36 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query,
14951513}
14961514
14971515/*
1498- * Get, during abort cleanup, the result of a query that is in progress. This
1499- * might be a query that is being interrupted by transaction abort, or it might
1500- * be a query that was initiated as part of transaction abort to get the remote
1501- * side back to the appropriate state.
1516+ * Get, during abort cleanup, the result of a query that is in progress.
1517+ * This might be a query that is being interrupted by a cancel request or by
1518+ * transaction abort, or it might be a query that was initiated as part of
1519+ * transaction abort to get the remote side back to the appropriate state.
1520+ *
1521+ * endtime is the time at which we should give up and assume the remote side
1522+ * is dead. retrycanceltime is the time at which we should issue a fresh
1523+ * cancel request (pass the same value as endtime if this is not wanted).
15021524 *
1503- * endtime is the time at which we should give up and assume the remote
1504- * side is dead. Returns true if the timeout expired or connection trouble
1505- * occurred, false otherwise. Sets *result except in case of a timeout.
1506- * Sets timed_out to true only when the timeout expired.
1525+ * Returns true if the timeout expired or connection trouble occurred,
1526+ * false otherwise. Sets *result except in case of a true result.
1527+ * Sets *timed_out to true only when the timeout expired.
15071528 */
15081529static bool
1509- pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime , PGresult * * result ,
1530+ pgfdw_get_cleanup_result (PGconn * conn , TimestampTz endtime ,
1531+ TimestampTz retrycanceltime ,
1532+ PGresult * * result ,
15101533 bool * timed_out )
15111534{
15121535 volatile bool failed = false;
15131536 PGresult * volatile last_res = NULL ;
15141537
1538+ * result = NULL ;
15151539 * timed_out = false;
15161540
15171541 /* In what follows, do not leak any PGresults on an error. */
15181542 PG_TRY ();
15191543 {
1544+ int canceldelta = RETRY_CANCEL_TIMEOUT * 2 ;
1545+
15201546 for (;;)
15211547 {
15221548 PGresult * res ;
@@ -1527,8 +1553,33 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result,
15271553 TimestampTz now = GetCurrentTimestamp ();
15281554 long cur_timeout ;
15291555
1556+ /* If timeout has expired, give up. */
1557+ if (now >= endtime )
1558+ {
1559+ * timed_out = true;
1560+ failed = true;
1561+ goto exit ;
1562+ }
1563+
1564+ /* If we need to re-issue the cancel request, do that. */
1565+ if (now >= retrycanceltime )
1566+ {
1567+ /* We ignore failure to issue the repeated request. */
1568+ (void ) libpqsrv_cancel (conn , endtime );
1569+
1570+ /* Recompute "now" in case that took measurable time. */
1571+ now = GetCurrentTimestamp ();
1572+
1573+ /* Adjust re-cancel timeout in increasing steps. */
1574+ retrycanceltime = TimestampTzPlusMilliseconds (now ,
1575+ canceldelta );
1576+ canceldelta += canceldelta ;
1577+ }
1578+
15301579 /* If timeout has expired, give up, else get sleep time. */
1531- cur_timeout = TimestampDifferenceMilliseconds (now , endtime );
1580+ cur_timeout = TimestampDifferenceMilliseconds (now ,
1581+ Min (endtime ,
1582+ retrycanceltime ));
15321583 if (cur_timeout <= 0 )
15331584 {
15341585 * timed_out = true;
@@ -1849,7 +1900,9 @@ pgfdw_finish_abort_cleanup(List *pending_entries, List *cancel_requested,
18491900 foreach (lc , cancel_requested )
18501901 {
18511902 ConnCacheEntry * entry = (ConnCacheEntry * ) lfirst (lc );
1903+ TimestampTz now = GetCurrentTimestamp ();
18521904 TimestampTz endtime ;
1905+ TimestampTz retrycanceltime ;
18531906 char sql [100 ];
18541907
18551908 Assert (entry -> changing_xact_state );
@@ -1863,10 +1916,13 @@ pgfdw_finish_abort_cleanup(List *pending_entries, List *cancel_requested,
18631916 * remaining entries in the list, leading to slamming that entry's
18641917 * connection shut.
18651918 */
1866- endtime = TimestampTzPlusMilliseconds (GetCurrentTimestamp () ,
1919+ endtime = TimestampTzPlusMilliseconds (now ,
18671920 CONNECTION_CLEANUP_TIMEOUT );
1921+ retrycanceltime = TimestampTzPlusMilliseconds (now ,
1922+ RETRY_CANCEL_TIMEOUT );
18681923
1869- if (!pgfdw_cancel_query_end (entry -> conn , endtime , true))
1924+ if (!pgfdw_cancel_query_end (entry -> conn , endtime ,
1925+ retrycanceltime , true))
18701926 {
18711927 /* Unable to cancel running query */
18721928 pgfdw_reset_xact_state (entry , toplevel );
0 commit comments