Replace wal_sender_timeout-based liveness with TCP keepalive.

Ibrar Ahmed · Ibrar Ahmed · commit 1b0262a8855c · 2026-03-04T11:53:27.000+05:00
The apply worker previously relied on wal_sender_timeout as both a
server-side disconnect trigger and an indirect keepalive pressure on
the subscriber.  This caused spurious disconnects in two scenarios:
a flood of 'w' messages keeping the subscriber too busy to send 'r'
feedback in time, and large transactions whose apply time exceeded
wal_sender_timeout.

The workaround was maybe_send_feedback(), which force-sent 'r' after
every 10 'w' messages or wal_sender_timeout/2, whichever came first.
This was a fragile band-aid that coupled subscriber behavior to a
server GUC it cannot control.

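Replace the entire mechanism with a two-layer liveness model (TCP
keepalive plus a subscriber-side idle timeout) and disable
wal_sender_timeout on replication connections: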

- TCP keepalive is the primary liveness detector on both sides.  The
  libpq connection uses keepalives_idle=10, keepalives_interval=5 and
  keepalives_count=3, so a dead network or crashed host is detected
  in ~25 seconds (10s idle + 3 probes * 5s).

- wal_sender_timeout=0 is set on replication connections so the
  walsender never disconnects due to missing 'r' feedback.  Liveness
  on the server side is now handled entirely by TCP keepalive.

- spock.apply_idle_timeout (default 300s) is a subscriber-side safety
  net for a hung-but-connected walsender: the kernel still answers TCP
  keepalive probes, but the walsender process sends no data.  Set to 0
  to disable.

Fix a bug in last_receive_timestamp handling: it was updated
unconditionally after every PQgetCopyData call, including when r==0
(no data available).  Each 1-second WL_TIMEOUT spin therefore reset
the timer, so an idle-based check such as apply_idle_timeout could
never fire.  Move the update to after the r==0 guard so it reflects
actual data receipt only.

Remove maybe_send_feedback() as it is no longer needed.
diff --git a/include/spock.h b/include/spock.h
@@ -53,6 +53,7 @@ extern int	restart_delay_on_exception;
 extern int	spock_replay_queue_size;	/* Deprecated - no longer used */
 extern bool check_all_uc_indexes;
 extern bool	spock_enable_quiet_mode;
+extern int	spock_apply_idle_timeout;
 
 extern char *shorten_hash(const char *str, int maxlen);
 extern void gen_slot_name(Name slot_name, char *dbname,
diff --git a/src/spock.c b/src/spock.c
@@ -135,6 +135,7 @@ int			restart_delay_on_exception;
 int			spock_replay_queue_size;	/* Deprecated - no longer used */
 bool		check_all_uc_indexes = false;
 bool		spock_enable_quiet_mode = false;
+int			spock_apply_idle_timeout = 300;
 
 static emit_log_hook_type prev_emit_log_hook = NULL;
 static Checkpoint_hook_type prev_Checkpoint_hook = NULL;
@@ -304,7 +305,7 @@ get_spock_table_oid(const char *table)
 	return reloid;
 }
 
-#define CONN_PARAM_ARRAY_SIZE 9
+#define CONN_PARAM_ARRAY_SIZE 10
 
 static PGconn *
 spock_connect_base(const char *connstr, const char *appname,
@@ -345,17 +346,32 @@ spock_connect_base(const char *connstr, const char *appname,
 	vals[i] = "1";
 	i++;
 	keys[i] = "keepalives_idle";
-	vals[i] = "20";
+	vals[i] = "10";
 	i++;
 	keys[i] = "keepalives_interval";
-	vals[i] = "20";
+	vals[i] = "5";
 	i++;
 	keys[i] = "keepalives_count";
-	vals[i] = "5";
+	vals[i] = "3";
 	i++;
 	keys[i] = "replication";
 	vals[i] = replication ? "database" : NULL;
 	i++;
+	/*
+	 * For replication connections, disable the server-side walsender timeout.
+	 * Liveness detection is handled by TCP keepalives (keepalives_idle /
+	 * keepalives_interval / keepalives_count above) on both sides, and by
+	 * spock.apply_idle_timeout on the subscriber side as a safety net for a
+	 * hung-but-connected walsender.  Leaving wal_sender_timeout active would
+	 * cause spurious disconnects whenever the subscriber is legitimately busy
+	 * applying a large transaction and cannot send 'r' feedback in time.
+	 */
+	if (replication)
+	{
+		keys[i] = "options";
+		vals[i] = "-c wal_sender_timeout=0";
+		i++;
+	}
 	keys[i] = NULL;
 	vals[i] = NULL;
 
@@ -1186,6 +1202,22 @@ _PG_init(void)
 							NULL,
 							NULL);
 
+	DefineCustomIntVariable("spock.apply_idle_timeout",
+							"Maximum idle time in seconds before apply worker reconnects",
+							"Safety net for detecting a hung walsender that keeps the "
+							"TCP connection alive but stops sending data. The timer "
+							"resets on any received message. Set to 0 to disable and "
+							"rely solely on TCP keepalive for liveness detection.",
+							&spock_apply_idle_timeout,
+							300,
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_S,
+							NULL,
+							NULL,
+							NULL);
+
 	if (IsBinaryUpgrade)
 		return;
 
diff --git a/src/spock_apply.c b/src/spock_apply.c
@@ -229,8 +229,6 @@ static bool should_log_exception(bool failed);
 static ApplyReplayEntry * apply_replay_entry_create(int r, char *buf);
 static void apply_replay_entry_free(ApplyReplayEntry * entry);
 static void apply_replay_queue_reset(void);
-static void maybe_send_feedback(PGconn *applyconn, XLogRecPtr lsn_to_send,
-								TimestampTz *last_receive_timestamp);
 static void append_feedback_position(XLogRecPtr recvpos);
 static void get_feedback_position(XLogRecPtr *recvpos, XLogRecPtr *writepos,
 								  XLogRecPtr *flushpos, XLogRecPtr *max_recvpos);
@@ -2838,25 +2836,24 @@ apply_work(PGconn *streamConn)
 			}
 
 			/*
-			 * The walsender is supposed to ping us for a status update every
-			 * wal_sender_timeout / 2 milliseconds. If we don't get those, we
-			 * assume that we have lost the connection.
-			 *
-			 * Note: keepalive configuration is supposed to cover this but is
-			 * apparently unreliable.
+			 * Connection liveness is handled by TCP keepalive (primary)
+			 * and PQstatus == CONNECTION_BAD (above). The idle timeout
+			 * below is a safety net for the case where the walsender
+			 * process is alive but hung -- TCP probes succeed because the
+			 * kernel ACKs them, but no data is being sent.
 			 */
-			if (rc & WL_TIMEOUT)
+			if (rc & WL_TIMEOUT && spock_apply_idle_timeout > 0)
 			{
 				TimestampTz timeout;
 
 				timeout = TimestampTzPlusMilliseconds(last_receive_timestamp,
-													  (wal_sender_timeout * 3) / 2);
+													  (long) spock_apply_idle_timeout * 1000L);
 				if (GetCurrentTimestamp() > timeout)
 				{
 					MySpockWorker->worker_status = SPOCK_WORKER_STATUS_STOPPED;
-					elog(ERROR, "SPOCK %s: terminating apply due to missing "
-						 "walsender ping",
-						 MySubscription->name);
+					elog(ERROR, "SPOCK %s: no data received for %d seconds, "
+						 "reconnecting (spock.apply_idle_timeout)",
+						 MySubscription->name, spock_apply_idle_timeout);
 				}
 			}
 
@@ -2879,8 +2876,6 @@ apply_work(PGconn *streamConn)
 					/* We are not in replay mode so receive from the stream */
 					r = PQgetCopyData(applyconn, &buf, 1);
 
-					last_receive_timestamp = GetCurrentTimestamp();
-
 					/* Check for errors */
 					if (r == -1)
 					{
@@ -2912,6 +2907,14 @@ apply_work(PGconn *streamConn)
 						break;
 					}
 
+					/*
+					 * We received actual data. Update the idle-timeout clock
+					 * only here, after confirming r > 0, so that a WL_TIMEOUT
+					 * spin with no incoming data does not silently reset the
+					 * timer and mask a hung walsender.
+					 */
+					last_receive_timestamp = GetCurrentTimestamp();
+
 					/*
 					 * We have a valid message, create an apply queue entry
 					 * but don't add it to the queue yet.
@@ -2947,16 +2950,6 @@ apply_work(PGconn *streamConn)
 					end_lsn = pq_getmsgint64(msg);
 					pq_getmsgint64(msg);	/* sendTime */
 
-					/*
-					 * Call maybe_send_feedback before last_received is
-					 * updated. This ordering guarantees that feedback LSN
-					 * never advertises a position beyond what has actually
-					 * been received and processed. Prevents skipping over
-					 * unapplied changes due to premature flush LSN.
-					 */
-					maybe_send_feedback(applyconn, last_received,
-										&last_receive_timestamp);
-
 					if (last_received < start_lsn)
 						last_received = start_lsn;
 
@@ -3924,39 +3917,6 @@ apply_replay_queue_reset(void)
 	MemoryContextReset(ApplyReplayContext);
 }
 
-/*
- * Check if we should send feedback based on message count or timeout.
- * Resets internal state if feedback is sent.
- */
-static void
-maybe_send_feedback(PGconn *applyconn, XLogRecPtr lsn_to_send,
-					TimestampTz *last_receive_timestamp)
-{
-	static int	w_message_count = 0;
-	TimestampTz now = GetCurrentTimestamp();
-
-	w_message_count++;
-
-	/*
-	 * Send feedback if wal_sender_timeout/2 has passed or after 10 'w'
-	 * messages.
-	 */
-	if (TimestampDifferenceExceeds(*last_receive_timestamp, now, wal_sender_timeout / 2) ||
-		w_message_count >= 10)
-	{
-		elog(DEBUG2, "SPOCK %s: force sending feedback after %d 'w' messages or timeout",
-			 MySubscription->name, w_message_count);
-
-		/*
-		 * We need to send feedback to the walsender process to avoid remote
-		 * wal_sender_timeout.
-		 */
-		send_feedback(applyconn, lsn_to_send, now, true);
-		*last_receive_timestamp = now;
-		w_message_count = 0;
-	}
-}
-
 /*
  * Advance the replication origin for forwarded transactions.
  *