diff --git a/tests/debezium/checkpoint-restart-test.sh b/tests/debezium/checkpoint-restart-test.sh
new file mode 100755
index 00000000..a8cddc0a
--- /dev/null
+++ b/tests/debezium/checkpoint-restart-test.sh
@@ -0,0 +1,517 @@
+#!/usr/bin/env bash
+# checkpoint-restart-test.sh — Single-instance checkpoint/restart test.
+#
+# Same logic as the RAC version but against local Docker Oracle XE 21c.
+# Tests whether Bug 1 (duplicate SYS.TAB$) and Bug 2 (heap-use-after-free)
+# are RAC-specific or also present in single-instance mode.
+#
+# Usage: ./checkpoint-restart-test.sh [kill-count]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPTS_DIR="$(cd "$SCRIPT_DIR/../sql/scripts" && pwd)"
+
+KILL_COUNT="${1:-3}"
+RECEIVER_URL="http://localhost:8080"
+POLL_TIMEOUT=180
+OLR_CONTAINER="dbz-olr"
+ORACLE_CONTAINER="dbz-oracle"
+# NOTE(review): ORACLE_PORT is not referenced by any helper below; DB_CONN uses
+# 1521 because sqlplus is executed inside the Oracle container (docker exec).
+ORACLE_PORT="${ORACLE_PORT:-1522}"
+DB_CONN="olr_test/olr_test@//localhost:1521/XEPDB1"
+CHECKPOINT_VOL="debezium_olr-checkpoint"
+DDL_BETWEEN_RESTARTS="${DDL_BETWEEN_RESTARTS:-true}"
+
+# Scratch directory for generated SQL, the OLR config and saved logs;
+# removed on any exit path.
+WORK_DIR=$(mktemp -d /tmp/chkpt_single_XXXXXX)
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+# ---- Config with short checkpoint interval ----
+cat > "$WORK_DIR/olr-config.json" << 'EOF'
+{
+  "version": "1.9.0",
+  "log-level": 4,
+  "state": {
+    "type": "disk",
+    "path": "/olr-data/checkpoint",
+    "interval-s": 10,
+    "interval-mb": 1
+  },
+  "memory": {
+    "min-mb": 64,
+    "max-mb": 256
+  },
+  "source": [
+    {
+      "alias": "SOURCE",
+      "name": "XE",
+      "reader": {
+        "type": "online",
+        "user": "c##dbzuser",
+        "password": "dbz",
+        "server": "//oracle:1521/XEPDB1"
+      },
+      "format": {
+        "type": "debezium",
+        "scn-type": 1,
+        "timestamp-type": 1,
+        "user-type": 0,
+        "redo-thread": 0
+      },
+      "filter": {
+        "table": [
+          {"owner": "OLR_TEST", "table": ".*"}
+        ]
+      }
+    }
+  ],
+  "target": [
+    {
+      "alias": "DEBEZIUM",
+      "source": "SOURCE",
+      "writer": {
+        "type": "network",
+        "uri": "0.0.0.0:5000"
+      }
+    }
+  ]
+}
+EOF
+
+# Run a SQL script that already exists inside the Oracle container.
+#   $1 - sqlplus connect string
+#   $2 - path of the script inside the container
+# NOTE(review): not called by any code in the visible part of this script.
+_sqlplus() {
+    docker exec "$ORACLE_CONTAINER" sqlplus -S "$1" @"$2"
+}
+
+# Copy a local SQL file into the Oracle container and run it as SYSDBA.
+#   $1 - local path of the SQL file
+# Outputs: sqlplus output on stdout.
+_exec_sysdba() {
+    local sql_file="$1"
+    local remote
+    remote="/tmp/$(basename "$sql_file")"
+    docker cp "$sql_file" "$ORACLE_CONTAINER:$remote"
+    docker exec "$ORACLE_CONTAINER" bash -c "export ORACLE_SID=XE; sqlplus -S / as sysdba @$remote"
+}
+
+# Copy a local SQL file into the Oracle container and run it as $DB_CONN.
+#   $1 - local path of the SQL file
+# Outputs: sqlplus output on stdout; on error the output goes to stderr.
+# Returns: 1 when any output line starts with ORA- or SP2-, else 0.
+_exec_user() {
+    local sql_file="$1"
+    local remote
+    remote="/tmp/$(basename "$sql_file")"
+    docker cp "$sql_file" "$ORACLE_CONTAINER:$remote"
+    local output
+    output=$(docker exec "$ORACLE_CONTAINER" bash -c "sqlplus -S '$DB_CONN' @$remote")
+    # Here-string instead of `echo | grep -q`: with pipefail an early grep
+    # exit could otherwise make the pipeline (and this check) fail.
+    if grep -q "^ORA-\|^SP2-" <<< "$output"; then
+        echo "ERROR: SQL failed:" >&2
+        echo "$output" >&2
+        return 1
+    fi
+    echo "$output"
+}
+
+# Force a redo log switch and pause two seconds inside the database session.
+_log_switch() {
+    cat > "$WORK_DIR/log_switch.sql" <<'SQL'
+SET FEEDBACK OFF
+ALTER SYSTEM SWITCH LOGFILE;
+BEGIN DBMS_SESSION.SLEEP(2); END;
+/
+EXIT
+SQL
+    _exec_sysdba "$WORK_DIR/log_switch.sql" > /dev/null
+}
+
+# Print the "scn" field of XE-chkpt.json from the checkpoint volume.
+# Prints "0" when the volume or file is missing or the JSON cannot be parsed.
+_read_checkpoint_scn() {
+    # Guard first: `docker run -v <name>` would create a missing named volume.
+    if ! docker volume inspect "$CHECKPOINT_VOL" > /dev/null 2>&1; then
+        echo "0"
+        return
+    fi
+    docker run --rm -v "${CHECKPOINT_VOL}:/data" alpine sh -c \
+        'cat /data/XE-chkpt.json 2>/dev/null' 2>/dev/null | \
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('scn',0))" 2>/dev/null || echo "0"
+}
+
+# Print a human-readable summary ("scn=..., idx=...") of XE-chkpt.json,
+# or "no checkpoint" when it cannot be read or parsed.
+_read_checkpoint() {
+    docker run --rm -v "${CHECKPOINT_VOL}:/data" alpine sh -c \
+        'cat /data/XE-chkpt.json 2>/dev/null' 2>/dev/null | \
+        python3 -c "
+import sys,json
+try:
+    d=json.load(sys.stdin)
+    print(f'scn={d[\"scn\"]}, idx={d.get(\"idx\",\"?\")}')
+except: print('no checkpoint')
+" 2>/dev/null || echo "no checkpoint"
+}
+
+# Poll every 2s (15 attempts) until the checkpoint SCN is non-zero and
+# differs from the value read on entry.
+# Returns: 0 when a new checkpoint is seen, 1 after 30s without one.
+_wait_for_checkpoint() {
+    echo " Waiting for OLR checkpoint (up to 30s)..."
+    local prev_scn cur_scn attempt
+    prev_scn=$(_read_checkpoint_scn)
+    for (( attempt = 1; attempt <= 15; attempt++ )); do
+        sleep 2
+        cur_scn=$(_read_checkpoint_scn)
+        # One test covers both "first checkpoint" (baseline 0) and
+        # "checkpoint advanced" (baseline non-zero).
+        if [[ "$cur_scn" != "0" && "$cur_scn" != "$prev_scn" ]]; then
+            echo " Checkpoint written: scn=$cur_scn"
+            return 0
+        fi
+    done
+    echo " WARNING: No checkpoint after 30s"
+    return 1
+}
+
+# (Re)create the OLR container on the Oracle container's Docker network,
+# mounting the generated config and the checkpoint volume.
+# Returns: non-zero when the network cannot be determined or docker run fails.
+_launch_olr() {
+    echo " Launching OLR..."
+    docker rm -f "$OLR_CONTAINER" > /dev/null 2>&1 || true
+    # Get Oracle container's network. `println` emits one name per line so
+    # that head really selects the first network when several are attached.
+    local net
+    net=$(docker inspect "$ORACLE_CONTAINER" \
+        --format '{{range $k,$v := .NetworkSettings.Networks}}{{println $k}}{{end}}' | head -n 1)
+    if [[ -z "$net" ]]; then
+        echo " ERROR: could not determine Docker network of $ORACLE_CONTAINER" >&2
+        return 1
+    fi
+
+    docker run -d --name "$OLR_CONTAINER" \
+        --network "$net" \
+        --network-alias olr \
+        --group-add 54321 \
+        --entrypoint bash \
+        -v "$WORK_DIR/olr-config.json:/config/olr-config.json:ro" \
+        -v "${CHECKPOINT_VOL}:/olr-data/checkpoint" \
+        -v "debezium_oracle-data:/opt/oracle/oradata:ro" \
+        -w /olr-data \
+        olr-dev:latest \
+        -c "mkdir -p /olr-data/checkpoint && /opt/OpenLogReplicator/OpenLogReplicator -r -f /config/olr-config.json" \
+        > /dev/null
+    echo " OLR container started"
+}
+
+# Poll the OLR container every 2s (90 attempts) until its log contains
+# "processing redo log".
+# Returns: 0 when ready; 1 when the container exited or on timeout
+#          (the last 20 log lines are written to stderr in both cases).
+_wait_olr_ready() {
+    echo " Waiting for OLR to start processing..."
+    local attempt state
+    for (( attempt = 1; attempt <= 90; attempt++ )); do
+        state=$(docker inspect "$OLR_CONTAINER" --format '{{.State.Status}}' 2>/dev/null || echo "unknown")
+        if [[ "$state" == "exited" ]]; then
+            echo " ERROR: OLR exited unexpectedly" >&2
+            docker logs "$OLR_CONTAINER" 2>&1 | tail -20 >&2
+            return 1
+        fi
+        # No -q here: grep must drain the whole log, otherwise docker logs can
+        # die from SIGPIPE and pipefail turns a successful match into failure.
+        if docker logs "$OLR_CONTAINER" 2>&1 | grep "processing redo log" > /dev/null; then
+            echo " OLR: ready (${attempt}x2s)"
+            return 0
+        fi
+        sleep 2
+    done
+    echo " ERROR: OLR did not become ready in 180s" >&2
+    docker logs "$OLR_CONTAINER" 2>&1 | tail -20 >&2
+    return 1
+}
+
+# Save the OLR container log for this cycle, then SIGKILL and remove it.
+#   $1 - cycle label used in the saved log file name (default: unknown)
+_kill_olr() {
+    local cycle="${1:-unknown}"
+    echo " Killing OLR (SIGKILL)..."
+    # Preserve logs before removing container
+    docker logs "$OLR_CONTAINER" > "$WORK_DIR/olr-cycle-${cycle}.log" 2>&1 || true
+    docker kill "$OLR_CONTAINER" > /dev/null 2>&1 || true
+    docker rm -f "$OLR_CONTAINER" > /dev/null 2>&1 || true
+    echo " Last checkpoint: $(_read_checkpoint)"
+}
+
+echo "=== Single-Instance Checkpoint/Restart Test ==="
+echo " Kill cycles: $KILL_COUNT"
+echo ""
+
+# ---- Stage 1: Verify services ----
+echo "--- Stage 1: Verify services ---"
+
+# Exact, whole-line match on the container name; grep reads all of its input
+# so the pipeline status is not affected by SIGPIPE under pipefail.
+if ! docker ps --format '{{.Names}}' | grep -Fx -- "$ORACLE_CONTAINER" > /dev/null; then
+    echo "ERROR: Oracle container not running. Run: cd tests/debezium && docker compose up -d oracle" >&2
+    exit 1
+fi
+echo " Oracle: OK"
+
+if ! curl -sf "$RECEIVER_URL/health" > /dev/null 2>&1; then
+    echo "ERROR: Receiver not responding. Run: cd tests/debezium && docker compose up -d receiver" >&2
+    exit 1
+fi
+echo " Receiver: OK"
+
+# ---- Stage 2: Setup ----
+echo ""
+echo "--- Stage 2: Setup ---"
+
+# Recreate the test table; ORA-00942 (table does not exist) on DROP is ignored.
+cat > "$WORK_DIR/setup.sql" <<'SQL'
+SET FEEDBACK OFF
+SET SERVEROUTPUT ON
+
+BEGIN EXECUTE IMMEDIATE 'DROP TABLE olr_test.CHKPT_TEST PURGE'; EXCEPTION WHEN OTHERS THEN IF SQLCODE != -942 THEN RAISE; END IF; END;
+/
+
+CREATE TABLE olr_test.CHKPT_TEST (
+    id NUMBER PRIMARY KEY,
+    val VARCHAR2(200),
+    phase VARCHAR2(50),
+    created TIMESTAMP DEFAULT SYSTIMESTAMP
+);
+ALTER TABLE olr_test.CHKPT_TEST ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
+
+EXIT
+SQL
+_exec_user "$WORK_DIR/setup.sql"
+_log_switch
+
+# Clean checkpoint volume and fix permissions
+docker rm -f "$OLR_CONTAINER" > /dev/null 2>&1 || true
+docker volume rm -f "$CHECKPOINT_VOL" > /dev/null 2>&1 || true
+docker volume create "$CHECKPOINT_VOL" > /dev/null
+# OLR runs as uid=1000, gid=54322 — set volume ownership
+docker run --rm -v "${CHECKPOINT_VOL}:/data" alpine sh -c "chown 1000:54322 /data && chmod 775 /data"
+echo " Checkpoint volume cleared (permissions set for uid=1000)"
+
+# Restart Debezium connectors
+echo " Restarting Debezium connectors..."
+cd "$SCRIPT_DIR" +for svc in dbz-logminer dbz-olr; do + docker compose rm -sf "$svc" > /dev/null 2>&1 || true +done +docker volume rm -f debezium_dbz-logminer-data debezium_dbz-olr-data > /dev/null 2>&1 || true +docker compose up -d dbz-logminer > /dev/null 2>&1 +cd - > /dev/null + +for i in $(seq 1 60); do + if docker logs dbz-logminer 2>&1 | tail -20 | grep -q "Starting streaming"; then + echo " LogMiner: streaming" + break + fi + if [[ $i -eq 60 ]]; then + echo "ERROR: LogMiner did not start" >&2 + docker logs dbz-logminer 2>&1 | tail -10 >&2 + exit 1 + fi + sleep 2 +done + +curl -sf -X POST "$RECEIVER_URL/reset" > /dev/null + +# ---- Stage 3: Kill/restart cycles ---- +echo "" +echo "--- Stage 3: Kill/restart cycles ---" + +BATCH=50 +NEXT_ID=1000 +CHECKPOINT_VERIFIED=0 + +# Launch OLR, then start Debezium adapter (which connects to OLR and triggers processing) +_launch_olr +sleep 2 +cd "$SCRIPT_DIR" +docker compose up -d --no-deps dbz-olr > /dev/null 2>&1 +cd - > /dev/null +_wait_olr_ready + +for cycle in $(seq 1 "$KILL_COUNT"); do + echo "" + echo " === Cycle $cycle / $KILL_COUNT ===" + + # Phase A: DML while running + cat > "$WORK_DIR/dml.sql" < /dev/null + _log_switch + echo " c${cycle}_running: inserted $BATCH rows (IDs ${NEXT_ID}-$(( NEXT_ID + BATCH - 1 )))" + NEXT_ID=$(( NEXT_ID + BATCH )) + + # Wait for checkpoint + _wait_for_checkpoint || true + PRE_KILL_SCN=$(_read_checkpoint_scn) + + # Kill + _kill_olr "$cycle" + + # DDL while OLR is down (triggers Bug 1: schema checkpoint accumulation) + if [[ "$DDL_BETWEEN_RESTARTS" == "true" ]]; then + cat > "$WORK_DIR/ddl.sql" < /dev/null + _log_switch + echo " c${cycle}: DDL (DROP+CREATE CHKPT_AUX)" + fi + + # Phase B: DML while down + cat > "$WORK_DIR/dml.sql" < /dev/null + _log_switch + echo " c${cycle}_offline: inserted $BATCH rows (IDs ${NEXT_ID}-$(( NEXT_ID + BATCH - 1 )))" + NEXT_ID=$(( NEXT_ID + BATCH )) + + # Restart — adapter auto-reconnects (restart: unless-stopped) + _launch_olr + sleep 2 + 
_wait_olr_ready + + # Verify checkpoint resume + POST_RESTART_SCN=$(_read_checkpoint_scn) + if [[ "$PRE_KILL_SCN" != "0" && "$POST_RESTART_SCN" != "0" ]]; then + echo " Checkpoint resume: pre-kill=$PRE_KILL_SCN, post-restart=$POST_RESTART_SCN" + if [[ "$POST_RESTART_SCN" -ge "$PRE_KILL_SCN" ]]; then + echo " PASS: resumed from checkpoint" + CHECKPOINT_VERIFIED=$(( CHECKPOINT_VERIFIED + 1 )) + fi + fi + + _wait_for_checkpoint || true + sleep 5 + + # Phase C: DML after restart + cat > "$WORK_DIR/dml.sql" < /dev/null + _log_switch + echo " c${cycle}_resumed: inserted $BATCH rows (IDs ${NEXT_ID}-$(( NEXT_ID + BATCH - 1 )))" + NEXT_ID=$(( NEXT_ID + BATCH )) +done + +TOTAL_ROWS=$(( KILL_COUNT * 3 * BATCH )) +echo "" +echo " Total: $TOTAL_ROWS rows" + +# ---- Stage 4: Sentinel ---- +echo "" +echo "--- Stage 4: Sentinel + wait ---" +_log_switch +sleep 5 +_log_switch + +cat > "$WORK_DIR/sentinel.sql" <<'SQL' +DELETE FROM DEBEZIUM_SENTINEL; +INSERT INTO DEBEZIUM_SENTINEL VALUES (1, 'checkpoint-restart-test'); +COMMIT; +EXIT; +SQL +_exec_user "$WORK_DIR/sentinel.sql" > /dev/null +echo " Sentinel inserted" +_log_switch + +START_TIME=$(date +%s) +SENTINEL_OK=true +while true; do + ELAPSED=$(( $(date +%s) - START_TIME )) + if [[ $ELAPSED -ge $POLL_TIMEOUT ]]; then + echo "" + echo "ERROR: Timeout" >&2 + SENTINEL_OK=false + break + fi + STATUS=$(curl -sf "$RECEIVER_URL/status" 2>/dev/null || echo '{}') + LM_SENTINEL=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('logminer_sentinel',False))" 2>/dev/null || echo "False") + OLR_SENTINEL=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('olr_sentinel',False))" 2>/dev/null || echo "False") + LM_COUNT=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('logminer_count',0))" 2>/dev/null || echo "0") + OLR_COUNT=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('olr_count',0))" 2>/dev/null || echo "0") + + printf "\r 
[%3ds] LM: %s (sentinel: %s) | OLR: %s (sentinel: %s) " \ + "$ELAPSED" "$LM_COUNT" "$LM_SENTINEL" "$OLR_COUNT" "$OLR_SENTINEL" + + if [[ "$LM_SENTINEL" == "True" && "$OLR_SENTINEL" == "True" ]]; then + echo "" + echo " Done" + break + fi + sleep 2 +done + +# ---- Stage 5: Compare ---- +echo "" +echo "--- Stage 5: Compare ---" +LM_FILE="$SCRIPT_DIR/output/logminer.jsonl" +OLR_FILE="$SCRIPT_DIR/output/olr.jsonl" + +COMPARE_RESULT=0 +if [[ "$SENTINEL_OK" != "true" ]]; then + echo " FAIL: Sentinel timeout" + COMPARE_RESULT=1 +elif python3 "$SCRIPTS_DIR/compare-debezium.py" "$LM_FILE" "$OLR_FILE"; then + echo " Data accuracy: PASS" +else + echo " Data accuracy: FAIL" + COMPARE_RESULT=1 +fi + +# ---- Check for ASAN errors and OLR errors ---- +echo "" +echo "--- Stage 6: Error checks ---" +ASAN_RESULT=0 +OLR_ERROR_RESULT=0 +# Save final container logs and combine with per-cycle logs +docker logs "$OLR_CONTAINER" > "$WORK_DIR/olr-cycle-final.log" 2>&1 || true +cat "$WORK_DIR"/olr-cycle-*.log > "$WORK_DIR/olr-all.log" 2>/dev/null || true + +if grep -q "AddressSanitizer\|ABORTING" "$WORK_DIR/olr-all.log"; then + echo " FAIL: ASAN errors detected" + grep -A5 "AddressSanitizer" "$WORK_DIR/olr-all.log" | head -10 + ASAN_RESULT=1 +else + echo " PASS: No ASAN errors" +fi + +if grep -q "duplicate\|ERROR 50022" "$WORK_DIR/olr-all.log"; then + echo " FAIL: OLR duplicate/schema errors detected" + grep "duplicate\|ERROR 50022" "$WORK_DIR/olr-all.log" | head -5 + OLR_ERROR_RESULT=1 +else + echo " PASS: No OLR duplicate/schema errors" +fi + +# ---- Summary ---- +echo "" +echo "========================================" +echo " Single-Instance Checkpoint/Restart" +echo "========================================" +echo " Cycles: $KILL_COUNT, Rows: $TOTAL_ROWS" +CHKPT_RESULT=0 +if [[ $CHECKPOINT_VERIFIED -lt 1 ]]; then + CHKPT_RESULT=1 +fi + +echo " Checkpoint verified: $CHECKPOINT_VERIFIED / $KILL_COUNT" +echo " Accuracy: $([ $COMPARE_RESULT -eq 0 ] && echo PASS || echo FAIL)" +echo " ASAN: 
$([ $ASAN_RESULT -eq 0 ] && echo PASS || echo FAIL)" +echo " OLR errors: $([ $OLR_ERROR_RESULT -eq 0 ] && echo PASS || echo FAIL)" +echo " Checkpoint: $([ $CHKPT_RESULT -eq 0 ] && echo PASS || echo FAIL)" +echo " DDL between restarts: $DDL_BETWEEN_RESTARTS" + +if [[ $COMPARE_RESULT -eq 0 && $ASAN_RESULT -eq 0 && $OLR_ERROR_RESULT -eq 0 && $CHKPT_RESULT -eq 0 ]]; then + echo "" + echo "=== PASS ===" +else + echo "" + echo "=== FAIL ===" +fi + +exit $(( COMPARE_RESULT + ASAN_RESULT + OLR_ERROR_RESULT + CHKPT_RESULT )) diff --git a/tests/debezium/docker-compose.yaml b/tests/debezium/docker-compose.yaml index 8fef3161..abf2c3a3 100644 --- a/tests/debezium/docker-compose.yaml +++ b/tests/debezium/docker-compose.yaml @@ -62,6 +62,7 @@ services: dbz-olr: image: quay.io/debezium/server:3.5.0.Beta1 container_name: dbz-olr-adapter + restart: unless-stopped depends_on: olr: condition: service_started @@ -75,5 +76,6 @@ services: volumes: oracle-data: + olr-checkpoint: dbz-logminer-data: dbz-olr-data: diff --git a/tests/sql/environments/rac/.env b/tests/sql/environments/rac/.env index 82e7c57e..8b44d91c 100644 --- a/tests/sql/environments/rac/.env +++ b/tests/sql/environments/rac/.env @@ -1,3 +1,3 @@ -DB_CONN=olr_test/olr_test@//192.168.122.248:1521/ORCLPDB +DB_CONN=olr_test/olr_test@//192.168.122.130:1521/ORCLPDB PDB_NAME=ORCLPDB INCLUDE_TAGS=rac diff --git a/tests/sql/environments/rac/debezium/checkpoint-restart-test.sh b/tests/sql/environments/rac/debezium/checkpoint-restart-test.sh new file mode 100755 index 00000000..3e71b440 --- /dev/null +++ b/tests/sql/environments/rac/debezium/checkpoint-restart-test.sh @@ -0,0 +1,641 @@ +#!/usr/bin/env bash +# checkpoint-restart-test.sh — Verify OLR resumes correctly from checkpoint after crash. +# +# Runs OLR against RAC, generates DML, waits for checkpoint, kills OLR, +# generates more DML while OLR is down, restarts OLR, verifies checkpoint +# resume, generates final DML, then compares OLR output against LogMiner. 
+# +# A successful test proves: no duplicates, no gaps after crash + restart, +# and that OLR resumes from checkpoint SCN (not start SCN). +# +# Usage: ./checkpoint-restart-test.sh [kill-count] +# kill-count Number of kill/restart cycles (default: 3) +# +# Prerequisites: +# - RAC VM running with containers started +# - OLR image loaded on VM (podman load) +# - One-time setup done (./setup.sh) +# - Local services running (docker compose up -d) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RAC_ENV_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +SQL_DIR="$(cd "$RAC_ENV_DIR/../.." && pwd)" +TESTS_DIR="$(cd "$SQL_DIR/.." && pwd)" +PROJECT_ROOT="$(cd "$TESTS_DIR/.." && pwd)" +SCRIPTS_DIR="$SQL_DIR/scripts" + +KILL_COUNT="${1:-3}" + +# ---- RAC configuration (auto-detect VM IP) ---- +source "$RAC_ENV_DIR/vm-env.sh" +OLR_IMAGE="${OLR_IMAGE:-docker.io/library/olr-dev:latest}" +RAC_NODE1="${RAC_NODE1:-racnodep1}" +RAC_NODE2="${RAC_NODE2:-racnodep2}" +ORACLE_SID1="${ORACLE_SID1:-ORCLCDB1}" +ORACLE_SID2="${ORACLE_SID2:-ORCLCDB2}" +DB_CONN1="${DB_CONN1:-olr_test/olr_test@//racnodep1:1521/ORCLPDB}" +DB_CONN2="${DB_CONN2:-olr_test/olr_test@//racnodep2:1521/ORCLPDB}" + +OLR_CONTAINER="olr-debezium" +RECEIVER_URL="${RECEIVER_URL:-http://localhost:8080}" +POLL_TIMEOUT="${POLL_TIMEOUT:-300}" + +# ---- SSH helpers ---- +_vm_sqlplus() { + local node="$1" sid="$2" conn="$3" sql_file="$4" + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman exec $node su - oracle -c 'export ORACLE_SID=$sid; sqlplus -S \"$conn\" @$sql_file'" +} + +_vm_copy_in() { + local local_path="$1" container_path="$2" node="$3" + local staging="/tmp/_chkpt_staging_$$" + scp $_SSH_OPTS "$local_path" "${VM_USER}@${VM_HOST}:${staging}" + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman cp ${staging} ${node}:${container_path}; rm -f ${staging}" +} + +_exec_sysdba() { + local sql_file="$1" + local remote="/tmp/$(basename "$sql_file")" + _vm_copy_in "$sql_file" "$remote" "$RAC_NODE1" + _vm_sqlplus 
"$RAC_NODE1" "$ORACLE_SID1" "/ as sysdba" "$remote" +} + +_exec_user() { + local sql_file="$1" + local node="${2:-$RAC_NODE1}" sid="${3:-$ORACLE_SID1}" conn="${4:-$DB_CONN1}" + local remote="/tmp/$(basename "$sql_file")" + _vm_copy_in "$sql_file" "$remote" "$node" + local output + output=$(_vm_sqlplus "$node" "$sid" "$conn" "$remote") + # Check for Oracle errors in output + if echo "$output" | grep -q "^ORA-\|^SP2-"; then + echo "ERROR: SQL execution failed on $node:" >&2 + echo "$output" >&2 + return 1 + fi + echo "$output" +} + +_log_switch() { + cat > "$WORK_DIR/log_switch.sql" <<'SQL' +SET FEEDBACK OFF +ALTER SYSTEM SWITCH ALL LOGFILE; +BEGIN DBMS_SESSION.SLEEP(2); END; +/ +EXIT +SQL + _exec_sysdba "$WORK_DIR/log_switch.sql" > /dev/null +} + +_start_olr() { + echo " Starting OLR..." + # Ensure no leftover container + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman rm -f $OLR_CONTAINER 2>/dev/null; true" + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman run -d --name $OLR_CONTAINER \ + --user 1000:54335 \ + -p 5000:5000 \ + -v /root/olr-debezium/config:/config:ro,Z \ + -v /root/olr-debezium/checkpoint:/olr-data/checkpoint:Z \ + -v /shared/redo:/shared/redo:ro \ + $OLR_IMAGE \ + -r -f /config/olr-config.json" > /dev/null + sleep 3 # Let container initialize before polling logs + + # Wait for OLR to start processing (up to 3 min) + for i in $(seq 1 90); do + # Check if container exited + local state + state=$(ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman inspect $OLR_CONTAINER --format '{{.State.Status}}'" 2>/dev/null || echo "unknown") + if [[ "$state" == "exited" || "$state" == "stopped" ]]; then + echo " ERROR: OLR container exited unexpectedly" >&2 + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman logs $OLR_CONTAINER 2>&1 | tail -20" >&2 + return 1 + fi + # Check readiness via podman logs on the VM itself (avoids SSH pipe issues) + if ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman logs $OLR_CONTAINER 2>&1 | grep -q 'processing redo log'" 2>/dev/null; 
then + echo " OLR: ready (${i}x2s)" + return 0 + fi + sleep 2 + done + echo " ERROR: OLR did not become ready in 180s" >&2 + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman logs $OLR_CONTAINER 2>&1 | tail -20" >&2 + return 1 +} + +_read_checkpoint_scn() { + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "cat /root/olr-debezium/checkpoint/ORCLCDB-chkpt.json 2>/dev/null | python3 -c \" +import sys,json +try: + d=json.load(sys.stdin) + print(d['scn']) +except: print('0') +\" 2>/dev/null" || echo "0" +} + +_read_checkpoint() { + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "cat /root/olr-debezium/checkpoint/ORCLCDB-chkpt.json 2>/dev/null | python3 -c \" +import sys,json +try: + d=json.load(sys.stdin) + print(f'scn={d[\"scn\"]}, idx={d.get(\"idx\",\"?\")}') +except: print('no checkpoint') +\" 2>/dev/null" || echo "no checkpoint" +} + +_wait_for_checkpoint() { + echo " Waiting for OLR checkpoint (up to 30s)..." + local prev_scn + prev_scn=$(_read_checkpoint_scn) + for i in $(seq 1 15); do + sleep 2 + local cur_scn + cur_scn=$(_read_checkpoint_scn) + if [[ "$cur_scn" != "0" && "$cur_scn" != "$prev_scn" ]]; then + echo " Checkpoint written: scn=$cur_scn" + return 0 + fi + if [[ "$cur_scn" != "0" && "$prev_scn" == "0" ]]; then + echo " Checkpoint written: scn=$cur_scn" + return 0 + fi + done + local final=$(_read_checkpoint) + if [[ "$final" == "no checkpoint" ]]; then + echo " WARNING: No checkpoint after 30s" + return 1 + else + echo " Checkpoint (unchanged): $final" + return 0 + fi +} + +_kill_olr() { + local cycle="${1:-unknown}" + echo " Killing OLR (SIGKILL)..." 
+ # Preserve logs before removing container + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman logs $OLR_CONTAINER 2>&1" > "$WORK_DIR/olr-cycle-${cycle}.log" 2>/dev/null || true + ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman stop -t0 $OLR_CONTAINER 2>/dev/null; podman rm $OLR_CONTAINER 2>/dev/null; true" + echo " Last checkpoint: $(_read_checkpoint)" +} + +_run_dml() { + local label="$1" batch="$2" node1_start="$3" node2_start="$4" + local node1_end=$(( node1_start + batch - 1 )) + local node2_end=$(( node2_start + batch - 1 )) + + cat > "$WORK_DIR/dml_node1.sql" < "$WORK_DIR/dml_node2.sql" < /dev/null + _exec_user "$WORK_DIR/dml_node2.sql" "$RAC_NODE2" "$ORACLE_SID2" "$DB_CONN2" > /dev/null + _log_switch + + echo " $label: inserted $batch rows per node (IDs ${node1_start}-${node1_end} on node1, ${node2_start}-${node2_end} on node2)" +} + +WORK_DIR=$(mktemp -d /tmp/chkpt_rac_XXXXXX) +trap 'rm -rf "$WORK_DIR"' EXIT + +echo "=== OLR RAC Checkpoint/Restart Test ===" +echo " Kill cycles: $KILL_COUNT" +echo "" + +# ---- Stage 1: Verify services ---- +echo "--- Stage 1: Verify services ---" + +if ! curl -sf "$RECEIVER_URL/health" > /dev/null 2>&1; then + echo "ERROR: Receiver not responding at $RECEIVER_URL" >&2 + exit 1 +fi +echo " Receiver: OK" + +if ! ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman exec $RAC_NODE1 su - oracle -c 'export ORACLE_SID=$ORACLE_SID1; printf \"SELECT 1 FROM dual;\nEXIT;\n\" | sqlplus -S / as sysdba'" 2>/dev/null | grep -q "1"; then + echo "ERROR: RAC Oracle not reachable on $VM_HOST" >&2 + exit 1 +fi +echo " Oracle RAC: OK" + +if ! 
docker ps --format '{{.Names}}' | grep -q "^dbz-logminer$"; then + echo "ERROR: Container dbz-logminer not running" >&2 + exit 1 +fi +echo " Debezium: OK" + +# ---- Stage 2: Setup test table ---- +echo "" +echo "--- Stage 2: Setup test table ---" + +cat > "$WORK_DIR/setup.sql" <<'SQL' +SET FEEDBACK OFF +SET SERVEROUTPUT ON + +BEGIN EXECUTE IMMEDIATE 'DROP TABLE olr_test.CHKPT_TEST PURGE'; EXCEPTION WHEN OTHERS THEN IF SQLCODE != -942 THEN RAISE; END IF; END; +/ + +CREATE TABLE olr_test.CHKPT_TEST ( + id NUMBER PRIMARY KEY, + val VARCHAR2(200), + phase VARCHAR2(50), + node_id NUMBER(1), + created TIMESTAMP DEFAULT SYSTIMESTAMP +); +ALTER TABLE olr_test.CHKPT_TEST ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; + +DECLARE + v_scn NUMBER; +BEGIN + v_scn := DBMS_FLASHBACK.GET_SYSTEM_CHANGE_NUMBER; + DBMS_OUTPUT.PUT_LINE('CHKPT_SCN_START: ' || v_scn); +END; +/ + +EXIT +SQL +SETUP_OUT=$(_exec_user "$WORK_DIR/setup.sql") +echo "$SETUP_OUT" +_log_switch + +# Stop existing OLR + clean checkpoint +ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ + "podman stop -t5 $OLR_CONTAINER 2>/dev/null; podman rm $OLR_CONTAINER 2>/dev/null; true" +ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "mkdir -p /root/olr-debezium/config /root/olr-debezium/checkpoint" +scp $_SSH_OPTS "$SCRIPT_DIR/config/olr-config.json" "${VM_USER}@${VM_HOST}:/root/olr-debezium/config/" +ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "rm -rf /root/olr-debezium/checkpoint/* && chown -R 1000:54335 /root/olr-debezium/checkpoint" +echo " Checkpoint cleared" + +# Restart Debezium connectors with clean state +echo " Restarting Debezium connectors..." 
+cd "$SCRIPT_DIR"
+# Best-effort cleanup: a failure here must not abort the test under set -e.
+for svc in dbz-logminer dbz-olr; do
+    docker compose rm -sf "$svc" > /dev/null 2>&1 || true
+done
+# Resolve the compose project name for the volume prefix. `|| true` keeps
+# set -e/pipefail from aborting when `docker compose config` fails or is cut
+# short by awk's early exit, so the "debezium" fallback below is reachable.
+# The pattern is anchored so that "container_name:" lines cannot match.
+COMPOSE_PROJECT=$(docker compose config 2>/dev/null | awk '/^name:/ { print $2; exit }' || true)
+COMPOSE_PROJECT="${COMPOSE_PROJECT:-debezium}"
+docker volume rm -f "${COMPOSE_PROJECT}_dbz-logminer-data" "${COMPOSE_PROJECT}_dbz-olr-data" > /dev/null 2>&1 || true
+docker compose up -d dbz-logminer dbz-olr > /dev/null 2>&1
+cd - > /dev/null
+
+# Wait for Debezium connectors — verify they're actually streaming
+echo " Waiting for Debezium connectors..."
+for i in $(seq 1 60); do
+    # grep drains its input (no -q) so pipefail cannot misreport a match.
+    if docker logs dbz-logminer 2>&1 | tail -20 | grep "Starting streaming" > /dev/null; then
+        echo " LogMiner connector: streaming"
+        break
+    fi
+    if [[ $i -eq 60 ]]; then
+        echo "ERROR: LogMiner connector did not start streaming in 120s" >&2
+        docker logs dbz-logminer 2>&1 | tail -10 >&2
+        exit 1
+    fi
+    sleep 2
+done
+
+# OLR adapter may not connect until OLR is started — just check it's running
+ADAPTER_RUNNING=false
+for i in $(seq 1 10); do
+    if docker ps --format '{{.Names}}' | grep -Fx -- "dbz-olr-adapter" > /dev/null; then
+        echo " OLR adapter: running"
+        ADAPTER_RUNNING=true
+        break
+    fi
+    sleep 2
+done
+if [[ "$ADAPTER_RUNNING" != "true" ]]; then
+    # Non-fatal, as before, but no longer silent.
+    echo " WARNING: OLR adapter container not running after 20s" >&2
+fi
+
+# Reset receiver
+curl -sf -X POST "$RECEIVER_URL/reset" > /dev/null
+
+# ---- Stage 3: Kill/restart cycles ----
+echo ""
+echo "--- Stage 3: Kill/restart cycles ---"
+
+BATCH=50
+NEXT_N1=1000
+NEXT_N2=2000
+
+# Cycles in which the post-restart checkpoint SCN was >= the pre-kill SCN.
+CHECKPOINT_VERIFIED=0
+# Cycles in which the post-restart checkpoint SCN was lower than pre-kill.
+CHECKPOINT_REGRESSIONS=0
+
+_start_olr
+sleep 5 # Let OLR catch up to current SCN
+
+for cycle in $(seq 1 "$KILL_COUNT"); do
+    echo ""
+    echo " === Cycle $cycle / $KILL_COUNT ==="
+
+    # Phase A: DML while OLR is running
+    _run_dml "c${cycle}_running" "$BATCH" "$NEXT_N1" "$NEXT_N2"
+    NEXT_N1=$(( NEXT_N1 + BATCH ))
+    NEXT_N2=$(( NEXT_N2 + BATCH ))
+
+    # Wait for OLR to write a checkpoint (interval-s: 10)
+    _wait_for_checkpoint || true
+    PRE_KILL_SCN=$(_read_checkpoint_scn)
+
+    # Kill OLR
+    _kill_olr "$cycle"
+
+    # Phase B: DML while OLR is down
+    _run_dml "c${cycle}_offline" "$BATCH" "$NEXT_N1" "$NEXT_N2"
+    NEXT_N1=$(( NEXT_N1 + BATCH ))
+    NEXT_N2=$(( NEXT_N2 + BATCH ))
+
+    # Restart OLR — should resume from checkpoint
+    _start_olr
+
+    # Verify it resumed from checkpoint, not from start SCN
+    POST_RESTART_SCN=$(_read_checkpoint_scn)
+    if [[ "$PRE_KILL_SCN" != "0" && "$POST_RESTART_SCN" != "0" ]]; then
+        echo " Checkpoint resume: pre-kill scn=$PRE_KILL_SCN, post-restart scn=$POST_RESTART_SCN"
+        if [[ "$POST_RESTART_SCN" -ge "$PRE_KILL_SCN" ]]; then
+            echo " PASS: OLR resumed from checkpoint (not start SCN)"
+            CHECKPOINT_VERIFIED=$(( CHECKPOINT_VERIFIED + 1 ))
+        else
+            echo " FAIL: Post-restart SCN ($POST_RESTART_SCN) < pre-kill SCN ($PRE_KILL_SCN)" >&2
+            # Record it so the final verdict reflects this failure.
+            CHECKPOINT_REGRESSIONS=$(( CHECKPOINT_REGRESSIONS + 1 ))
+        fi
+    else
+        echo " WARNING: Could not verify checkpoint resume (pre=$PRE_KILL_SCN, post=$POST_RESTART_SCN)"
+    fi
+
+    # Wait for new checkpoint after processing offline DML (non-fatal if timeout)
+    _wait_for_checkpoint || true
+    sleep 5
+
+    # Phase C: DML after restart
+    _run_dml "c${cycle}_resumed" "$BATCH" "$NEXT_N1" "$NEXT_N2"
+    NEXT_N1=$(( NEXT_N1 + BATCH ))
+    NEXT_N2=$(( NEXT_N2 + BATCH ))
+done
+
+TOTAL_ROWS=$(( KILL_COUNT * 3 * BATCH * 2 ))
+echo ""
+echo " Total rows inserted: $TOTAL_ROWS ($KILL_COUNT cycles x 3 phases x $BATCH rows x 2 nodes)"
+
+# ---- Stage 4: Final DML + sentinel ----
+echo ""
+echo "--- Stage 4: Wait for processing + sentinel ---"
+
+# Extra log switches to flush
+_log_switch
+sleep 5
+_log_switch
+
+cat > "$WORK_DIR/sentinel.sql" <<'SQL'
+DELETE FROM DEBEZIUM_SENTINEL;
+INSERT INTO DEBEZIUM_SENTINEL VALUES (1, 'checkpoint-restart-test');
+COMMIT;
+EXIT;
+SQL
+_exec_user "$WORK_DIR/sentinel.sql" > /dev/null
+echo " Sentinel inserted"
+_log_switch
+
+# Wait for both connectors to see sentinel
+SENTINEL_OK=true
+START_TIME=$(date +%s)
+while true; do
+    ELAPSED=$(( $(date +%s) - START_TIME ))
+    if [[ $ELAPSED -ge $POLL_TIMEOUT ]]; then
+        echo ""
+        echo "ERROR: Timeout after ${POLL_TIMEOUT}s waiting for events" >&2
+        STATUS=$(curl -sf "$RECEIVER_URL/status" 2>/dev/null || echo '{}')
+        echo " Final status: $STATUS" >&2
+        SENTINEL_OK=false
+        break
+    fi
+
+    # An unreachable receiver or unparsable JSON degrades to the defaults.
+    STATUS=$(curl -sf "$RECEIVER_URL/status" 2>/dev/null || echo '{}')
+    LM_SENTINEL=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('logminer_sentinel',False))" 2>/dev/null || echo "False")
+    OLR_SENTINEL=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('olr_sentinel',False))" 2>/dev/null || echo "False")
+    LM_COUNT=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('logminer_count',0))" 2>/dev/null || echo "0")
+    OLR_COUNT=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('olr_count',0))" 2>/dev/null || echo "0")
+
+    printf "\r [%3ds] LogMiner: %s events (sentinel: %s) | OLR: %s events (sentinel: %s) " \
+        "$ELAPSED" "$LM_COUNT" "$LM_SENTINEL" "$OLR_COUNT" "$OLR_SENTINEL"
+
+    if [[ "$LM_SENTINEL" == "True" && "$OLR_SENTINEL" == "True" ]]; then
+        echo ""
+        echo " Both connectors have processed all events"
+        break
+    fi
+
+    sleep 2
+done
+
+# ---- Stage 5: Compare outputs ----
+echo ""
+echo "--- Stage 5: Compare LogMiner vs OLR ---"
+
+LM_FILE="$SCRIPT_DIR/output/logminer.jsonl"
+OLR_FILE="$SCRIPT_DIR/output/olr.jsonl"
+
+# Rewrite a JSONL file with its records ordered by the JSON text of each
+# record's 'after' (or 'before', or {}) value. Blank lines are dropped.
+#   $1 - input JSONL path
+#   $2 - output JSONL path
+# Returns: python3's exit status (non-zero on unreadable input or bad JSON).
+_sort_jsonl() {
+    python3 - "$1" "$2" <<'PY'
+import json, sys
+records = []
+with open(sys.argv[1]) as f:
+    for line in f:
+        line = line.strip()
+        if line:
+            records.append(json.loads(line))
+records.sort(key=lambda r: json.dumps(r.get('after') or r.get('before') or {}, sort_keys=True))
+with open(sys.argv[2], 'w') as f:
+    for r in records:
+        f.write(json.dumps(r) + '\n')
+PY
+}
+
+COMPARE_RESULT=0
+if [[ "$SENTINEL_OK" != "true" ]]; then
+    echo " FAIL: Sentinel timeout — comparison unreliable"
+    COMPARE_RESULT=1
+elif [[ ! -s "$LM_FILE" ]]; then
+    echo " FAIL: LogMiner output is empty" >&2
+    COMPARE_RESULT=1
+elif [[ ! -s "$OLR_FILE" ]]; then
+    echo " FAIL: OLR output is empty" >&2
+    COMPARE_RESULT=1
+# Sort both files by content before comparing — RAC transactions from
+# different nodes may arrive in different SCN order between LogMiner and OLR.
+# Running the sorts inside the condition means a sort failure is reported as
+# an accuracy FAIL instead of aborting the script before the summary.
+elif _sort_jsonl "$LM_FILE" "$WORK_DIR/logminer-sorted.jsonl" \
+    && _sort_jsonl "$OLR_FILE" "$WORK_DIR/olr-sorted.jsonl" \
+    && python3 "$SCRIPTS_DIR/compare-debezium.py" "$WORK_DIR/logminer-sorted.jsonl" "$WORK_DIR/olr-sorted.jsonl"; then
+    echo " Data accuracy: PASS"
+else
+    echo " Data accuracy: FAIL"
+    COMPARE_RESULT=1
+fi
+
+# ---- Stage 6: Duplicate/gap check ----
+echo ""
+echo "--- Stage 6: Duplicate and gap analysis ---"
+
+# Extract CHKPT_TEST insert IDs from OLR output
+# Debezium NUMBER fields are structs: {"scale": 0, "value": "base64"}
+DUP_RESULT=0
+python3 - "$OLR_FILE" "$TOTAL_ROWS" <<'PYEOF' || DUP_RESULT=$?
+import json, sys, base64
+
+olr_file = sys.argv[1]
+expected_total = int(sys.argv[2])
+
+def decode_debezium_number(val):
+    """Decode Debezium NUMBER struct {scale, value} to int."""
+    if isinstance(val, (int, float)):
+        return int(val)
+    if isinstance(val, dict) and "value" in val:
+        raw = base64.b64decode(val["value"])
+        n = int.from_bytes(raw, byteorder="big", signed=True)
+        scale = val.get("scale", 0)
+        if scale > 0:
+            return n / (10 ** scale)
+        return n
+    return None
+
+ids = []
+with open(olr_file) as f:
+    for line in f:
+        try:
+            event = json.loads(line)
+            payload = event.get("payload", event)
+            if payload.get("op") != "c":
+                continue
+            after = payload.get("after", {})
+            source = payload.get("source", {})
+            if source.get("table") != "CHKPT_TEST":
+                continue
+            row_id = decode_debezium_number(after.get("ID"))
+            if row_id is not None:
+                ids.append(int(row_id))
+        except (json.JSONDecodeError, ValueError, KeyError):
+            continue
+
+ids.sort()
+unique_ids = sorted(set(ids))
+duplicates = len(ids) - len(unique_ids)
+
+print(f" Total CHKPT_TEST inserts captured: {len(ids)}")
+print(f" Unique IDs: {len(unique_ids)}")
+print(f" Duplicates: {duplicates}")
+print(f" Expected rows: {expected_total}")
+
+if len(ids) == 0:
+    print(" FAIL: No CHKPT_TEST insert events found in OLR output")
+    sys.exit(1)
+
+if duplicates > 0:
+    from collections import Counter
+    counts = Counter(ids)
+    dup_ids = sorted([k for k, v in counts.items() if v > 1])
+    print(f" Duplicate IDs: {dup_ids[:20]}{'...' if len(dup_ids) > 20 else ''}")
+    print(" FAIL: Duplicates found")
+    sys.exit(1)
+
+if len(unique_ids) < expected_total:
+    print(f" FAIL: Captured {len(unique_ids)} < expected {expected_total} (data gap)")
+    sys.exit(1)
+
+print(" PASS: No duplicates or gaps detected")
+PYEOF
+
+# ---- Stage 7: OLR error/ASAN check ----
+echo ""
+echo "--- Stage 7: OLR error checks ---"
+OLR_ERROR_RESULT=0
+# Save final container logs and combine with per-cycle logs
+# shellcheck disable=SC2086 — $_SSH_OPTS is deliberately word-split into options
+ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" "podman logs $OLR_CONTAINER 2>&1" > "$WORK_DIR/olr-cycle-final.log" 2>/dev/null || true
+cat "$WORK_DIR"/olr-cycle-*.log > "$WORK_DIR/olr-all.log" 2>/dev/null || true
+
+if grep -q "AddressSanitizer\|ABORTING" "$WORK_DIR/olr-all.log"; then
+    echo " FAIL: ASAN errors detected"
+    grep -A5 "AddressSanitizer" "$WORK_DIR/olr-all.log" | head -10
+    OLR_ERROR_RESULT=1
+else
+    echo " PASS: No ASAN errors"
+fi
+
+if grep -q "ERROR 50022\|duplicate SYS\." "$WORK_DIR/olr-all.log"; then
+    echo " FAIL: OLR duplicate/schema errors detected"
+    grep "ERROR 50022\|duplicate SYS\." "$WORK_DIR/olr-all.log" | head -5
+    OLR_ERROR_RESULT=1
+else
+    echo " PASS: No OLR duplicate/schema errors"
+fi
+
+# ---- Summary ----
+echo ""
+echo "========================================"
+echo " Checkpoint/Restart Test Summary"
+echo "========================================"
+echo " Kill cycles: $KILL_COUNT"
+echo " Total rows: $TOTAL_ROWS"
+echo " Phases per cycle: DML -> checkpoint -> kill -> offline DML -> restart -> verify -> DML"
+echo " Checkpoint resume verified: $CHECKPOINT_VERIFIED / $KILL_COUNT cycles"
+
+# The checkpoint check fails when resume was never verified, or when any
+# cycle observed the checkpoint SCN going backwards after the restart.
+CHKPT_RESULT=0
+if [[ $CHECKPOINT_VERIFIED -lt 1 ]]; then
+    echo " WARNING: Checkpoint resume was never verified"
+    CHKPT_RESULT=1
+fi
+if [[ $CHECKPOINT_REGRESSIONS -gt 0 ]]; then
+    echo " WARNING: Checkpoint SCN went backwards in $CHECKPOINT_REGRESSIONS cycle(s)"
+    CHKPT_RESULT=1
+fi
+
+# Print PASS for a zero result code, FAIL otherwise.
+#   $1 - numeric result code
+_verdict() {
+    if [[ "$1" -eq 0 ]]; then
+        echo PASS
+    else
+        echo FAIL
+    fi
+}
+
+echo " Accuracy: $(_verdict "$COMPARE_RESULT")"
+echo " Duplicates: $(_verdict "$DUP_RESULT")"
+if [[ $CHKPT_RESULT -eq 0 ]]; then
+    echo " Checkpoint: PASS ($CHECKPOINT_VERIFIED/$KILL_COUNT)"
+else
+    echo " Checkpoint: FAIL"
+fi
+echo " OLR errors: $(_verdict "$OLR_ERROR_RESULT")"
+echo ""
+if [[ $COMPARE_RESULT -eq 0 && $DUP_RESULT -eq 0 && $CHKPT_RESULT -eq 0 && $OLR_ERROR_RESULT -eq 0 ]]; then
+    echo "=== PASS: Checkpoint/restart test completed ==="
+else
+    echo "=== FAIL: Checkpoint/restart test failed ==="
+    echo " LogMiner output: $LM_FILE"
+    echo " OLR output: $OLR_FILE"
+fi
+
+# Exit status is the sum of the individual result codes (0 only if all pass).
+exit $(( COMPARE_RESULT + DUP_RESULT + CHKPT_RESULT + OLR_ERROR_RESULT ))
diff --git a/tests/sql/environments/rac/debezium/config/application-logminer.properties b/tests/sql/environments/rac/debezium/config/application-logminer.properties
index 39389bb4..eee75dea 100644
--- a/tests/sql/environments/rac/debezium/config/application-logminer.properties
+++ b/tests/sql/environments/rac/debezium/config/application-logminer.properties
@@ -10,7 +10,7 @@ debezium.format.key.schemas.enable=false
debezium.source.connector.class=io.debezium.connector.oracle.OracleConnector debezium.source.database.connection.adapter=logminer -debezium.source.database.hostname=192.168.122.248 +debezium.source.database.hostname=192.168.122.130 debezium.source.database.port=1521 debezium.source.database.user=c##dbzuser debezium.source.database.password=dbz diff --git a/tests/sql/environments/rac/debezium/config/application-olr.properties b/tests/sql/environments/rac/debezium/config/application-olr.properties index 03f39f38..072be2bb 100644 --- a/tests/sql/environments/rac/debezium/config/application-olr.properties +++ b/tests/sql/environments/rac/debezium/config/application-olr.properties @@ -11,9 +11,9 @@ debezium.format.key.schemas.enable=false debezium.source.connector.class=io.debezium.connector.oracle.OracleConnector debezium.source.database.connection.adapter=olr debezium.source.openlogreplicator.source=ORCLCDB -debezium.source.openlogreplicator.host=192.168.122.248 +debezium.source.openlogreplicator.host=192.168.122.130 debezium.source.openlogreplicator.port=5000 -debezium.source.database.hostname=192.168.122.248 +debezium.source.database.hostname=192.168.122.130 debezium.source.database.port=1521 debezium.source.database.user=c##dbzuser debezium.source.database.password=dbz diff --git a/tests/sql/environments/rac/debezium/config/olr-config.json b/tests/sql/environments/rac/debezium/config/olr-config.json index 179ff068..31ae9a02 100644 --- a/tests/sql/environments/rac/debezium/config/olr-config.json +++ b/tests/sql/environments/rac/debezium/config/olr-config.json @@ -3,7 +3,9 @@ "log-level": 4, "state": { "type": "disk", - "path": "/olr-data/checkpoint" + "path": "/olr-data/checkpoint", + "interval-s": 10, + "interval-mb": 1 }, "memory": { "min-mb": 64, diff --git a/tests/sql/environments/rac/debezium/docker-compose.yaml b/tests/sql/environments/rac/debezium/docker-compose.yaml index ea039751..bc9e9fb0 100644 --- a/tests/sql/environments/rac/debezium/docker-compose.yaml 
+++ b/tests/sql/environments/rac/debezium/docker-compose.yaml @@ -26,6 +26,7 @@ services: image: quay.io/debezium/server:3.5.0.Beta1 container_name: dbz-olr-adapter network_mode: host + restart: unless-stopped depends_on: receiver: condition: service_started diff --git a/tests/sql/environments/rac/up.sh b/tests/sql/environments/rac/up.sh index fc7ad981..35ec6a9c 100755 --- a/tests/sql/environments/rac/up.sh +++ b/tests/sql/environments/rac/up.sh @@ -1,10 +1,12 @@ #!/bin/bash -# RAC VM is managed externally — just verify it's reachable. +# RAC VM is managed externally — verify it's reachable and configs match. set -euo pipefail -SSH_KEY="$(cd "$(dirname "$0")/../../../.." && pwd)/oracle-rac/assets/vm-key" -HOST=192.168.122.248 + +source "$(cd "$(dirname "$0")" && pwd)/vm-env.sh" + +echo "RAC VM IP: $VM_HOST" echo "Checking RAC VM connectivity..." -ssh -o ConnectTimeout=5 -o BatchMode=yes -i "$SSH_KEY" root@$HOST "echo 'RAC VM is reachable'" || { - echo "ERROR: Cannot reach RAC VM at $HOST" >&2 +ssh -o ConnectTimeout=5 -o BatchMode=yes $_SSH_OPTS "${VM_USER}@${VM_HOST}" "echo 'RAC VM is reachable'" || { + echo "ERROR: Cannot reach RAC VM at $VM_HOST" >&2 exit 1 } diff --git a/tests/sql/environments/rac/vm-env.sh b/tests/sql/environments/rac/vm-env.sh new file mode 100644 index 00000000..3dc123d8 --- /dev/null +++ b/tests/sql/environments/rac/vm-env.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Shared RAC VM environment — source this from any RAC test script. +# Auto-detects VM IP via virsh and validates config files. +# +# Usage: source "$(dirname "$0")/vm-env.sh" (or path to this file) +# +# Exports: VM_HOST, VM_KEY, VM_USER, _SSH_OPTS + +_RAC_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +_PROJECT_ROOT="$(cd "$_RAC_ENV_DIR/../../../.." 
&& pwd)"
+
+VM_KEY="${VM_KEY:-$_PROJECT_ROOT/oracle-rac/assets/vm-key}"
+VM_USER="${VM_USER:-root}"
+
+# Auto-detect VM IP from virsh (filter for libvirt 192.168.122.* subnet, take first). The '|| true' keeps a failed or empty lookup from aborting callers that source this file under 'set -e -o pipefail' (up.sh does), so the diagnostic below is actually printed.
+if [[ -z "${VM_HOST:-}" ]]; then
+    VM_HOST=$(virsh domifaddr oracle-rac-vm 2>/dev/null | awk '/ipv4/{print $4}' | cut -d/ -f1 | grep '^192\.168\.122\.' | head -1 || true)
+    if [[ -z "$VM_HOST" ]]; then
+        echo "ERROR: Cannot detect RAC VM IP. Is oracle-rac-vm running?" >&2
+        echo " Start with: virsh start oracle-rac-vm" >&2
+        return 1 2>/dev/null || exit 1  # 'return' when sourced, 'exit' when executed directly
+    fi
+fi
+
+_SSH_OPTS="-i $VM_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
+
+# Validate config files match detected IP: _check_ip FILE scans FILE's lines that do not start with '#' or '//' for 192.168.122.* addresses, reports each one that differs from $VM_HOST on stderr and sets _VM_ENV_MISMATCH=1; a FILE that does not exist is skipped.
+_VM_ENV_MISMATCH=0
+_check_ip() {
+    local file="$1"
+    if [[ -f "$file" ]]; then
+        local found ip  # 'ip' is local so the loop variable does not leak into the sourcing shell
+        found=$(grep -v '^\s*#\|^\s*//' "$file" | grep -oP '192\.168\.122\.\d+' | sort -u || true)
+        for ip in $found; do  # unquoted on purpose: one address per word
+            if [[ "$ip" != "$VM_HOST" ]]; then
+                echo " MISMATCH: $file has $ip (expected $VM_HOST)" >&2
+                _VM_ENV_MISMATCH=1
+            fi
+        done
+    fi
+}
+
+_check_ip "$_RAC_ENV_DIR/.env"
+_check_ip "$_PROJECT_ROOT/tests/sql/scripts/drivers/rac.sh"
+_check_ip "$_RAC_ENV_DIR/debezium/config/application-logminer.properties"
+_check_ip "$_RAC_ENV_DIR/debezium/config/application-olr.properties"
+_check_ip "$_RAC_ENV_DIR/debezium/config/olr-config.json"
+
+if [[ $_VM_ENV_MISMATCH -ne 0 ]]; then
+    echo "ERROR: RAC VM IP is $VM_HOST but config files have stale IPs."
>&2 + echo " Fix with: sed -i 's/192.168.122.[0-9]\\+/$VM_HOST/g' " >&2 + return 1 2>/dev/null || exit 1 +fi + +export VM_HOST VM_KEY VM_USER _SSH_OPTS diff --git a/tests/sql/scripts/drivers/rac.sh b/tests/sql/scripts/drivers/rac.sh index 7af5d540..75824b81 100644 --- a/tests/sql/scripts/drivers/rac.sh +++ b/tests/sql/scripts/drivers/rac.sh @@ -36,7 +36,7 @@ DB_CONN="olr_test/olr_test@//racnodep1:1521/ORCLPDB" source "$SCRIPT_DIR/drivers/base.sh" # ---- RAC configuration ---- -VM_HOST="${VM_HOST:-192.168.122.248}" +VM_HOST="${VM_HOST:-192.168.122.130}" VM_KEY="${VM_KEY:-$PROJECT_ROOT/oracle-rac/assets/vm-key}" VM_USER="${VM_USER:-root}" OLR_IMAGE="${OLR_IMAGE:-olr-dev:latest}" @@ -61,7 +61,7 @@ _vm_sqlplus() { local conn="$3" local sql_file="$4" ssh $_SSH_OPTS "${VM_USER}@${VM_HOST}" \ - "podman exec $node su - oracle -c 'export ORACLE_SID=$sid; sqlplus -S \"$conn\" @$sql_file'" + "podman exec $node su - oracle -c 'export ORACLE_SID=$sid; export NLS_LANG=AMERICAN_AMERICA.AL32UTF8; sqlplus -S \"$conn\" @$sql_file'" } # Copy a local file into a RAC container