diff --git a/.typos.toml b/.typos.toml index 7550e6f152..86a82147ab 100644 --- a/.typos.toml +++ b/.typos.toml @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +# Whitelist valid identifiers to avoid typos false positives +[default.extend-identifiers] +# NDJSON = Newline Delimited JSON — "NdJson" is valid Rust CamelCase +NdJson = "NdJson" +nd_json = "nd_json" + # Whitelist valid technical terms to avoid typos false positives [default.extend-words] # French for coffee, used in UTF-8 test strings (cafe with accent) diff --git a/Cargo.lock b/Cargo.lock index d43b9a2db0..9fbea0cedc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -436,9 +436,9 @@ checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" [[package]] name = "arc-swap" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" dependencies = [ "rustversion", ] @@ -721,9 +721,9 @@ dependencies = [ [[package]] name = "astral-tokio-tar" -version = "0.5.6" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +checksum = "3c23f3af104b40a3430ccb90ed5f7bd877a8dc5c26fc92fde51a22b40890dcf9" dependencies = [ "filetime", "futures-core", @@ -1983,9 +1983,9 @@ dependencies = [ [[package]] name = "compio-buf" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e8777c3ad31ab42f8a3a4a1bd629b78f688371df9b0f528d94dfbdbe5c945c9" +checksum = "a00d719dbd8c602ab0d25d219cbc6b517008858de7a8d6c51b4dc95aefff4dce" dependencies = [ "arrayvec", "bytes", @@ -2093,9 +2093,9 @@ dependencies = [ [[package]] name = "compio-quic" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256df80066ad4901c54a3d3e495df4e10384cb911b3e98d61f8275aba48321f9" +checksum = "3864d7362ba5ec270178690e72f854e9360fa3163036fe8b88a3c4475321f8be" dependencies = [ "cfg_aliases", "compio-buf", @@ -5266,7 +5266,7 @@ dependencies = [ "tracing-subscriber", "uuid", "walkdir", - "zip 8.3.0", + "zip 8.4.0", ] [[package]] @@ -5292,7 +5292,7 @@ dependencies = [ "serde_json", "thiserror 2.0.18", "tokio", - "toml 1.0.7+spec-1.1.0", + "toml 1.1.0+spec-1.1.0", "tracing", "tracing-appender", "tracing-subscriber", @@ -5343,7 +5343,7 @@ dependencies = [ "tempfile", "thiserror 2.0.18", "tokio", - "toml 1.0.7+spec-1.1.0", + "toml 1.1.0+spec-1.1.0", "tower-http", "tracing", "tracing-opentelemetry", @@ -5476,6 +5476,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "iggy_connector_http_sink" +version = "0.1.0" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "humantime", + "iggy_connector_sdk", + "reqwest 0.13.2", + "reqwest-middleware", + "reqwest-retry", + "reqwest-tracing", + "serde", + "serde_json", + "simd-json", + "strum_macros 0.28.0", + "tokio", + "toml 1.1.0+spec-1.1.0", + "tracing", +] + [[package]] name = "iggy_connector_iceberg_sink" version = "0.3.2-edge.1" @@ -5688,7 +5710,7 @@ dependencies = [ "rgb", "tiff", "zune-core 0.5.1", - "zune-jpeg 0.5.13", + "zune-jpeg 0.5.14", ] [[package]] @@ -5855,12 +5877,12 @@ dependencies = [ "test-case", "testcontainers-modules", "tokio", - "toml 1.0.7+spec-1.1.0", + "toml 1.1.0+spec-1.1.0", "tracing", "tracing-subscriber", 
"twox-hash", "uuid", - "zip 8.3.0", + "zip 8.4.0", ] [[package]] @@ -5914,14 +5936,15 @@ dependencies = [ [[package]] name = "ipconfig" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f" +checksum = "4d40460c0ce33d6ce4b0630ad68ff63d6661961c48b6dba35e5a4d81cfb48222" dependencies = [ - "socket2 0.5.10", + "socket2 0.6.3", "widestring", - "windows-sys 0.48.0", - "winreg", + "windows-registry", + "windows-result 0.4.1", + "windows-sys 0.61.2", ] [[package]] @@ -5932,9 +5955,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb" dependencies = [ "memchr", "serde", @@ -6020,7 +6043,7 @@ dependencies = [ "cesu8", "cfg-if", "combine", - "jni-sys", + "jni-sys 0.3.1", "log", "thiserror 1.0.69", "walkdir", @@ -6029,9 +6052,31 @@ dependencies = [ [[package]] name = "jni-sys" -version = "0.3.0" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn 2.0.117", +] [[package]] name = "jobserver" @@ -6785,9 +6830,9 @@ dependencies = [ [[package]] name = "moka" -version = "0.12.14" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85f8024e1c8e71c778968af91d43700ce1d11b219d127d79fb2934153b82b42b" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" dependencies = [ "async-lock", "crossbeam-channel", @@ -7218,9 +7263,9 @@ dependencies = [ [[package]] name = "octocrab" -version = "0.49.5" +version = "0.49.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f6f72d7084a80bf261bb6b6f83bd633323d5633d5ec7988c6c95b20448b2b5" +checksum = "481d01ffe3fa4347e55474798e16d8d678aab19b8d7ca631ebb3c607cc87f9db" dependencies = [ "arc-swap", "async-trait", @@ -8079,7 +8124,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ - "toml_edit 0.25.5+spec-1.1.0", + "toml_edit 0.25.8+spec-1.1.0", ] [[package]] @@ -9617,9 +9662,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" +checksum = "876ac351060d4f882bb1032b6369eb0aef79ad9df1ea8bc404874d8cc3d0cd98" dependencies = [ "serde_core", ] @@ -9781,7 +9826,7 @@ dependencies = [ "sysinfo 0.38.4", "tempfile", 
"thiserror 2.0.18", - "toml 1.0.7+spec-1.1.0", + "toml 1.1.0+spec-1.1.0", "tower-http", "tracing", "tracing-appender", @@ -10622,12 +10667,12 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" +checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix 1.1.4", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -10671,9 +10716,9 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1c0624faaa317c56d6d19136580be889677259caf5c897941c6f446b4655068" +checksum = "0bd36b06a2a6c0c3c81a83be1ab05fe86460d054d4d51bf513bc56b3e15bdc22" dependencies = [ "astral-tokio-tar", "async-trait", @@ -10797,7 +10842,7 @@ dependencies = [ "half", "quick-error", "weezl", - "zune-jpeg 0.5.13", + "zune-jpeg 0.5.14", ] [[package]] @@ -11003,14 +11048,14 @@ dependencies = [ [[package]] name = "toml" -version = "1.0.7+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd28d57d8a6f6e458bc0b8784f8fdcc4b99a437936056fa122cb234f18656a96" +checksum = "f8195ca05e4eb728f4ba94f3e3291661320af739c4e43779cbdfae82ab239fcc" dependencies = [ "indexmap 2.13.0", "serde_core", - "serde_spanned 1.0.4", - "toml_datetime 1.0.1+spec-1.1.0", + "serde_spanned 1.1.0", + "toml_datetime 1.1.0+spec-1.1.0", "toml_parser", "toml_writer", "winnow 1.0.0", @@ -11027,9 +11072,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.0.1+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" dependencies = [ "serde_core", ] @@ -11061,21 +11106,21 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.25.5+spec-1.1.0" +version = "0.25.8+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" dependencies = [ "indexmap 2.13.0", - "toml_datetime 1.0.1+spec-1.1.0", + "toml_datetime 1.1.0+spec-1.1.0", "toml_parser", "winnow 1.0.0", ] [[package]] name = "toml_parser" -version = "1.0.10+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" dependencies = [ "winnow 1.0.0", ] @@ -11088,9 +11133,9 @@ checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" [[package]] name = "toml_writer" -version = "1.0.7+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17aaa1c6e3dc22b1da4b6bba97d066e354c7945cac2f7852d4e4e7ca7a6b56d" +checksum = "d282ade6016312faf3e41e57ebbba0c073e4056dab1232ab1cb624199648f8ed" [[package]] name = "tonic" @@ -11554,9 +11599,9 @@ checksum = "383ad40bb927465ec0ce7720e033cb4ca06912855fc35db31b5755d0de75b1ee" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "a559e63b5d8004e12f9bce88af5c6d939c58de839b7532cfe9653846cedd2a9e" [[package]] name = "unicode-vo" @@ -11606,9 +11651,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.2.1" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ab5172ab0c2b6d01a9bb4f9332f7c1211193ea002742188040d09ea4eafe867" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64 0.22.1", "flate2", @@ -11623,9 +11668,9 @@ dependencies = [ [[package]] name = "ureq-proto" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64 0.22.1", "http 1.4.0", @@ -12300,6 +12345,17 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link 0.2.1", + "windows-result 0.4.1", + "windows-strings 0.5.1", +] + [[package]] name = "windows-result" version = "0.3.4" @@ -12678,16 +12734,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.50.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "winsafe" version = "0.0.19" @@ -13052,9 +13098,9 @@ dependencies = [ [[package]] name = "zip" -version = "8.3.0" +version = "8.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a243cfad17427fc077f529da5a95abe4e94fd2bfdb601611870a6557cc67657" +checksum = "7756d0206d058333667493c4014f545f4b9603c4330ccd6d9b3f86dcab59f7d9" dependencies = [ "crc32fast", "flate2", @@ -13148,9 +13194,9 @@ dependencies = [ [[package]] name = "zune-jpeg" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec5f41c76397b7da451efd19915684f727d7e1d516384ca6bd0ec43ec94de23c" +checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6" dependencies = [ "zune-core 0.5.1", ] diff --git a/Cargo.toml b/Cargo.toml index 5918468b07..2948a1ad2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ members = [ "core/connectors/runtime", "core/connectors/sdk", "core/connectors/sinks/elasticsearch_sink", + "core/connectors/sinks/http_sink", "core/connectors/sinks/iceberg_sink", "core/connectors/sinks/mongodb_sink", "core/connectors/sinks/postgres_sink", diff --git a/DEPENDENCIES.md b/DEPENDENCIES.md index b6100cdfbd..7351cfa2c4 100644 --- a/DEPENDENCIES.md +++ b/DEPENDENCIES.md @@ -33,7 +33,7 @@ anstyle-wincon: 3.0.11, "Apache-2.0 OR MIT", anyhow: 1.0.102, "Apache-2.0 OR MIT", apache-avro: 0.21.0, "Apache-2.0", arbitrary: 1.4.2, "Apache-2.0 OR MIT", -arc-swap: 1.8.2, "Apache-2.0 OR MIT", +arc-swap: 1.9.0, "Apache-2.0 OR MIT", arg_enum_proc_macro: 0.3.4, "MIT", argon2: 0.5.3, "Apache-2.0 OR MIT", array-init: 2.1.0, "Apache-2.0 OR MIT", @@ -56,7 +56,7 @@ asn1-rs: 0.7.1, "Apache-2.0 OR MIT", asn1-rs-derive: 0.6.0, 
"Apache-2.0 OR MIT", asn1-rs-impl: 0.2.0, "Apache-2.0 OR MIT", assert_cmd: 2.2.0, "Apache-2.0 OR MIT", -astral-tokio-tar: 0.5.6, "Apache-2.0 OR MIT", +astral-tokio-tar: 0.6.0, "Apache-2.0 OR MIT", async-broadcast: 0.7.2, "Apache-2.0 OR MIT", async-channel: 2.5.0, "Apache-2.0 OR MIT", async-compression: 0.4.41, "Apache-2.0 OR MIT", @@ -170,14 +170,14 @@ colored: 3.1.1, "MPL-2.0", combine: 4.6.7, "MIT", comfy-table: 7.2.2, "MIT", compio: 0.18.0, "MIT", -compio-buf: 0.8.0, "MIT", +compio-buf: 0.8.1, "MIT", compio-driver: 0.11.3, "MIT", compio-fs: 0.11.0, "MIT", compio-io: 0.9.1, "MIT", compio-log: 0.1.0, "MIT", compio-macros: 0.1.2, "MIT", compio-net: 0.11.1, "MIT", -compio-quic: 0.7.0, "MIT", +compio-quic: 0.7.1, "MIT", compio-runtime: 0.11.0, "MIT", compio-tls: 0.9.0, "MIT", compio-ws: 0.3.0, "MIT", @@ -461,6 +461,7 @@ iggy_binary_protocol: 0.9.4-edge.1, "Apache-2.0", iggy_common: 0.9.4-edge.1, "Apache-2.0", iggy_connector_elasticsearch_sink: 0.3.2-edge.1, "Apache-2.0", iggy_connector_elasticsearch_source: 0.3.2-edge.1, "Apache-2.0", +iggy_connector_http_sink: 0.1.0, "Apache-2.0", iggy_connector_iceberg_sink: 0.3.2-edge.1, "Apache-2.0", iggy_connector_mongodb_sink: 0.3.0, "Apache-2.0", iggy_connector_postgres_sink: 0.3.2-edge.1, "Apache-2.0", @@ -492,9 +493,9 @@ inventory: 0.3.22, "Apache-2.0 OR MIT", io-uring: 0.7.11, "Apache-2.0 OR MIT", io_uring_buf_ring: 0.2.3, "MIT", iobuf: 0.1.0, "Apache-2.0", -ipconfig: 0.3.2, "Apache-2.0 OR MIT", +ipconfig: 0.3.4, "Apache-2.0 OR MIT", ipnet: 2.12.0, "Apache-2.0 OR MIT", -iri-string: 0.7.10, "Apache-2.0 OR MIT", +iri-string: 0.7.11, "Apache-2.0 OR MIT", is_terminal_polyfill: 1.70.2, "Apache-2.0 OR MIT", itertools: 0.13.0, "Apache-2.0 OR MIT", itertools: 0.14.0, "Apache-2.0 OR MIT", @@ -504,7 +505,9 @@ jiff-static: 0.2.23, "MIT OR Unlicense", jiff-tzdb: 0.1.6, "MIT OR Unlicense", jiff-tzdb-platform: 0.1.3, "MIT OR Unlicense", jni: 0.21.1, "Apache-2.0 OR MIT", -jni-sys: 0.3.0, "Apache-2.0 OR MIT", +jni-sys: 0.3.1, "Apache-2.0 OR MIT", +jni-sys: 0.4.1, "Apache-2.0 OR MIT", +jni-sys-macros: 0.4.1, "Apache-2.0 OR MIT", jobserver: 0.1.34, "Apache-2.0 OR MIT", journal: 0.1.0, "Apache-2.0", js-sys: 0.3.91, "Apache-2.0 OR MIT", @@ -582,7 +585,7 @@ miniz_oxide: 0.8.9, "Apache-2.0 OR MIT OR Zlib", mio: 1.1.1, "MIT", mockall: 0.14.0, "Apache-2.0 OR MIT", mockall_derive: 0.14.0, "Apache-2.0 OR MIT", -moka: 0.12.14, "(Apache-2.0 OR MIT) AND Apache-2.0", +moka: 0.12.15, "(Apache-2.0 OR MIT) AND Apache-2.0", mongocrypt: 0.3.2, "Apache-2.0", mongocrypt-sys: 0.1.5+1.15.1, "Apache-2.0", mongodb: 3.5.2, "Apache-2.0", @@ -623,7 +626,7 @@ objc2: 0.6.4, "MIT", objc2-core-foundation: 0.3.2, "Apache-2.0 OR MIT OR Zlib", objc2-encode: 4.1.0, "MIT", objc2-io-kit: 0.3.2, "Apache-2.0 OR MIT OR Zlib", -octocrab: 0.49.5, "Apache-2.0 OR MIT", +octocrab: 0.49.6, "Apache-2.0 OR MIT", oid-registry: 0.8.1, "Apache-2.0 OR MIT", once_cell: 1.21.4, "Apache-2.0 OR MIT", once_cell_polyfill: 1.70.2, "Apache-2.0 OR MIT", @@ -835,7 +838,7 @@ serde_json: 1.0.149, "Apache-2.0 OR MIT", serde_path_to_error: 0.1.20, "Apache-2.0 OR MIT", serde_repr: 0.1.20, "Apache-2.0 OR MIT", serde_spanned: 0.6.9, "Apache-2.0 OR MIT", -serde_spanned: 1.0.4, "Apache-2.0 OR MIT", +serde_spanned: 1.1.0, "Apache-2.0 OR MIT", serde_urlencoded: 0.7.1, "Apache-2.0 OR MIT", serde_v8: 0.260.0, "MIT", serde_with: 3.18.0, "Apache-2.0 OR MIT", @@ -914,12 +917,12 @@ take_mut: 0.2.2, "MIT", tap: 1.0.1, "MIT", tar: 0.4.45, "Apache-2.0 OR MIT", tempfile: 3.27.0, "Apache-2.0 OR MIT", -terminal_size: 0.4.3, "Apache-2.0 OR MIT", 
+terminal_size: 0.4.4, "Apache-2.0 OR MIT", termtree: 0.5.1, "MIT", test-case: 3.3.1, "MIT", test-case-core: 3.3.1, "MIT", test-case-macros: 3.3.1, "MIT", -testcontainers: 0.27.1, "Apache-2.0 OR MIT", +testcontainers: 0.27.2, "Apache-2.0 OR MIT", testcontainers-modules: 0.15.0, "MIT", textwrap: 0.16.2, "MIT", thin-cell: 0.1.2, "MIT", @@ -947,15 +950,15 @@ tokio-tungstenite: 0.29.0, "MIT", tokio-util: 0.7.18, "MIT", tokise: 0.2.1, "Apache-2.0 OR MIT", toml: 0.8.23, "Apache-2.0 OR MIT", -toml: 1.0.7+spec-1.1.0, "Apache-2.0 OR MIT", +toml: 1.1.0+spec-1.1.0, "Apache-2.0 OR MIT", toml_datetime: 0.6.11, "Apache-2.0 OR MIT", -toml_datetime: 1.0.1+spec-1.1.0, "Apache-2.0 OR MIT", +toml_datetime: 1.1.0+spec-1.1.0, "Apache-2.0 OR MIT", toml_edit: 0.19.15, "Apache-2.0 OR MIT", toml_edit: 0.22.27, "Apache-2.0 OR MIT", -toml_edit: 0.25.5+spec-1.1.0, "Apache-2.0 OR MIT", -toml_parser: 1.0.10+spec-1.1.0, "Apache-2.0 OR MIT", +toml_edit: 0.25.8+spec-1.1.0, "Apache-2.0 OR MIT", +toml_parser: 1.1.0+spec-1.1.0, "Apache-2.0 OR MIT", toml_write: 0.1.2, "Apache-2.0 OR MIT", -toml_writer: 1.0.7+spec-1.1.0, "Apache-2.0 OR MIT", +toml_writer: 1.1.0+spec-1.1.0, "Apache-2.0 OR MIT", tonic: 0.14.5, "MIT", tonic-prost: 0.14.5, "MIT", tools: 0.1.0, "Apache-2.0", @@ -1000,7 +1003,7 @@ unicode-linebreak: 0.1.5, "Apache-2.0", unicode-normalization: 0.1.25, "Apache-2.0 OR MIT", unicode-properties: 0.1.4, "Apache-2.0 OR MIT", unicode-script: 0.5.8, "Apache-2.0 OR MIT", -unicode-segmentation: 1.12.0, "Apache-2.0 OR MIT", +unicode-segmentation: 1.13.0, "Apache-2.0 OR MIT", unicode-vo: 0.1.0, "Apache-2.0 OR MIT", unicode-width: 0.1.14, "Apache-2.0 OR MIT", unicode-width: 0.2.2, "Apache-2.0 OR MIT", @@ -1008,8 +1011,8 @@ unicode-xid: 0.2.6, "Apache-2.0 OR MIT", universal-hash: 0.5.1, "Apache-2.0 OR MIT", unsafe-libyaml: 0.2.11, "MIT", untrusted: 0.9.0, "ISC", -ureq: 3.2.1, "Apache-2.0 OR MIT", -ureq-proto: 0.5.3, "Apache-2.0 OR MIT", +ureq: 3.3.0, "Apache-2.0 OR MIT", +ureq-proto: 0.6.0, "Apache-2.0 OR MIT", url: 2.5.8, "Apache-2.0 OR MIT", urlencoding: 2.1.3, "MIT", usvg: 0.45.1, "Apache-2.0 OR MIT", @@ -1077,6 +1080,7 @@ windows-link: 0.1.3, "Apache-2.0 OR MIT", windows-link: 0.2.1, "Apache-2.0 OR MIT", windows-numerics: 0.2.0, "Apache-2.0 OR MIT", windows-numerics: 0.3.1, "Apache-2.0 OR MIT", +windows-registry: 0.6.1, "Apache-2.0 OR MIT", windows-result: 0.3.4, "Apache-2.0 OR MIT", windows-result: 0.4.1, "Apache-2.0 OR MIT", windows-strings: 0.4.2, "Apache-2.0 OR MIT", @@ -1126,7 +1130,6 @@ windows_x86_64_msvc: 0.53.1, "Apache-2.0 OR MIT", winnow: 0.5.40, "MIT", winnow: 0.7.15, "MIT", winnow: 1.0.0, "MIT", -winreg: 0.50.0, "MIT", winsafe: 0.0.19, "MIT", wit-bindgen: 0.51.0, "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", wit-bindgen-core: 0.51.0, "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", @@ -1158,7 +1161,7 @@ zerotrie: 0.2.3, "Unicode-3.0", zerovec: 0.11.5, "Unicode-3.0", zerovec-derive: 0.11.2, "Unicode-3.0", zip: 0.6.6, "MIT", -zip: 8.3.0, "MIT", +zip: 8.4.0, "MIT", zlib-rs: 0.6.3, "Zlib", zmij: 1.0.21, "MIT", zopfli: 0.8.3, "Apache-2.0", @@ -1169,4 +1172,4 @@ zune-core: 0.4.12, "Apache-2.0 OR MIT OR Zlib", zune-core: 0.5.1, "Apache-2.0 OR MIT OR Zlib", zune-inflate: 0.2.54, "Apache-2.0 OR MIT OR Zlib", zune-jpeg: 0.4.21, "Apache-2.0 OR MIT OR Zlib", -zune-jpeg: 0.5.13, "Apache-2.0 OR MIT OR Zlib", +zune-jpeg: 0.5.14, "Apache-2.0 OR MIT OR Zlib", diff --git a/core/connectors/sdk/src/convert.rs b/core/connectors/sdk/src/convert.rs new file mode 100644 index 0000000000..c7db9937da --- /dev/null +++ 
b/core/connectors/sdk/src/convert.rs
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+//! Value conversion utilities for connector sinks.
+//!
+//! Provides shared conversion functions between serialization formats used by
+//! the connector ecosystem (e.g., `simd_json` ↔ `serde_json`).
+
+/// Convert `simd_json::OwnedValue` to `serde_json::Value` via direct structural mapping.
+///
+/// NaN/Infinity f64 values are mapped to `null` since JSON has no representation
+/// for these IEEE 754 special values.
+pub fn owned_value_to_serde_json(value: &simd_json::OwnedValue) -> serde_json::Value {
+    match value {
+        simd_json::OwnedValue::Static(s) => match s {
+            simd_json::StaticNode::Null => serde_json::Value::Null,
+            simd_json::StaticNode::Bool(b) => serde_json::Value::Bool(*b),
+            simd_json::StaticNode::I64(n) => serde_json::Value::Number((*n).into()),
+            simd_json::StaticNode::U64(n) => serde_json::Value::Number((*n).into()),
+            simd_json::StaticNode::F64(n) => serde_json::Number::from_f64(*n)
+                .map(serde_json::Value::Number)
+                .unwrap_or(serde_json::Value::Null),
+        },
+        simd_json::OwnedValue::String(s) => serde_json::Value::String(s.to_string()),
+        simd_json::OwnedValue::Array(arr) => {
+            serde_json::Value::Array(arr.iter().map(owned_value_to_serde_json).collect())
+        }
+        simd_json::OwnedValue::Object(obj) => {
+            let map: serde_json::Map<String, serde_json::Value> = obj
+                .iter()
+                .map(|(k, v)| (k.to_string(), owned_value_to_serde_json(v)))
+                .collect();
+            serde_json::Value::Object(map)
+        }
+    }
+}
diff --git a/core/connectors/sdk/src/lib.rs b/core/connectors/sdk/src/lib.rs
index 8ba37a0830..72a1fe0551 100644
--- a/core/connectors/sdk/src/lib.rs
+++ b/core/connectors/sdk/src/lib.rs
@@ -38,6 +38,7 @@ use tokio::runtime::Runtime;
 
 #[cfg(feature = "api")]
 pub mod api;
+pub mod convert;
 pub mod decoders;
 pub mod encoders;
 pub mod log;
@@ -45,6 +46,7 @@ pub mod sink;
 pub mod source;
 pub mod transforms;
 
+pub use convert::owned_value_to_serde_json;
 pub use log::LogCallback;
 pub use transforms::Transform;
diff --git a/core/connectors/sinks/elasticsearch_sink/src/lib.rs b/core/connectors/sinks/elasticsearch_sink/src/lib.rs
index 36c76935cb..4e81839ac1 100644
--- a/core/connectors/sinks/elasticsearch_sink/src/lib.rs
+++ b/core/connectors/sinks/elasticsearch_sink/src/lib.rs
@@ -26,7 +26,8 @@ use elasticsearch::{
 };
 use iggy_common::IggyTimestamp;
 use iggy_connector_sdk::{
-    ConsumedMessage, Error, MessagesMetadata, Payload, Sink, TopicMetadata, sink_connector,
+    ConsumedMessage, Error, MessagesMetadata, Payload, Sink, TopicMetadata,
+    convert::owned_value_to_serde_json, sink_connector,
 };
 use secrecy::{ExposeSecret, SecretString};
 use serde::{Deserialize, Serialize};
@@ -37,31 +38,6 @@ use tracing::{info, warn};
 
 sink_connector!(ElasticsearchSink);
 
-fn owned_value_to_serde_json(value: &OwnedValue) -> serde_json::Value {
-    match value {
-        OwnedValue::Static(s) => match s {
-            simd_json::StaticNode::Null => serde_json::Value::Null,
-            simd_json::StaticNode::Bool(b) => serde_json::Value::Bool(*b),
-            simd_json::StaticNode::I64(n) => serde_json::Value::Number((*n).into()),
-            simd_json::StaticNode::U64(n) => serde_json::Value::Number((*n).into()),
-            simd_json::StaticNode::F64(n) => serde_json::Number::from_f64(*n)
-                .map(serde_json::Value::Number)
-                .unwrap_or(serde_json::Value::Null),
-        },
-        OwnedValue::String(s) => serde_json::Value::String(s.to_string()),
-        OwnedValue::Array(arr) => {
-            serde_json::Value::Array(arr.iter().map(owned_value_to_serde_json).collect())
-        }
-        OwnedValue::Object(obj) => {
-            let map: serde_json::Map<String, serde_json::Value> = obj
-                .iter()
-                .map(|(k, v)| (k.to_string(), owned_value_to_serde_json(v)))
-                .collect();
-            serde_json::Value::Object(map)
-        }
-    }
-}
-
 #[derive(Debug)]
 struct State {
     invocations_count: usize,
diff --git a/core/connectors/sinks/http_sink/Cargo.toml b/core/connectors/sinks/http_sink/Cargo.toml
new file mode 100644
index 0000000000..e5e98fb1ad
--- /dev/null
+++ b/core/connectors/sinks/http_sink/Cargo.toml
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "iggy_connector_http_sink"
+version = "0.1.0"
+description = "Iggy HTTP sink connector for delivering stream messages to any HTTP endpoint via webhooks, REST APIs, or serverless functions."
+edition = "2024" +license = "Apache-2.0" +keywords = ["iggy", "messaging", "streaming", "http", "sink"] +categories = ["command-line-utilities", "database", "network-programming"] +homepage = "https://iggy.apache.org" +documentation = "https://iggy.apache.org/docs" +repository = "https://github.com/apache/iggy" +readme = "../../README.md" + +[lib] +crate-type = ["cdylib", "lib"] + +[dependencies] +async-trait = { workspace = true } +base64 = { workspace = true } +bytes = { workspace = true } +humantime = { workspace = true } +iggy_connector_sdk = { workspace = true } +reqwest = { workspace = true } +reqwest-middleware = { workspace = true } +reqwest-retry = { workspace = true } +reqwest-tracing = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +simd-json = { workspace = true } +strum_macros = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +toml = { workspace = true } diff --git a/core/connectors/sinks/http_sink/README.md b/core/connectors/sinks/http_sink/README.md new file mode 100644 index 0000000000..ad4947e04a --- /dev/null +++ b/core/connectors/sinks/http_sink/README.md @@ -0,0 +1,812 @@ +# HTTP Sink Connector + +Consumes messages from Iggy streams and delivers them to any HTTP endpoint — webhooks, REST APIs, Lambda functions, or SaaS integrations. + +## Try It + +Send a JSON message through Iggy and see it arrive at an HTTP endpoint. + +**Prerequisites**: Docker running, project built (`cargo build` from repo root). + +```bash +# Start iggy-server (terminal 1) +IGGY_ROOT_USERNAME=iggy IGGY_ROOT_PASSWORD=iggy ./target/debug/iggy-server + +# Create stream and topic +./target/debug/iggy -u iggy -p iggy stream create demo_stream +./target/debug/iggy -u iggy -p iggy topic create demo_stream demo_topic 1 + +# Start a simple HTTP receiver (terminal 2) +python3 -c " +from http.server import HTTPServer, BaseHTTPRequestHandler +import json +class H(BaseHTTPRequestHandler): + def do_POST(self): + body = self.rfile.read(int(self.headers['Content-Length'])) + print(json.dumps(json.loads(body), indent=2)) + self.send_response(200) + self.end_headers() +HTTPServer(('', 9090), H).serve_forever() +" + +# Setup connector config +mkdir -p /tmp/http-sink-test/connectors +cat > /tmp/http-sink-test/config.toml << 'TOML' +[iggy] +address = "localhost:8090" +username = "iggy" +password = "iggy" +[state] +path = "/tmp/http-sink-test/state" +[connectors] +config_type = "local" +config_dir = "/tmp/http-sink-test/connectors" +TOML +cat > /tmp/http-sink-test/connectors/sink.toml << 'TOML' +type = "sink" +key = "http" +enabled = true +version = 0 +name = "test" +path = "target/debug/libiggy_connector_http_sink" +[[streams]] +stream = "demo_stream" +topics = ["demo_topic"] +schema = "json" +batch_length = 100 +poll_interval = "100ms" +consumer_group = "test_cg" +[plugin_config] +url = "http://localhost:9090/ingest" +batch_mode = "individual" +TOML + +# Start connector (terminal 3) +IGGY_CONNECTORS_CONFIG_PATH=/tmp/http-sink-test/config.toml ./target/debug/iggy-connectors + +# Send a message +./target/debug/iggy -u iggy -p iggy message send demo_stream demo_topic '{"hello":"http"}' +``` + +Expected output on the Python receiver: + +```json +{ + "metadata": { + "iggy_id": "00000000000000000000000000000001", + "iggy_offset": 0, + "iggy_stream": "demo_stream", + "iggy_topic": "demo_topic" + }, + "payload": { + "hello": "http" + } +} +``` + +Cleanup: `rm -rf /tmp/http-sink-test` + +## Quick Start + +```toml +[[streams]] 
+stream = "events"
+topics = ["notifications"]
+schema = "json"
+batch_length = 50
+poll_interval = "100ms"
+consumer_group = "http_sink"
+
+[plugin_config]
+url = "https://api.example.com/ingest"
+batch_mode = "nd_json"
+```
+
+## Configuration
+
+| Option | Type | Default | Description |
+| ------ | ---- | ------- | ----------- |
+| `url` | string | **required** | Target URL for HTTP requests |
+| `method` | string | `POST` | HTTP method: `GET`, `HEAD`, `POST`, `PUT`, `PATCH`, `DELETE` |
+| `timeout` | string | `30s` | Request timeout (e.g., `10s`, `500ms`) |
+| `max_payload_size_bytes` | u64 | `10485760` | Max body size in bytes (10MB). `0` to disable |
+| `batch_mode` | string | `individual` | `individual`, `nd_json`, `json_array`, or `raw` |
+| `include_metadata` | bool | `true` | Wrap payload in metadata envelope |
+| `include_checksum` | bool | `false` | Add message checksum to metadata |
+| `include_origin_timestamp` | bool | `false` | Add origin timestamp to metadata |
+| `health_check_enabled` | bool | `false` | Send health check request in `open()` |
+| `health_check_method` | string | `HEAD` | HTTP method for health check |
+| `max_retries` | u32 | `3` | Retry attempts for transient errors |
+| `retry_delay` | string | `1s` | Base delay between retries |
+| `retry_backoff_multiplier` | u32 | `2` | Exponential backoff multiplier (min 1) |
+| `max_retry_delay` | string | `30s` | Maximum retry delay cap |
+| `success_status_codes` | [u16] | `[200, 201, 202, 204]` | Status codes considered successful |
+| `tls_danger_accept_invalid_certs` | bool | `false` | Skip TLS certificate validation |
+| `max_connections` | usize | `10` | Max idle connections per host |
+| `verbose_logging` | bool | `false` | Log request/response details at debug level |
+| `headers` | table | `{}` | Custom HTTP headers (e.g., `Authorization`) |
+
+## Batch Modes
+
+### `individual` (default)
+
+One HTTP request per message. Best for webhooks and endpoints that accept single events.
+
+> With `batch_length = 50`, this produces 50 sequential HTTP round trips per poll cycle.
+> For production throughput, use `nd_json` or `json_array`.
+
+```text
+POST /ingest Content-Type: application/json
+{"metadata": {"iggy_offset": 1, ...}, "payload": {"key": "value"}}
+```
+
+### `nd_json`
+
+All messages in one request, [newline-delimited JSON](https://github.com/ndjson/ndjson-spec). Best for bulk ingestion endpoints.
+
+```text
+POST /ingest Content-Type: application/x-ndjson
+{"metadata": {"iggy_offset": 1}, "payload": {"key": "value1"}}
+{"metadata": {"iggy_offset": 2}, "payload": {"key": "value2"}}
+```
+
+### `json_array`
+
+All messages as a single JSON array. Best for APIs expecting array payloads.
+
+```text
+POST /ingest Content-Type: application/json
+[{"metadata": {"iggy_offset": 1}, "payload": {"key": "value1"}}, ...]
+```
+
+### `raw`
+
+Raw bytes, one request per message. For non-JSON payloads (protobuf, binary). Metadata envelope is not applied in raw mode.
+
+```text
+POST /ingest Content-Type: application/octet-stream
+
+<raw payload bytes>
+```
+
+## Message Flow: What Goes In vs. What Comes Out
+
+The connector does **not** require or expect any particular message structure. It receives raw bytes from the Iggy runtime — whatever you published to the topic is what arrives in `consume()`. The `{metadata: {}, payload: {}}` envelope is something the **sink adds on the way out**, not something it expects on the way in.
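+
+As an illustration of that wrapping step, here is a hedged Rust sketch of building an envelope like the one documented below with `serde_json`; the function name is hypothetical, and the metadata fields are taken from this README rather than from the sink's actual internals:
+
+```rust
+use serde_json::json;
+
+/// Illustrative only: wrap an already-decoded JSON payload together with
+/// message metadata, mirroring the envelope format documented in this README.
+fn wrap_in_envelope(
+    payload: serde_json::Value,
+    stream: &str,
+    topic: &str,
+    offset: u64,
+) -> serde_json::Value {
+    json!({
+        "metadata": {
+            "iggy_stream": stream,
+            "iggy_topic": topic,
+            "iggy_offset": offset,
+        },
+        "payload": payload,
+    })
+}
+```
+
+The full path from producer to endpoint: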
+ +```text +Your app publishes: {"order_id": 123, "amount": 9.99} + | + v +Iggy stores: raw bytes of that JSON + | + v +Runtime delivers: those same raw bytes to consume() + | + v +HTTP sink wraps: {"metadata": {"iggy_offset": 0, ...}, + "payload": {"order_id": 123, "amount": 9.99}} + | + v +HTTP endpoint gets: the wrapped envelope +``` + +With `include_metadata = false`, the sink skips wrapping — your original message goes through as-is: + +```text +HTTP endpoint gets: {"order_id": 123, "amount": 9.99} +``` + +The `schema` field in `[[streams]]` controls how the sink **interprets** the incoming bytes for output formatting: + +| Schema | Interpretation | Payload in envelope | +| ------ | -------------- | ------------------- | +| `json` | Parses bytes as JSON | Embedded as JSON value | +| `text` | Treats bytes as UTF-8 string | Embedded as string | +| `raw` / `flatbuffer` / `proto` | Opaque binary | Base64-encoded with `"iggy_payload_encoding": "base64"` | + +You can publish any struct serialized in any format (JSON, protobuf, raw bytes). Set the matching `schema` in `[[streams]]`, and choose whether you want the metadata envelope (`include_metadata`) or not. + +## Metadata Envelope + +When `include_metadata = true` (default), payloads are wrapped: + +```json +{ + "metadata": { + "iggy_id": "0123456789abcdef0123456789abcdef", + "iggy_offset": 42, + "iggy_timestamp": 1710064800000000, + "iggy_stream": "my_stream", + "iggy_topic": "my_topic", + "iggy_partition_id": 0 + }, + "payload": { ... } +} +``` + +- **`iggy_id`**: Message ID formatted as 32-character lowercase hex string (no dashes) +- **Non-JSON payloads** (Raw, FlatBuffer, Proto): base64-encoded with `"iggy_payload_encoding": "base64"` in payload +- **JSON/Text payloads**: Embedded as-is + +Set `include_metadata = false` to send the raw payload without wrapping. + +## Retry Strategy + +Uses `reqwest-middleware` with `RetryTransientMiddleware` for automatic exponential backoff: + +```text +Initial request: no delay +Retry 1: retry_delay = 1s +Retry 2: retry_delay * backoff = 2s +Retry 3: retry_delay * backoff^2 = min(4s, 30s) = 4s +``` + +A custom `HttpSinkRetryStrategy` respects user-configured `success_status_codes` — codes in the success set are never retried, even if normally transient (e.g., 429 configured as "queued"). + +**Transient errors** (retry): Network errors, HTTP 429, 500, 502, 503, 504. + +**Non-transient errors** (fail immediately): HTTP 400, 401, 403, 404, 405, etc. + +**HTTP 429 `Retry-After`**: The middleware does not natively support `Retry-After` headers. When a response carries `Retry-After`, a warning is logged with the header value. The middleware uses computed exponential backoff instead. + +**Partial delivery** (`individual`/`raw` modes): If a message fails after exhausting retries, subsequent messages continue processing. After 3 consecutive HTTP failures, the remaining batch is aborted to avoid hammering a dead endpoint. + +## Use Cases + +### Webhook Delivery + +Forward stream events to webhook endpoints (Slack, PagerDuty, GitHub, custom). Use `individual` mode for one notification per event: + +```toml +[plugin_config] +url = "https://hooks.slack.com/services/T00/B00/xxx" +batch_mode = "individual" +include_metadata = false # Slack expects bare JSON payload +``` + +### REST API Ingestion + +Push data into downstream REST APIs (analytics, CRM, data warehouse loaders). 
Use `nd_json` or `json_array` for bulk efficiency: + +```toml +[plugin_config] +url = "https://analytics.example.com/v1/events" +batch_mode = "nd_json" +include_metadata = true # downstream can route by iggy_stream/iggy_topic + +[plugin_config.headers] +Authorization = "Bearer my-api-token" +``` + +### Serverless Function Trigger + +Invoke AWS Lambda, Google Cloud Functions, or Azure Functions via their HTTP endpoints: + +```toml +[plugin_config] +url = "https://abc123.execute-api.us-east-1.amazonaws.com/prod/ingest" +batch_mode = "json_array" +timeout = "10s" + +[plugin_config.headers] +x-api-key = "my-api-key" +``` + +### IoT / Sensor Data Relay + +Forward binary sensor payloads to processing services without JSON overhead: + +```toml +[[streams]] +stream = "sensors" +topics = ["temperature", "pressure"] +schema = "raw" +batch_length = 200 +poll_interval = "50ms" +consumer_group = "sensor_relay" + +[plugin_config] +url = "https://iot-gateway.example.com/ingest" +batch_mode = "raw" +max_retries = 5 +timeout = "5s" +``` + +### Multi-Service Event Fan-Out + +Route different event types to their respective microservices. See [Deployment Patterns](#deployment-patterns) for how to set this up with multiple connector instances. + +### Observability Pipeline + +Forward structured logs or metrics from Iggy streams to external observability platforms: + +```toml +[[streams]] +stream = "logs" +topics = ["application", "infrastructure", "security"] +schema = "json" +batch_length = 500 +poll_interval = "200ms" +consumer_group = "log_forwarder" + +[plugin_config] +url = "https://logs.example.com/api/v1/ingest" +batch_mode = "nd_json" +max_connections = 20 +timeout = "60s" +max_payload_size_bytes = 52428800 # 50MB for large log batches +include_metadata = true # iggy_stream/iggy_topic for routing + +[plugin_config.headers] +Authorization = "Bearer observability-token" +``` + +## Authentication + +The HTTP sink supports authentication via custom headers in `[plugin_config.headers]`. All headers are sent with every request, including health checks. + +### Bearer Token + +```toml +[plugin_config.headers] +Authorization = "Bearer eyJhbGciOiJSUzI1NiIs..." +``` + +### API Key + +```toml +[plugin_config.headers] +x-api-key = "my-secret-api-key" +``` + +### Basic Auth + +```toml +[plugin_config.headers] +# Base64-encoded "username:password" +Authorization = "Basic dXNlcm5hbWU6cGFzc3dvcmQ=" +``` + +### Multiple Auth Headers + +Some services require multiple authentication headers (e.g., API key + tenant ID): + +```toml +[plugin_config.headers] +Authorization = "Bearer token" +X-Tenant-ID = "tenant-123" +X-Client-Version = "iggy-http-sink/0.1" +``` + +### Limitations + +- **No OAuth2 / OIDC token refresh**: Bearer tokens are static. For services requiring token rotation, use an auth proxy (e.g., OAuth2 Proxy, Envoy with ext_authz) that handles token lifecycle and forwards requests to the upstream. +- **No AWS SigV4 signing**: For AWS services (API Gateway with IAM auth, S3, etc.), place the connector behind an API Gateway endpoint with API key auth, or use a signing proxy. +- **No mTLS client certificates**: Use `tls_danger_accept_invalid_certs` only for development. For production mTLS, terminate at a sidecar proxy. +- **Secrets in config file**: Header values (including tokens) are stored in plaintext in `config.toml`. Protect the config file with appropriate file permissions. Environment variable expansion in config values is not currently supported by the connector runtime. 
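+
+The limitations above concern dynamic credentials; the static headers themselves are applied once, when the HTTP client is built. Below is a minimal `reqwest` sketch of that step, with `headers` standing in for the parsed `[plugin_config.headers]` table (illustrative, not the connector's exact code):
+
+```rust
+use std::collections::HashMap;
+
+use reqwest::header::{HeaderMap, HeaderName, HeaderValue};
+
+/// Attach every configured header as a default header, so it is sent with
+/// every request (including health checks), as described above.
+fn build_client(
+    headers: &HashMap<String, String>,
+) -> Result<reqwest::Client, Box<dyn std::error::Error>> {
+    let mut map = HeaderMap::new();
+    for (name, value) in headers {
+        map.insert(
+            HeaderName::from_bytes(name.as_bytes())?,
+            HeaderValue::from_str(value)?,
+        );
+    }
+    Ok(reqwest::Client::builder().default_headers(map).build()?)
+}
+```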
+ +## Deployment Patterns + +### Connector Runtime Model + +A **connector instance** is a single OS process — the `iggy-connectors` binary loading one shared library (`libiggy_connector_http_sink.so`/`.dylib`) with one config file. Each process reads exactly one `config.toml` (set via `IGGY_CONNECTORS_CONFIG_PATH`), which defines one `[plugin_config]` block — including the target `url`, authentication headers, batch mode, and retry settings. + +Within that single process, the runtime spawns one async task per topic listed in `[[streams]]`. All tasks share the same plugin instance (and therefore the same HTTP client and `[plugin_config]`). There is no built-in orchestrator, no multi-connector-in-one-process mode, and no routing table that maps different topics to different URLs. + +How this works in the runtime source code: + +- **One consumer per topic**: `setup_sink_consumers()` in [`runtime/src/sink.rs`](../../../runtime/src/sink.rs) iterates `for topic in stream.topics.iter()` and creates a separate `IggyConsumer` for each topic. +- **One async task per consumer**: `spawn_consume_tasks()` in [`runtime/src/sink.rs`](../../../runtime/src/sink.rs) wraps each consumer in `tokio::spawn`, so topics are consumed concurrently within the same process. +- **One plugin instance per ID**: The `sink_connector!` macro in [`sdk/src/sink.rs`](../../sdk/src/sink.rs) creates a `static INSTANCES: DashMap` — each `plugin_id` passed to `iggy_sink_open` gets its own entry, and all topic tasks call `consume()` on the same instance. +- **Sequential consume within each topic**: `consume_messages()` in [`runtime/src/sink.rs`](../../../runtime/src/sink.rs) awaits `consume()` before polling the next batch — there is no pipelining within a single topic task. + +**"Deploying multiple instances"** means running N separate `iggy-connectors` processes — each with its own config directory, its own `[plugin_config]` (and therefore its own destination URL, headers, batch mode, etc.). In Docker or Kubernetes, this means N containers from the same image with different config mounts or environment variables. In systemd, N service units. In ECS, N task definitions. + +### What's Achievable Today vs. Not + +| Pattern | Achievable Today | How | +| ------- | :-: | --- | +| Single destination, single topic | Yes | One connector instance, one `[[streams]]` entry | +| Single destination, multiple topics | Yes | One connector instance, multiple topics in `[[streams]]` | +| Multiple destinations (topic-per-destination) | Yes | N connector instances, one per destination, each a separate OS process | +| Fan-out (same topic to multiple destinations) | Yes | N connector instances consuming same topic with different `consumer_group` names | +| Per-topic URL routing within one instance | **No** | Not supported — each instance has exactly one `url`. Requires N instances. See [Known Limitations](#known-limitations) item 6 | +| OAuth2 / OIDC token refresh | **No** | Static headers only. Use an auth proxy | +| mTLS client certificates | **No** | Use a sidecar proxy for mTLS termination | +| Environment variable expansion in config values | **No** | Use env var overrides at the process level (see [Environment Variable Overrides](#environment-variable-overrides)) | + +### Single Destination, Multiple Topics + +*Achievable today — single connector instance.* + +When all topics go to the same endpoint, use one connector with multiple `[[streams]]` entries. 
The downstream service can distinguish topics via the `iggy_stream` and `iggy_topic` fields in the metadata envelope. + +```text +┌─────────────────────────┐ ┌────────────────────────┐ +│ Iggy Server │ │ HTTP Endpoint │ +│ ├── stream: events │ │ POST /ingest │ +│ │ ├── topic: clicks │─────▶│ (routes internally │ +│ │ └── topic: views │ │ by iggy_topic) │ +│ └── stream: orders │ │ │ +│ └── topic: created │─────▶│ │ +└─────────────────────────┘ └────────────────────────┘ + connector-a (single instance) +``` + +**`connector-a/sink.toml`**: + +```toml +type = "sink" +key = "http" +enabled = true +version = 0 +name = "all_events" +path = "target/release/libiggy_connector_http_sink" + +[[streams]] +stream = "events" +topics = ["clicks", "views"] +schema = "json" +batch_length = 100 +poll_interval = "100ms" +consumer_group = "http_sink_events" + +[[streams]] +stream = "orders" +topics = ["created"] +schema = "json" +batch_length = 50 +poll_interval = "200ms" +consumer_group = "http_sink_orders" + +[plugin_config] +url = "https://api.example.com/ingest" +batch_mode = "nd_json" +include_metadata = true + +[plugin_config.headers] +Authorization = "Bearer shared-token" +``` + +### Multiple Destinations (One Connector Per Destination) + +*Achievable today — requires N separate OS processes.* + +When different topics need to go to different services, deploy separate connector instances. Each gets its own config directory and runs as a **separate `iggy-connectors` process** (not a config option within one process — see [Connector Runtime Model](#connector-runtime-model)). + +```text +┌───────────────────┐ +│ Iggy Server │ +│ └── stream: app │ +│ ├── clicks ──┼──▶ connector-analytics ──▶ analytics-api.example.com +│ ├── orders ──┼──▶ connector-billing ──▶ billing-api.example.com +│ └── alerts ──┼──▶ connector-slack ──▶ hooks.slack.com +└───────────────────┘ + 3 separate connector instances +``` + +**Directory layout**: + +```text +/opt/connectors/ +├── analytics/ +│ ├── config.toml # shared iggy connection settings +│ └── connectors/ +│ └── sink.toml # clicks → analytics API +├── billing/ +│ ├── config.toml +│ └── connectors/ +│ └── sink.toml # orders → billing API +└── slack/ + ├── config.toml + └── connectors/ + └── sink.toml # alerts → Slack webhook +``` + +**`analytics/connectors/sink.toml`**: + +```toml +type = "sink" +key = "http" +enabled = true +version = 0 +name = "analytics" +path = "/opt/connectors/libiggy_connector_http_sink" + +[[streams]] +stream = "app" +topics = ["clicks"] +schema = "json" +batch_length = 500 +poll_interval = "50ms" +consumer_group = "analytics_sink" + +[plugin_config] +url = "https://analytics-api.example.com/v1/events" +batch_mode = "nd_json" +max_connections = 20 + +[plugin_config.headers] +Authorization = "Bearer analytics-token" +``` + +**`billing/connectors/sink.toml`**: + +```toml +type = "sink" +key = "http" +enabled = true +version = 0 +name = "billing" +path = "/opt/connectors/libiggy_connector_http_sink" + +[[streams]] +stream = "app" +topics = ["orders"] +schema = "json" +batch_length = 50 +poll_interval = "200ms" +consumer_group = "billing_sink" + +[plugin_config] +url = "https://billing-api.example.com/v2/orders" +batch_mode = "individual" +include_metadata = false +timeout = "10s" + +[plugin_config.headers] +Authorization = "Basic YmlsbGluZzpzZWNyZXQ=" +X-Idempotency-Source = "iggy" +``` + +**`slack/connectors/sink.toml`**: + +```toml +type = "sink" +key = "http" +enabled = true +version = 0 +name = "slack_alerts" +path = 
"/opt/connectors/libiggy_connector_http_sink" + +[[streams]] +stream = "app" +topics = ["alerts"] +schema = "json" +batch_length = 1 +poll_interval = "500ms" +consumer_group = "slack_sink" + +[plugin_config] +url = "https://hooks.slack.com/services/T00/B00/xxx" +batch_mode = "individual" +include_metadata = false +max_retries = 5 +``` + +**Running** (3 processes, or 3 containers in Docker/ECS): + +```bash +IGGY_CONNECTORS_CONFIG_PATH=/opt/connectors/analytics/config.toml iggy-connectors & +IGGY_CONNECTORS_CONFIG_PATH=/opt/connectors/billing/config.toml iggy-connectors & +IGGY_CONNECTORS_CONFIG_PATH=/opt/connectors/slack/config.toml iggy-connectors & +``` + +### Fan-Out: One Topic to Multiple Destinations + +*Achievable today — requires N separate OS processes with different consumer groups.* + +When a single topic needs to be delivered to multiple HTTP endpoints (e.g., send order events to both the billing service AND an analytics pipeline), deploy multiple connector instances that consume from the **same topic with different consumer groups**. Each instance is a separate `iggy-connectors` process (see [Connector Runtime Model](#connector-runtime-model)). + +```text + connector-billing ──▶ billing-api.example.com + (consumer_group: billing_sink) +┌─────────────────┐ / +│ stream: orders │────────< +│ topic: created │ \ +└─────────────────┘ connector-analytics ──▶ analytics.example.com + (consumer_group: analytics_sink) +``` + +Each consumer group maintains its own offset, so both connectors independently receive every message. This is the standard Iggy fan-out pattern — not an antipattern. + +**Key requirement**: Each connector instance MUST use a **different `consumer_group`**. If they share a consumer group, messages are load-balanced (split) across instances rather than duplicated. + +**`billing/connectors/sink.toml`**: + +```toml +[[streams]] +stream = "orders" +topics = ["created"] +schema = "json" +consumer_group = "billing_sink" # unique consumer group + +[plugin_config] +url = "https://billing-api.example.com/v2/orders" +batch_mode = "individual" +``` + +**`analytics/connectors/sink.toml`**: + +```toml +[[streams]] +stream = "orders" +topics = ["created"] +schema = "json" +consumer_group = "analytics_sink" # different consumer group = fan-out + +[plugin_config] +url = "https://analytics.example.com/v1/events" +batch_mode = "nd_json" +``` + +### Docker / Container Deployment + +*Achievable today.* + +Each connector instance maps naturally to one container (one process = one container). Share the compiled `.so`/`.dylib` via a volume mount or bake it into the image: + +```dockerfile +FROM rust:latest AS builder +WORKDIR /app +COPY . . 
+RUN cargo build -p iggy_connector_http_sink --release
+
+FROM debian:bookworm-slim
+RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+COPY --from=builder /app/target/release/libiggy_connector_http_sink.so /opt/connector/
+COPY --from=builder /app/target/release/iggy-connectors /usr/local/bin/
+COPY config/ /opt/connector/config/
+ENV IGGY_CONNECTORS_CONFIG_PATH=/opt/connector/config/config.toml
+CMD ["iggy-connectors"]
+```
+
+For multiple destinations, run multiple containers from the same image with different config mounts:
+
+```yaml
+# docker-compose.yml
+services:
+  connector-analytics:
+    image: iggy-http-sink
+    volumes:
+      - ./analytics-config:/opt/connector/config
+    environment:
+      IGGY_CONNECTORS_CONFIG_PATH: /opt/connector/config/config.toml
+
+  connector-billing:
+    image: iggy-http-sink
+    volumes:
+      - ./billing-config:/opt/connector/config
+    environment:
+      IGGY_CONNECTORS_CONFIG_PATH: /opt/connector/config/config.toml
+```
+
+### Environment Variable Overrides
+
+The connector runtime supports overriding any config field via environment variables using the convention `IGGY_CONNECTORS_SINK_{KEY}_<SECTION>_<FIELD>`. This is useful for keeping secrets out of config files:
+
+```bash
+# Override the URL and auth token at runtime
+export IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_URL="https://prod-api.example.com/ingest"
+export IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_HEADERS_AUTHORIZATION="Bearer prod-token"
+iggy-connectors
+```
+
+## Performance Considerations
+
+### Batch Mode Selection
+
+The connector runtime calls `consume()` **sequentially** — the next poll cycle does not start until the current batch completes. Batch mode choice directly impacts throughput:
+
+| Mode | HTTP Requests per Poll | Latency per Poll | Best For |
+| ---- | ---------------------- | ----------------- | -------- |
+| `individual` | N (one per message) | N × round-trip | Low-volume webhooks, order-sensitive delivery |
+| `nd_json` | 1 | 1 × round-trip | High-throughput bulk ingestion |
+| `json_array` | 1 | 1 × round-trip | APIs expecting array payloads |
+| `raw` | N (one per message) | N × round-trip | Binary payloads (protobuf, avro) |
+
+With `batch_length=50` in `individual` mode, each poll cycle performs 50 sequential HTTP round trips. If each takes 100ms, the poll cycle takes 5 seconds — during which no new messages are consumed from that topic. Use `nd_json` or `json_array` to collapse this to a single round trip.
+
+### Memory
+
+In `nd_json` and `json_array` modes, the entire batch is serialized into memory before sending. With `batch_length=1000` and 10KB messages, this allocates ~10MB per poll cycle. The `max_payload_size_bytes` check runs **after** serialization (the batch must be built to know its size). For very large batches, tune `batch_length` and `max_payload_size_bytes` together.
+
+### Connection Pooling and Keep-Alive
+
+The connector builds one `ClientWithMiddleware` (wrapping `reqwest::Client` with retry and tracing middleware) per plugin instance in `open()`. Because the runtime calls `consume()` sequentially within each topic task, a single-topic connector uses at most **one connection at a time**. Multi-topic connectors may use up to N concurrent connections (one per topic task), since each task calls `consume()` independently.
+
+reqwest uses HTTP/1.1 persistent connections (keep-alive) by default. The connector configures:
+
+- **`max_connections`** (default: 10) — Maximum idle connections retained per host. The pool creates additional connections beyond this limit as needed — this setting only controls how many idle connections are kept warm for reuse.
+- **TCP keep-alive** (30s) — Sends TCP keep-alive probes on idle connections to detect silent drops by cloud load balancers. Without this, a connection silently closed by an intermediate LB (AWS ALB drops idle connections after ~60s, GCP after ~600s) would only be discovered on the next HTTP request, causing a failed attempt and retry delay.
+- **Pool idle timeout** (90s) — Closes connections unused for 90 seconds to prevent stale connection accumulation in the pool.
+
+Because `reqwest::Client` clones are cheap (they share the same connection pool via `Arc`), all topic tasks within a single connector process share one pool. This means multi-topic connectors benefit from connection reuse when all topics target the same host — a connection returned to the pool by topic A's task can be reused by topic B's task.
+
+For multiple connector instances (separate processes), each process has its own independent `reqwest::Client` and its own connection pool. There is no cross-process connection sharing.
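+
+For reference, the pool behavior described above maps onto standard `reqwest::ClientBuilder` settings. A hedged sketch using the defaults documented in this README (the sink's real `open()` may differ):
+
+```rust
+use std::time::Duration;
+
+/// Illustrative client construction with the documented pool settings:
+/// max_connections idle connections per host, 30s TCP keep-alive probes,
+/// and a 90s idle timeout before pooled connections are closed.
+fn build_pooled_client(
+    max_connections: usize,
+    timeout: Duration,
+) -> reqwest::Result<reqwest::Client> {
+    reqwest::Client::builder()
+        .pool_max_idle_per_host(max_connections)
+        .tcp_keepalive(Duration::from_secs(30))
+        .pool_idle_timeout(Duration::from_secs(90))
+        .timeout(timeout)
+        .build()
+}
+```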
+ +### Retry Impact on Throughput + +Each failed message in `individual`/`raw` mode burns through the retry budget (default: 3 retries with exponential backoff up to 30s) before moving to the next message. The backoff delays are 1s + 2s + 4s = 7 seconds per message, but each attempt also incurs the request timeout (default 30s) for a dead endpoint. Worst case per message: 4 attempts × 30s timeout + 7s backoff = 127 seconds. + +The consecutive failure abort (`MAX_CONSECUTIVE_FAILURES = 3`) mitigates this: after 3 consecutive HTTP failures, remaining messages in the batch are skipped. This limits worst-case blocking to: 3 × (4 × 30s + 1s + 2s + 4s) = 381 seconds with default timeout, or 3 × 7s = 21 seconds of backoff delay alone. + +### Multiple Instances vs. Single Instance + +Multiple connector instances (one per destination) provide: + +- **Performance isolation**: A slow destination doesn't block other topics +- **Failure isolation**: One dead endpoint doesn't affect unrelated connectors +- **Independent tuning**: Different `batch_length`, `timeout`, `max_retries` per destination +- **Security isolation**: Each instance has its own credentials; compromise of one config doesn't expose others +- **Independent scaling**: Scale high-volume connectors without over-provisioning low-volume ones + +The overhead of multiple processes is minimal — each connector is a lightweight async runtime with low memory footprint at idle. + +## Example Configs + +### Lambda Webhook + +```toml +[plugin_config] +url = "https://abc123.execute-api.us-east-1.amazonaws.com/prod/ingest" +method = "POST" +batch_mode = "json_array" +timeout = "10s" +include_metadata = true + +[plugin_config.headers] +x-api-key = "my-api-key" +``` + +### High-Throughput Bulk Ingestion + +```toml +[plugin_config] +url = "https://ingest.example.com/bulk" +method = "POST" +batch_mode = "nd_json" +max_connections = 20 +timeout = "60s" +max_payload_size_bytes = 52428800 +``` + +## Testing + +Unit tests (no external dependencies): + +```bash +cargo test -p iggy_connector_http_sink +``` + +Integration tests (requires Docker for WireMock container): + +```bash +cargo test -p integration --test connectors -- http_sink +``` + +## Delivery Semantics + +All retry logic lives inside `consume()`. The connector runtime invokes `consume()` via an FFI callback that returns an `i32` status code. The runtime does not inspect this return value (see `process_messages()` in `runtime/src/sink.rs`), so errors logged by the sink are not propagated to the runtime's retry or alerting mechanisms. Additionally, consumer group offsets are committed before processing ([runtime issue #1](#known-limitations)). This means: + +- Failed messages are **not retried by the runtime** — only by the sink's internal retry loop +- Messages are committed **before delivery** — a crash after commit but before delivery loses messages + +The effective delivery guarantee is **at-most-once** at the runtime level. The sink's internal retries provide best-effort delivery within each `consume()` call. + +## Known Limitations + +1. **Runtime ignores `consume()` status**: The connector runtime invokes `consume()` via an FFI callback returning `i32`. The `process_messages()` function in `runtime/src/sink.rs` does not inspect the return value. Errors are logged internally by the sink but do not trigger runtime-level retry or alerting. ([#2927](https://github.com/apache/iggy/issues/2927)) + +2. 
**Offsets committed before processing**: The `PollingMessages` auto-commit strategy commits consumer group offsets before `consume()` is called. Combined with limitation 1, at-least-once delivery is not achievable. ([#2928](https://github.com/apache/iggy/issues/2928)) + +3. **`Retry-After` header not used for backoff**: The `reqwest-middleware` retry layer uses computed exponential backoff. `Retry-After` headers are logged as warnings but do not influence retry timing. + +4. **No dead letter queue**: Failed messages are logged at `error!` level but not persisted to a DLQ. DLQ support would be a runtime-level feature. + +5. **No request signing**: AWS SigV4, HMAC, or other signing schemes are not supported. Use custom headers or an auth proxy for signed endpoints. + +6. **No per-topic URL routing**: All topics configured in a single connector instance share the same `url`. For topic-specific routing, deploy separate connector instances (see [Deployment Patterns](#deployment-patterns)). A future enhancement could add a `[plugin_config.routing]` table for URL-per-topic within a single instance. + +7. **No OAuth2 token refresh**: Bearer tokens are static. Use an auth proxy for services requiring automatic token rotation. + +8. **No environment variable expansion in config values**: Secrets in `[plugin_config.headers]` are stored as plaintext. Use environment variable overrides (see [Environment Variable Overrides](#environment-variable-overrides)) or mount secrets from a secrets manager. diff --git a/core/connectors/sinks/http_sink/config.toml b/core/connectors/sinks/http_sink/config.toml new file mode 100644 index 0000000000..4aa5c7d504 --- /dev/null +++ b/core/connectors/sinks/http_sink/config.toml @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +type = "sink" +key = "http" +enabled = true +version = 0 +name = "HTTP sink" +path = "../../target/release/libiggy_connector_http_sink" +verbose = false + +[[streams]] +stream = "my_stream" +topics = ["my_topic"] +schema = "json" +batch_length = 50 +poll_interval = "100ms" +consumer_group = "http_sink_group" + +[plugin_config] +# Required — target URL for HTTP requests. +url = "https://api.example.com/ingest" + +# HTTP method (default: POST). Valid: GET, HEAD, POST, PUT, PATCH, DELETE. +method = "POST" + +# Request timeout (default: 30s). +timeout = "30s" + +# Maximum HTTP body size in bytes (default: 10MB). Set to 0 to disable. +max_payload_size_bytes = 10485760 + +# Payload formatting mode (default: individual). 
+# - "individual": one HTTP request per message +# - "nd_json": newline-delimited JSON, all messages in one request +# - "json_array": JSON array of messages in one request +# - "raw": raw bytes, individual requests only +batch_mode = "nd_json" + +# Include Iggy metadata envelope (default: true). +include_metadata = true + +# Include message checksum in metadata (default: false). +include_checksum = false + +# Include origin timestamp in metadata (default: false). +include_origin_timestamp = false + +# Health check — opt-in, disabled by default. +# Many endpoints (Lambda, API Gateway) don't support HEAD/OPTIONS. +health_check_enabled = false +health_check_method = "HEAD" + +# Retry configuration. +max_retries = 3 +retry_delay = "1s" +retry_backoff_multiplier = 2 +max_retry_delay = "30s" + +# HTTP status codes considered successful (default: [200, 201, 202, 204]). +success_status_codes = [200, 201, 202, 204] + +# TLS — accept invalid certs (default: false). Use only for development. +tls_danger_accept_invalid_certs = false + +# Connection pool — max idle connections per host (default: 10). +max_connections = 10 + +# Verbose request/response logging (default: false). +verbose_logging = false + +# Custom HTTP headers. Replace placeholder values with real credentials. +# Do not commit actual secrets — use environment variable overrides for production. +[plugin_config.headers] +Authorization = "Bearer " +X-Custom-Header = "custom-value" diff --git a/core/connectors/sinks/http_sink/src/lib.rs b/core/connectors/sinks/http_sink/src/lib.rs new file mode 100644 index 0000000000..e29f4527db --- /dev/null +++ b/core/connectors/sinks/http_sink/src/lib.rs @@ -0,0 +1,2069 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use async_trait::async_trait; +use base64::Engine; +use base64::engine::general_purpose; +use bytes::Bytes; +use humantime::Duration as HumanDuration; +use iggy_connector_sdk::{ + ConsumedMessage, Error, MessagesMetadata, Payload, Sink, TopicMetadata, + convert::owned_value_to_serde_json, sink_connector, +}; +use reqwest_middleware::{ClientBuilder, ClientWithMiddleware}; +use reqwest_retry::{ + RetryTransientMiddleware, Retryable, RetryableStrategy, policies::ExponentialBackoff, +}; +use reqwest_tracing::TracingMiddleware; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::{debug, error, info, warn}; + +sink_connector!(HttpSink); + +const DEFAULT_TIMEOUT: &str = "30s"; +const DEFAULT_RETRY_DELAY: &str = "1s"; +const DEFAULT_MAX_RETRY_DELAY: &str = "30s"; +const DEFAULT_MAX_RETRIES: u32 = 3; +const DEFAULT_BACKOFF_MULTIPLIER: u32 = 2; +const DEFAULT_MAX_PAYLOAD_SIZE: u64 = 10 * 1024 * 1024; // 10 MB +const DEFAULT_MAX_CONNECTIONS: usize = 10; +/// TCP keep-alive interval for detecting dead connections behind load balancers. +/// Cloud LBs silently drop idle connections (AWS ALB ~60s, GCP ~600s); +/// probing at 30s detects these before requests fail. +const DEFAULT_TCP_KEEPALIVE_SECS: u64 = 30; +/// Close pooled connections unused for this long. Prevents stale connections +/// from accumulating when traffic is bursty. +const DEFAULT_POOL_IDLE_TIMEOUT_SECS: u64 = 90; +/// Abort remaining messages in individual/raw mode after this many consecutive HTTP failures. +/// Prevents hammering a dead endpoint with N sequential retry cycles per poll. +const MAX_CONSECUTIVE_FAILURES: u32 = 3; + +const ENCODING_BASE64: &str = "base64"; + +/// HTTP method enum — validated at deserialization, prevents invalid values like "DELEET" or "GETX". +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum HttpMethod { + Get, + Head, + #[default] + Post, + Put, + Patch, + Delete, +} + +/// Payload formatting mode for HTTP requests. +#[derive( + Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, strum_macros::Display, +)] +#[serde(rename_all = "snake_case")] +pub enum BatchMode { + /// One HTTP request per message (default). Note: with batch_length=50, this produces 50 + /// sequential HTTP round trips per poll cycle. Use ndjson or json_array for higher throughput. + #[default] + #[strum(to_string = "individual")] + Individual, + /// All messages in one request, newline-delimited JSON. + #[strum(to_string = "NDJSON")] + NdJson, + /// All messages as a single JSON array. + #[strum(to_string = "JSON array")] + JsonArray, + /// Raw bytes, one request per message (for non-JSON payloads). + #[strum(to_string = "raw")] + Raw, +} + +impl BatchMode { + /// Determine the Content-Type header based on batch mode. + fn content_type(&self) -> &'static str { + match self { + BatchMode::Individual | BatchMode::JsonArray => "application/json", + BatchMode::NdJson => "application/x-ndjson", + BatchMode::Raw => "application/octet-stream", + } + } +} + +/// Metadata envelope wrapping a payload with Iggy message metadata. +#[derive(Debug, Serialize)] +struct MetadataEnvelope { + metadata: IggyMetadata, + payload: serde_json::Value, +} + +/// Iggy message metadata fields. 
+#[derive(Debug, Serialize)]
+struct IggyMetadata {
+    iggy_id: String,
+    iggy_offset: u64,
+    iggy_timestamp: u64,
+    iggy_stream: String,
+    iggy_topic: String,
+    iggy_partition_id: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    iggy_checksum: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    iggy_origin_timestamp: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    iggy_headers: Option<serde_json::Map<String, serde_json::Value>>,
+}
+
+/// Binary payload with base64 encoding marker.
+#[derive(Debug, Serialize)]
+struct EncodedPayload {
+    data: String,
+    iggy_payload_encoding: &'static str,
+}
+
+/// Binary header value with base64 encoding marker.
+#[derive(Debug, Serialize)]
+struct EncodedHeader {
+    data: String,
+    iggy_header_encoding: &'static str,
+}
+
+/// Configuration for the HTTP sink connector, deserialized from [plugin_config] in config.toml.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct HttpSinkConfig {
+    /// Target URL for HTTP requests (required).
+    pub url: String,
+    /// HTTP method (default: POST).
+    pub method: Option<HttpMethod>,
+    /// Request timeout as a human-readable duration string, e.g. "30s" (default: 30s).
+    pub timeout: Option<String>,
+    /// Maximum HTTP body size in bytes (default: 10MB). Set to 0 to disable.
+    pub max_payload_size_bytes: Option<u64>,
+    /// Custom HTTP headers.
+    pub headers: Option<HashMap<String, String>>,
+    /// Payload formatting mode (default: individual).
+    pub batch_mode: Option<BatchMode>,
+    /// Include Iggy metadata envelope in payload (default: true).
+    pub include_metadata: Option<bool>,
+    /// Include message checksum in metadata (default: false).
+    pub include_checksum: Option<bool>,
+    /// Include origin timestamp in metadata (default: false).
+    pub include_origin_timestamp: Option<bool>,
+    /// Enable health check request in open() (default: false).
+    pub health_check_enabled: Option<bool>,
+    /// HTTP method for health check (default: HEAD).
+    pub health_check_method: Option<HttpMethod>,
+    /// Maximum number of retries for transient errors (default: 3).
+    pub max_retries: Option<u32>,
+    /// Retry delay as a human-readable duration string, e.g. "1s" (default: 1s).
+    pub retry_delay: Option<String>,
+    /// Backoff multiplier for exponential retry delay (default: 2).
+    pub retry_backoff_multiplier: Option<u32>,
+    /// Maximum retry delay cap as a human-readable duration string (default: 30s).
+    pub max_retry_delay: Option<String>,
+    /// HTTP status codes considered successful (default: [200, 201, 202, 204]).
+    pub success_status_codes: Option<Vec<u16>>,
+    /// Accept invalid TLS certificates (default: false). Named to signal danger.
+    pub tls_danger_accept_invalid_certs: Option<bool>,
+    /// Maximum idle connections per host (default: 10).
+    pub max_connections: Option<usize>,
+    /// Enable verbose request/response logging (default: false).
+    pub verbose_logging: Option<bool>,
+}
+
+/// HTTP sink connector that delivers consumed messages to any HTTP endpoint.
+///
+/// Lifecycle: `new()` → `open()` → `consume()` (repeated) → `close()`.
+/// The `reqwest::Client` is built in `open()` (not `new()`) so that config-derived
+/// settings (timeout, TLS, connection pool) are applied. This matches the
+/// MongoDB/Elasticsearch/PostgreSQL sink initialization pattern.
+#[derive(Debug)]
+pub struct HttpSink {
+    id: u32,
+    url: String,
+    method: HttpMethod,
+    timeout: Duration,
+    max_payload_size_bytes: u64,
+    headers: HashMap<String, String>,
+    batch_mode: BatchMode,
+    include_metadata: bool,
+    include_checksum: bool,
+    include_origin_timestamp: bool,
+    health_check_enabled: bool,
+    health_check_method: HttpMethod,
+    max_retries: u32,
+    retry_delay: Duration,
+    retry_backoff_multiplier: u32,
+    max_retry_delay: Duration,
+    success_status_codes: HashSet<u16>,
+    tls_danger_accept_invalid_certs: bool,
+    max_connections: usize,
+    verbose: bool,
+    /// Pre-built HTTP headers (excluding Content-Type). Built once in `open()` from validated
+    /// `self.headers`, reused for every request. `None` before `open()` is called.
+    request_headers: Option<reqwest::header::HeaderMap>,
+    /// Initialized in `open()` with config-derived settings. `None` before `open()` is called.
+    client: Option<ClientWithMiddleware>,
+    send_attempts: AtomicU64,
+    messages_delivered: AtomicU64,
+    errors_count: AtomicU64,
+    /// Epoch seconds of last successful HTTP request.
+    last_success_timestamp: AtomicU64,
+}
+
+impl HttpSink {
+    pub fn new(id: u32, config: HttpSinkConfig) -> Self {
+        let url = config.url;
+        let method = config.method.unwrap_or_default();
+        let timeout = parse_duration(config.timeout.as_deref(), DEFAULT_TIMEOUT);
+        let max_payload_size_bytes = config
+            .max_payload_size_bytes
+            .unwrap_or(DEFAULT_MAX_PAYLOAD_SIZE);
+        let headers = config.headers.unwrap_or_default();
+        let batch_mode = config.batch_mode.unwrap_or_default();
+        let include_metadata = config.include_metadata.unwrap_or(true);
+        let include_checksum = config.include_checksum.unwrap_or(false);
+        let include_origin_timestamp = config.include_origin_timestamp.unwrap_or(false);
+        let health_check_enabled = config.health_check_enabled.unwrap_or(false);
+        let health_check_method = config.health_check_method.unwrap_or(HttpMethod::Head);
+        let max_retries = config.max_retries.unwrap_or(DEFAULT_MAX_RETRIES);
+        let mut retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY);
+        let retry_backoff_multiplier = config
+            .retry_backoff_multiplier
+            .unwrap_or(DEFAULT_BACKOFF_MULTIPLIER)
+            .max(1);
+        let mut max_retry_delay =
+            parse_duration(config.max_retry_delay.as_deref(), DEFAULT_MAX_RETRY_DELAY);
+        let success_status_codes: HashSet<u16> = config
+            .success_status_codes
+            .unwrap_or_else(|| vec![200, 201, 202, 204])
+            .into_iter()
+            .collect();
+        let tls_danger_accept_invalid_certs =
+            config.tls_danger_accept_invalid_certs.unwrap_or(false);
+        let max_connections = config.max_connections.unwrap_or(DEFAULT_MAX_CONNECTIONS);
+        let verbose = config.verbose_logging.unwrap_or(false);
+
+        if retry_delay > max_retry_delay {
+            warn!(
+                "HTTP sink ID: {} — retry_delay ({:?}) exceeds max_retry_delay ({:?}). \
+                 Swapping values to prevent ExponentialBackoff panic.",
+                id, retry_delay, max_retry_delay,
+            );
+            std::mem::swap(&mut retry_delay, &mut max_retry_delay);
+        }
+
+        if tls_danger_accept_invalid_certs {
+            warn!(
+                "HTTP sink ID: {} — tls_danger_accept_invalid_certs is enabled. \
+                 TLS certificate validation is DISABLED.",
+                id
+            );
+        }
+
+        if batch_mode == BatchMode::Raw && include_metadata {
+            warn!(
+                "HTTP sink ID: {} — batch_mode=raw ignores include_metadata. \
+                 Raw mode sends payload bytes directly without metadata envelope.",
+                id
+            );
+        }
+
+        if matches!(method, HttpMethod::Get | HttpMethod::Head)
+            && batch_mode != BatchMode::Individual
+        {
+            warn!(
+                "HTTP sink ID: {} — {:?} with batch_mode={:?} will send a request body. \
+                 Some servers may reject GET/HEAD requests with a body.",
+                id, method, batch_mode,
+            );
+        }
+
+        HttpSink {
+            id,
+            url,
+            method,
+            timeout,
+            max_payload_size_bytes,
+            headers,
+            batch_mode,
+            include_metadata,
+            include_checksum,
+            include_origin_timestamp,
+            health_check_enabled,
+            health_check_method,
+            max_retries,
+            retry_delay,
+            retry_backoff_multiplier,
+            max_retry_delay,
+            success_status_codes,
+            tls_danger_accept_invalid_certs,
+            max_connections,
+            verbose,
+            request_headers: None,
+            client: None,
+            send_attempts: AtomicU64::new(0),
+            messages_delivered: AtomicU64::new(0),
+            errors_count: AtomicU64::new(0),
+            last_success_timestamp: AtomicU64::new(0),
+        }
+    }
+
+    /// Build the `reqwest::Client` wrapped with retry and tracing middleware.
+    fn build_client(&self) -> Result<ClientWithMiddleware, Error> {
+        let raw_client = reqwest::Client::builder()
+            .timeout(self.timeout)
+            .pool_max_idle_per_host(self.max_connections)
+            .pool_idle_timeout(Duration::from_secs(DEFAULT_POOL_IDLE_TIMEOUT_SECS))
+            .tcp_keepalive(Duration::from_secs(DEFAULT_TCP_KEEPALIVE_SECS))
+            .danger_accept_invalid_certs(self.tls_danger_accept_invalid_certs)
+            .build()
+            .map_err(|e| Error::InitError(format!("Failed to build HTTP client: {}", e)))?;
+
+        let retry_policy = ExponentialBackoff::builder()
+            .retry_bounds(self.retry_delay, self.max_retry_delay)
+            .base(self.retry_backoff_multiplier)
+            .build_with_max_retries(self.max_retries);
+
+        let retry_strategy = HttpSinkRetryStrategy {
+            success_status_codes: self.success_status_codes.clone(),
+        };
+
+        let retry_middleware =
+            RetryTransientMiddleware::new_with_policy_and_strategy(retry_policy, retry_strategy);
+
+        Ok(ClientBuilder::new(raw_client)
+            .with(TracingMiddleware::default())
+            .with(retry_middleware)
+            .build())
+    }
+
+    /// Returns the initialized HTTP client, or an error if `open()` was not called.
+    fn client(&self) -> Result<&ClientWithMiddleware, Error> {
+        self.client.as_ref().ok_or_else(|| {
+            Error::InitError("HTTP client not initialized — was open() called?".to_string())
+        })
+    }
+
+    /// Convert a `Payload` to a JSON value for metadata wrapping.
+    /// Non-JSON payloads are base64-encoded with a `iggy_payload_encoding` marker.
+    ///
+    /// Note: All current `Payload` variants produce infallible conversions.
+    /// The `Result` return type exists as a safety net for future variants.
+    fn payload_to_json(&self, payload: Payload) -> Result<serde_json::Value, Error> {
+        match payload {
+            Payload::Json(value) => {
+                // Direct structural conversion (not serialization roundtrip).
+                // Follows the Elasticsearch sink pattern. NaN/Infinity f64 → null.
+                Ok(owned_value_to_serde_json(&value))
+            }
+            Payload::Text(text) => Ok(serde_json::Value::String(text)),
+            Payload::Raw(bytes) | Payload::FlatBuffer(bytes) => {
+                let encoded = EncodedPayload {
+                    data: general_purpose::STANDARD.encode(&bytes),
+                    iggy_payload_encoding: ENCODING_BASE64,
+                };
+                serde_json::to_value(encoded)
+                    .map_err(|e| Error::Serialization(format!("EncodedPayload: {}", e)))
+            }
+            Payload::Proto(proto_str) => {
+                let encoded = EncodedPayload {
+                    data: general_purpose::STANDARD.encode(proto_str.as_bytes()),
+                    iggy_payload_encoding: ENCODING_BASE64,
+                };
+                serde_json::to_value(encoded)
+                    .map_err(|e| Error::Serialization(format!("EncodedPayload: {}", e)))
+            }
+        }
+    }
+
+    /// Build a message envelope with optional metadata wrapping.
+    fn build_envelope(
+        &self,
+        message: &ConsumedMessage,
+        topic_metadata: &TopicMetadata,
+        messages_metadata: &MessagesMetadata,
+        payload_json: serde_json::Value,
+    ) -> Result<serde_json::Value, Error> {
+        if !self.include_metadata {
+            return Ok(payload_json);
+        }
+
+        let headers_map = if let Some(ref headers) = message.headers
+            && !headers.is_empty()
+        {
+            let map: serde_json::Map<String, serde_json::Value> = headers
+                .iter()
+                .map(|(k, v)| {
+                    // Raw bytes: base64-encode to avoid Rust debug format in JSON output.
+                    // as_raw() returns Ok only for HeaderKind::Raw.
+                    let value = if let Ok(raw) = v.as_raw() {
+                        let encoded = EncodedHeader {
+                            data: general_purpose::STANDARD.encode(raw),
+                            iggy_header_encoding: ENCODING_BASE64,
+                        };
+                        serde_json::to_value(encoded)
+                            .map_err(|e| Error::Serialization(format!("EncodedHeader: {}", e)))?
+                    } else {
+                        serde_json::Value::String(v.to_string_value())
+                    };
+                    Ok((k.to_string_value(), value))
+                })
+                .collect::<Result<serde_json::Map<String, serde_json::Value>, Error>>()?;
+            Some(map)
+        } else {
+            None
+        };
+
+        let metadata = IggyMetadata {
+            iggy_id: format_u128_as_hex(message.id),
+            iggy_offset: message.offset,
+            iggy_timestamp: message.timestamp,
+            iggy_stream: topic_metadata.stream.clone(),
+            iggy_topic: topic_metadata.topic.clone(),
+            iggy_partition_id: messages_metadata.partition_id,
+            iggy_checksum: if self.include_checksum {
+                Some(message.checksum)
+            } else {
+                None
+            },
+            iggy_origin_timestamp: if self.include_origin_timestamp {
+                Some(message.origin_timestamp)
+            } else {
+                None
+            },
+            iggy_headers: headers_map,
+        };
+
+        let envelope = MetadataEnvelope {
+            metadata,
+            payload: payload_json,
+        };
+
+        serde_json::to_value(envelope)
+            .map_err(|e| Error::Serialization(format!("MetadataEnvelope: {}", e)))
+    }
+
+    /// Record a successful request timestamp.
+    fn record_success(&self) {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap_or_default()
+            .as_secs();
+        self.last_success_timestamp.store(now, Ordering::Relaxed);
+    }
+
+    /// Send an HTTP request with retry via reqwest-middleware. Returns Ok on success,
+    /// Err after exhausting retries. Retry logic (backoff, transient classification)
+    /// is handled by the middleware configured in `build_client()`.
+    async fn send_with_retry(&self, body: Bytes, content_type: &str) -> Result<(), Error> {
+        let client = self.client()?;
+        let headers = self.request_headers.as_ref().ok_or_else(|| {
+            Error::InitError("HTTP headers not initialized — was open() called?".to_string())
+        })?;
+
+        if self.verbose {
+            debug!(
+                "HTTP sink ID: {} — sending {:?} {} ({} bytes)",
+                self.id,
+                self.method,
+                self.url,
+                body.len(),
+            );
+        }
+
+        self.send_attempts.fetch_add(1, Ordering::Relaxed);
+
+        let response = build_request(self.method, client, &self.url)
+            .headers(headers.clone())
+            .header("content-type", content_type)
+            .body(body)
+            .send()
+            .await
+            .map_err(|e| {
+                self.errors_count.fetch_add(1, Ordering::Relaxed);
+                error!(
+                    "HTTP sink ID: {} — request to {} failed after middleware retries: {:#}",
+                    self.id, self.url, e
+                );
+                Error::HttpRequestFailed(format!("HTTP {} — {}", self.url, e))
+            })?;
+
+        let status = response.status();
+        // success_status_codes is checked in BOTH the retry strategy (to stop retrying)
+        // AND here (to classify the final response). Both must use the same set.
+        if self.success_status_codes.contains(&status.as_u16()) {
+            if self.verbose {
+                debug!(
+                    "HTTP sink ID: {} — success (status {})",
+                    self.id,
+                    status.as_u16()
+                );
+            }
+            self.record_success();
+            return Ok(());
+        }
+
+        // Non-success status after middleware exhausted retries — read body for diagnostics
+        let response_body = match response.text().await {
+            Ok(body) => body,
+            Err(e) => format!("<failed to read response body: {}>", e),
+        };
+
+        error!(
+            "HTTP sink ID: {} — request failed (status {}). Response: {}",
+            self.id,
+            status.as_u16(),
+            truncate_response(&response_body, 500),
+        );
+        self.errors_count.fetch_add(1, Ordering::Relaxed);
+        Err(Error::HttpRequestFailed(format!(
+            "HTTP {} — status: {}",
+            self.url,
+            status.as_u16()
+        )))
+    }
+
+    /// Shared per-message send loop for `individual` and `raw` modes.
+    ///
+    /// Iterates `messages`, builds a body for each via `build_body`, enforces payload size
+    /// limits, sends via `send_with_retry`, and tracks partial delivery.
+    /// Aborts after `MAX_CONSECUTIVE_FAILURES` consecutive HTTP failures.
+    ///
+    /// `build_body` takes ownership of each `ConsumedMessage` — callers must extract
+    /// all needed fields (payload, metadata) within the closure.
+    async fn send_per_message<F>(
+        &self,
+        messages: Vec<ConsumedMessage>,
+        content_type: &str,
+        mut build_body: F,
+    ) -> Result<(), Error>
+    where
+        F: FnMut(ConsumedMessage) -> Result<Vec<u8>, Error>,
+    {
+        let total = messages.len();
+        let mut delivered = 0u64;
+        let mut http_failures = 0u64;
+        let mut serialization_failures = 0u64;
+        let mut consecutive_failures = 0u32;
+        let mut last_error: Option<Error> = None;
+
+        for message in messages {
+            let offset = message.offset;
+            let body = match build_body(message) {
+                Ok(b) => b,
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — failed to build {} body at offset {}: {}",
+                        self.id, self.batch_mode, offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    serialization_failures += 1;
+                    last_error = Some(e);
+                    continue;
+                }
+            };
+
+            if self.max_payload_size_bytes > 0 && body.len() as u64 > self.max_payload_size_bytes {
+                error!(
+                    "HTTP sink ID: {} — {} payload at offset {} exceeds max size ({} > {} bytes). \
Skipping.", + self.id, + self.batch_mode, + offset, + body.len(), + self.max_payload_size_bytes, + ); + self.errors_count.fetch_add(1, Ordering::Relaxed); + serialization_failures += 1; + last_error = Some(Error::HttpRequestFailed(format!( + "Payload exceeds max size: {} bytes", + body.len() + ))); + continue; + } + + match self.send_with_retry(Bytes::from(body), content_type).await { + Ok(()) => { + delivered += 1; + consecutive_failures = 0; + } + Err(e) => { + error!( + "HTTP sink ID: {} — failed to deliver {} message at offset {} after retries: {}", + self.id, self.batch_mode, offset, e + ); + http_failures += 1; + consecutive_failures += 1; + last_error = Some(e); + + if consecutive_failures >= MAX_CONSECUTIVE_FAILURES { + let processed = delivered + http_failures + serialization_failures; + debug_assert!( + processed <= total as u64, + "processed ({processed}) > total ({total}) — accounting bug" + ); + let skipped = (total as u64).saturating_sub(processed); + error!( + "HTTP sink ID: {} — aborting {} batch after {} consecutive HTTP failures \ + ({} remaining messages skipped)", + self.id, self.batch_mode, consecutive_failures, skipped, + ); + self.errors_count.fetch_add(skipped, Ordering::Relaxed); + break; + } + } + } + } + + self.messages_delivered + .fetch_add(delivered, Ordering::Relaxed); + + match last_error { + Some(e) => { + error!( + "HTTP sink ID: {} — partial {} delivery: {}/{} delivered, \ + {} HTTP failures, {} serialization errors", + self.id, + self.batch_mode, + delivered, + total, + http_failures, + serialization_failures, + ); + Err(e) + } + None => Ok(()), + } + } + + /// Send messages in `individual` mode — one HTTP request per message. + async fn send_individual( + &self, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + messages: Vec, + ) -> Result<(), Error> { + self.send_per_message(messages, self.batch_mode.content_type(), |mut message| { + let payload = std::mem::replace(&mut message.payload, Payload::Raw(vec![])); + let payload_json = self.payload_to_json(payload)?; + let envelope = + self.build_envelope(&message, topic_metadata, messages_metadata, payload_json)?; + serde_json::to_vec(&envelope) + .map_err(|e| Error::Serialization(format!("Envelope serialize: {}", e))) + }) + .await + } + + /// Sends a batch body and updates delivery/error accounting. + /// + /// Shared by `send_ndjson` and `send_json_array` — the post-send accounting logic + /// (error propagation, skip warnings) is identical across batch modes. + async fn send_batch_body(&self, body: Bytes, count: u64, skipped: u64) -> Result<(), Error> { + debug_assert!( + count > 0, + "send_batch_body called with count=0 — callers must guard against empty batches" + ); + if let Err(e) = self + .send_with_retry(body, self.batch_mode.content_type()) + .await + { + // send_with_retry already added 1 to errors_count for the HTTP failure. + // Add the remaining messages that were serialized but not delivered. + if count > 1 { + self.errors_count.fetch_add(count - 1, Ordering::Relaxed); + } + if skipped > 0 { + error!( + "HTTP sink ID: {} — {} batch failed with {} serialization skips", + self.id, self.batch_mode, skipped, + ); + } + return Err(e); + } + self.messages_delivered.fetch_add(count, Ordering::Relaxed); + if skipped > 0 { + warn!( + "HTTP sink ID: {} — {} batch: {} delivered, {} skipped (serialization errors)", + self.id, self.batch_mode, count, skipped, + ); + } + Ok(()) + } + + /// Send messages in `ndjson` mode — all messages in one request, newline-delimited. 
+    /// Skips individual messages that fail serialization rather than aborting the batch.
+    async fn send_ndjson(
+        &self,
+        topic_metadata: &TopicMetadata,
+        messages_metadata: &MessagesMetadata,
+        messages: Vec<ConsumedMessage>,
+    ) -> Result<(), Error> {
+        let mut lines = Vec::with_capacity(messages.len());
+        let mut skipped = 0u64;
+
+        for mut message in messages {
+            let payload = std::mem::replace(&mut message.payload, Payload::Raw(vec![]));
+            let payload_json = match self.payload_to_json(payload) {
+                Ok(json) => json,
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — skipping message at offset {} in NDJSON batch: {}",
+                        self.id, message.offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    skipped += 1;
+                    continue;
+                }
+            };
+            let envelope = match self.build_envelope(
+                &message,
+                topic_metadata,
+                messages_metadata,
+                payload_json,
+            ) {
+                Ok(env) => env,
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — skipping message at offset {} in NDJSON batch (envelope): {}",
+                        self.id, message.offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    skipped += 1;
+                    continue;
+                }
+            };
+            match serde_json::to_string(&envelope) {
+                Ok(line) => lines.push(line),
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — skipping message at offset {} in NDJSON batch (serialize): {}",
+                        self.id, message.offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    skipped += 1;
+                    continue;
+                }
+            }
+        }
+
+        if lines.is_empty() {
+            return Err(Error::Serialization(
+                "All messages in NDJSON batch failed serialization".to_string(),
+            ));
+        }
+
+        let count = lines.len() as u64;
+
+        let mut body_str = lines.join("\n");
+        body_str.push('\n'); // NDJSON spec requires trailing newline
+        let body = body_str.into_bytes();
+
+        if self.max_payload_size_bytes > 0 && body.len() as u64 > self.max_payload_size_bytes {
+            error!(
+                "HTTP sink ID: {} — NDJSON batch exceeds max payload size ({} > {} bytes)",
+                self.id,
+                body.len(),
+                self.max_payload_size_bytes,
+            );
+            // Count all successfully-serialized messages as errors (skipped already counted individually)
+            self.errors_count.fetch_add(count, Ordering::Relaxed);
+            return Err(Error::HttpRequestFailed(format!(
+                "NDJSON batch exceeds max size: {} bytes",
+                body.len()
+            )));
+        }
+
+        self.send_batch_body(Bytes::from(body), count, skipped)
+            .await
+    }
+
+    /// Send messages in `json_array` mode — all messages as a single JSON array.
+    /// Skips individual messages that fail serialization rather than aborting the batch.
+    async fn send_json_array(
+        &self,
+        topic_metadata: &TopicMetadata,
+        messages_metadata: &MessagesMetadata,
+        messages: Vec<ConsumedMessage>,
+    ) -> Result<(), Error> {
+        let mut envelopes = Vec::with_capacity(messages.len());
+        let mut skipped = 0u64;
+
+        for mut message in messages {
+            let payload = std::mem::replace(&mut message.payload, Payload::Raw(vec![]));
+            let payload_json = match self.payload_to_json(payload) {
+                Ok(json) => json,
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — skipping message at offset {} in JSON array batch: {}",
+                        self.id, message.offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    skipped += 1;
+                    continue;
+                }
+            };
+            let envelope = match self.build_envelope(
+                &message,
+                topic_metadata,
+                messages_metadata,
+                payload_json,
+            ) {
+                Ok(env) => env,
+                Err(e) => {
+                    error!(
+                        "HTTP sink ID: {} — skipping message at offset {} in JSON array batch (envelope): {}",
+                        self.id, message.offset, e
+                    );
+                    self.errors_count.fetch_add(1, Ordering::Relaxed);
+                    skipped += 1;
+                    continue;
+                }
+            };
+            envelopes.push(envelope);
+        }
+
+        if envelopes.is_empty() {
+            return Err(Error::Serialization(
+                "All messages in JSON array batch failed serialization".to_string(),
+            ));
+        }
+
+        let count = envelopes.len() as u64;
+
+        let body = match serde_json::to_vec(&envelopes) {
+            Ok(b) => b,
+            Err(e) => {
+                error!(
+                    "HTTP sink ID: {} — failed to serialize JSON array batch \
+                     ({} envelopes, {} skipped): {}",
+                    self.id,
+                    envelopes.len(),
+                    skipped,
+                    e,
+                );
+                // Count all successfully-built envelopes as errors (skipped already counted individually)
+                self.errors_count.fetch_add(count, Ordering::Relaxed);
+                return Err(Error::Serialization(format!(
+                    "JSON array serialize ({} envelopes): {}",
+                    envelopes.len(),
+                    e
+                )));
+            }
+        };
+
+        if self.max_payload_size_bytes > 0 && body.len() as u64 > self.max_payload_size_bytes {
+            error!(
+                "HTTP sink ID: {} — JSON array batch exceeds max payload size ({} > {} bytes)",
+                self.id,
+                body.len(),
+                self.max_payload_size_bytes,
+            );
+            // Count all successfully-serialized messages as errors (skipped already counted individually)
+            self.errors_count.fetch_add(count, Ordering::Relaxed);
+            return Err(Error::HttpRequestFailed(format!(
+                "JSON array batch exceeds max size: {} bytes",
+                body.len()
+            )));
+        }
+
+        self.send_batch_body(Bytes::from(body), count, skipped)
+            .await
+    }
+
+    /// Send messages in `raw` mode — one HTTP request per message with raw bytes.
+    async fn send_raw(&self, messages: Vec<ConsumedMessage>) -> Result<(), Error> {
+        self.send_per_message(messages, self.batch_mode.content_type(), |message| {
+            message
+                .payload
+                .try_into_vec()
+                .map_err(|e| Error::Serialization(format!("Raw payload convert: {}", e)))
+        })
+        .await
+    }
+}
+
+/// Parse a human-readable duration string, falling back to a default on failure.
+fn parse_duration(input: Option<&str>, default: &str) -> Duration {
+    let raw = input.unwrap_or(default);
+    HumanDuration::from_str(raw)
+        .map(|d| *d)
+        .unwrap_or_else(|e| {
+            warn!(
+                "Invalid duration '{}': {}, using default '{}'",
+                raw, e, default
+            );
+            *HumanDuration::from_str(default).expect("default duration must be valid")
+        })
+}
+
+/// Custom retry strategy that respects user-configured success_status_codes.
+///
+/// Codes in the success set are never retried (even if normally transient like 429).
+/// Remaining 429/5xx are classified as transient for retry.
+struct HttpSinkRetryStrategy {
+    success_status_codes: HashSet<u16>,
+}
+
+impl RetryableStrategy for HttpSinkRetryStrategy {
+    fn handle(&self, res: &reqwest_middleware::Result<reqwest::Response>) -> Option<Retryable> {
+        match res {
+            Ok(response) => {
+                let status = response.status().as_u16();
+                if self.success_status_codes.contains(&status) {
+                    return None;
+                }
+                if let Some(retry_after) = response.headers().get(reqwest::header::RETRY_AFTER) {
+                    let header_str = retry_after.to_str().unwrap_or("");
+                    warn!(
+                        "Server returned {} with Retry-After: {} — middleware uses computed \
+                         backoff which may be insufficient",
+                        status, header_str,
+                    );
+                }
+                match status {
+                    429 | 500 | 502 | 503 | 504 => Some(Retryable::Transient),
+                    _ => Some(Retryable::Fatal),
+                }
+            }
+            Err(_) => Some(Retryable::Transient),
+        }
+    }
+}
+
+/// Map an `HttpMethod` to a `reqwest_middleware::RequestBuilder` for the given URL.
+fn build_request(
+    method: HttpMethod,
+    client: &ClientWithMiddleware,
+    url: &str,
+) -> reqwest_middleware::RequestBuilder {
+    match method {
+        HttpMethod::Get => client.get(url),
+        HttpMethod::Head => client.head(url),
+        HttpMethod::Post => client.post(url),
+        HttpMethod::Put => client.put(url),
+        HttpMethod::Patch => client.patch(url),
+        HttpMethod::Delete => client.delete(url),
+    }
+}
+
+/// Format a u128 message ID as a 32-character lowercase hex string (no dashes).
+fn format_u128_as_hex(id: u128) -> String {
+    format!("{:032x}", id)
+}
+
+/// Truncate a response body string for log output, respecting UTF-8 char boundaries.
+fn truncate_response(body: &str, max_len: usize) -> &str {
+    if body.len() <= max_len {
+        body
+    } else {
+        // Find the last valid UTF-8 char boundary at or before max_len
+        let end = body.floor_char_boundary(max_len);
+        &body[..end]
+    }
+}
+
+#[async_trait]
+impl Sink for HttpSink {
+    async fn open(&mut self) -> Result<(), Error> {
+        // Validate success_status_codes — empty would cause every response to be treated as failure
+        if self.success_status_codes.is_empty() {
+            return Err(Error::InitError(
+                "success_status_codes must not be empty — would cause retry storms against healthy endpoints".to_string(),
+            ));
+        }
+        for &code in &self.success_status_codes {
+            if !(200..=599).contains(&code) {
+                return Err(Error::InitError(format!(
+                    "Invalid status code {} in success_status_codes — must be 200-599",
+                    code,
+                )));
+            }
+        }
+
+        // Warn if success codes overlap with transient retry codes — these will be treated
+        // as success, silently disabling retry for those status codes.
+        const TRANSIENT_CODES: &[u16] = &[429, 500, 502, 503, 504];
+        let overlap: Vec<u16> = self
+            .success_status_codes
+            .iter()
+            .filter(|c| TRANSIENT_CODES.contains(c))
+            .copied()
+            .collect();
+        if !overlap.is_empty() {
+            warn!(
+                "HTTP sink ID: {} — success_status_codes {:?} overlap with transient retry codes. 
\ + These will be treated as success, disabling retry.", + self.id, overlap + ); + } + + // Validate URL + if self.url.is_empty() { + return Err(Error::InitError( + "HTTP sink URL is empty — 'url' is required in [plugin_config]".to_string(), + )); + } + match reqwest::Url::parse(&self.url) { + Ok(parsed) => { + let scheme = parsed.scheme(); + if scheme != "http" && scheme != "https" { + return Err(Error::InitError(format!( + "HTTP sink URL scheme '{}' is not allowed — only 'http' and 'https' are supported (url: '{}')", + scheme, self.url, + ))); + } + } + Err(e) => { + return Err(Error::InitError(format!( + "HTTP sink URL '{}' is not a valid URL: {}", + self.url, e, + ))); + } + } + + // Warn if user supplied a Content-Type header — it will be overridden by batch_mode. + if self + .headers + .keys() + .any(|k| k.eq_ignore_ascii_case("content-type")) + { + warn!( + "HTTP sink ID: {} — custom 'Content-Type' header in [headers] is ignored. \ + Content-Type is set by batch_mode ({:?} -> '{}'). \ + Remove it from [headers] to silence this warning.", + self.id, + self.batch_mode, + self.batch_mode.content_type(), + ); + } + + // Validate custom headers — fail fast rather than per-request errors + for (key, value) in &self.headers { + reqwest::header::HeaderName::from_bytes(key.as_bytes()) + .map_err(|e| Error::InitError(format!("Invalid header name '{}': {}", key, e)))?; + reqwest::header::HeaderValue::from_str(value).map_err(|e| { + Error::InitError(format!("Invalid header value for '{}': {}", key, e)) + })?; + } + + // Pre-build the HeaderMap once — avoids re-parsing on every request. + // Header names and values were validated above, so expect() is safe here. + let mut header_map = reqwest::header::HeaderMap::new(); + for (key, value) in &self.headers { + if key.eq_ignore_ascii_case("content-type") { + continue; + } + let name = reqwest::header::HeaderName::from_bytes(key.as_bytes()) + .expect("header name validated above"); + let val = reqwest::header::HeaderValue::from_str(value) + .expect("header value validated above"); + header_map.insert(name, val); + } + self.request_headers = Some(header_map); + + // Build the HTTP client with config-derived settings + self.client = Some(self.build_client()?); + + // Optional health check — uses same pre-built headers and success_status_codes as consume() + if self.health_check_enabled { + let client = self.client.as_ref().expect("client just built"); + let headers = self + .request_headers + .as_ref() + .expect("request_headers just built"); + let health_request = + build_request(self.health_check_method, client, &self.url).headers(headers.clone()); + + let response = health_request.send().await.map_err(|e| { + Error::Connection(format!("Health check failed for URL '{}': {}", self.url, e)) + })?; + + let status = response.status(); + if !self.success_status_codes.contains(&status.as_u16()) { + return Err(Error::Connection(format!( + "Health check returned status {} (not in success_status_codes {:?}) for URL '{}'", + status.as_u16(), + self.success_status_codes, + self.url, + ))); + } + + info!( + "HTTP sink ID: {} — health check passed (status {})", + self.id, + status.as_u16() + ); + } + + info!( + "Opened HTTP sink connector with ID: {} for URL: {} (method: {:?}, \ + batch_mode: {:?}, timeout: {:?}, max_retries: {})", + self.id, self.url, self.method, self.batch_mode, self.timeout, self.max_retries, + ); + Ok(()) + } + + /// Deliver messages to the configured HTTP endpoint. 
+    ///
+    /// **Worst-case latency upper bound** (individual/raw modes):
+    /// `batch_length * (max_retries + 1) * (timeout + max_retry_delay)`.
+    /// Example: 50 * 4 * (30s + 30s) = 12000s. `MAX_CONSECUTIVE_FAILURES` (3)
+    /// mitigates this by aborting early, but a fail-succeed-fail pattern can bypass it.
+    ///
+    /// **Runtime note**: The FFI boundary in `sdk/src/sink.rs` maps `consume()`'s `Result` to
+    /// `i32` (0=ok, 1=err), but the runtime's `process_messages()` in `runtime/src/sink.rs`
+    /// discards that return code. All retry logic lives inside this method — returning `Err`
+    /// does not trigger a runtime-level retry.
+    async fn consume(
+        &self,
+        topic_metadata: &TopicMetadata,
+        messages_metadata: MessagesMetadata,
+        messages: Vec<ConsumedMessage>,
+    ) -> Result<(), Error> {
+        let messages_count = messages.len();
+        if messages_count == 0 {
+            return Ok(());
+        }
+
+        if self.verbose {
+            debug!(
+                "HTTP sink ID: {} — received {} messages (schema: {}, stream: {}, topic: {})",
+                self.id,
+                messages_count,
+                messages_metadata.schema,
+                topic_metadata.stream,
+                topic_metadata.topic,
+            );
+        }
+
+        let result = match self.batch_mode {
+            BatchMode::Individual => {
+                self.send_individual(topic_metadata, &messages_metadata, messages)
+                    .await
+            }
+            BatchMode::NdJson => {
+                self.send_ndjson(topic_metadata, &messages_metadata, messages)
+                    .await
+            }
+            BatchMode::JsonArray => {
+                self.send_json_array(topic_metadata, &messages_metadata, messages)
+                    .await
+            }
+            BatchMode::Raw => self.send_raw(messages).await,
+        };
+
+        if let Err(ref e) = result {
+            error!(
+                "HTTP sink ID: {} — consume() returning error (runtime ignores FFI status code): {}",
+                self.id, e
+            );
+        }
+
+        result
+    }
+
+    async fn close(&mut self) -> Result<(), Error> {
+        let requests = self.send_attempts.load(Ordering::Relaxed);
+        let delivered = self.messages_delivered.load(Ordering::Relaxed);
+        let errors = self.errors_count.load(Ordering::Relaxed);
+        let last_success = self.last_success_timestamp.load(Ordering::Relaxed);
+
+        info!(
+            "HTTP sink connector ID: {} closed. \
Stats: {} send attempts, \ + {} messages delivered, {} errors, last success epoch: {}.", + self.id, requests, delivered, errors, last_success, + ); + + self.request_headers = None; + self.client = None; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use iggy_connector_sdk::Schema; + + const FIELD_DATA: &str = "data"; + const FIELD_PAYLOAD_ENCODING: &str = "iggy_payload_encoding"; + const FIELD_METADATA: &str = "metadata"; + const FIELD_PAYLOAD: &str = "payload"; + const FIELD_ID: &str = "iggy_id"; + const FIELD_OFFSET: &str = "iggy_offset"; + const FIELD_TIMESTAMP: &str = "iggy_timestamp"; + const FIELD_STREAM: &str = "iggy_stream"; + const FIELD_TOPIC: &str = "iggy_topic"; + const FIELD_PARTITION_ID: &str = "iggy_partition_id"; + const FIELD_CHECKSUM: &str = "iggy_checksum"; + const FIELD_ORIGIN_TIMESTAMP: &str = "iggy_origin_timestamp"; + const FIELD_HEADERS: &str = "iggy_headers"; + + #[test] + fn given_all_none_config_should_apply_defaults() { + let sink = given_sink_with_defaults(); + + assert_eq!(sink.method, HttpMethod::Post); + assert_eq!(sink.timeout, Duration::from_secs(30)); + assert_eq!(sink.max_payload_size_bytes, DEFAULT_MAX_PAYLOAD_SIZE); + assert_eq!(sink.batch_mode, BatchMode::Individual); + assert!(sink.include_metadata); + assert!(!sink.include_checksum); + assert!(!sink.include_origin_timestamp); + assert!(!sink.health_check_enabled); + assert_eq!(sink.health_check_method, HttpMethod::Head); + assert_eq!(sink.max_retries, DEFAULT_MAX_RETRIES); + assert_eq!(sink.retry_delay, Duration::from_secs(1)); + assert_eq!(sink.retry_backoff_multiplier, DEFAULT_BACKOFF_MULTIPLIER); + assert_eq!(sink.max_retry_delay, Duration::from_secs(30)); + assert_eq!( + sink.success_status_codes, + HashSet::from([200, 201, 202, 204]) + ); + assert!(!sink.tls_danger_accept_invalid_certs); + assert_eq!(sink.max_connections, DEFAULT_MAX_CONNECTIONS); + assert!(!sink.verbose); + assert!(sink.client.is_none()); + } + + #[test] + fn given_explicit_config_values_should_override_defaults() { + let config = HttpSinkConfig { + url: "https://example.com".to_string(), + method: Some(HttpMethod::Put), + timeout: Some("10s".to_string()), + max_payload_size_bytes: Some(5000), + headers: Some(HashMap::from([("X-Key".to_string(), "val".to_string())])), + batch_mode: Some(BatchMode::NdJson), + include_metadata: Some(false), + include_checksum: Some(true), + include_origin_timestamp: Some(true), + health_check_enabled: Some(true), + health_check_method: Some(HttpMethod::Get), + max_retries: Some(5), + retry_delay: Some("500ms".to_string()), + retry_backoff_multiplier: Some(3), + max_retry_delay: Some("60s".to_string()), + success_status_codes: Some(vec![200, 202]), + tls_danger_accept_invalid_certs: Some(true), + max_connections: Some(20), + verbose_logging: Some(true), + }; + + let sink = HttpSink::new(1, config); + assert_eq!(sink.method, HttpMethod::Put); + assert_eq!(sink.timeout, Duration::from_secs(10)); + assert_eq!(sink.max_payload_size_bytes, 5000); + assert_eq!(sink.headers.len(), 1); + assert_eq!(sink.batch_mode, BatchMode::NdJson); + assert!(!sink.include_metadata); + assert!(sink.include_checksum); + assert!(sink.include_origin_timestamp); + assert!(sink.health_check_enabled); + assert_eq!(sink.health_check_method, HttpMethod::Get); + assert_eq!(sink.max_retries, 5); + assert_eq!(sink.retry_delay, Duration::from_millis(500)); + assert_eq!(sink.retry_backoff_multiplier, 3); + assert_eq!(sink.max_retry_delay, Duration::from_secs(60)); + assert_eq!(sink.success_status_codes, 
HashSet::from([200, 202]));
+        assert!(sink.tls_danger_accept_invalid_certs);
+        assert_eq!(sink.max_connections, 20);
+        assert!(sink.verbose);
+    }
+
+    #[test]
+    fn given_backoff_multiplier_below_one_should_clamp_to_one() {
+        let mut config = given_default_config();
+        config.retry_backoff_multiplier = Some(0);
+        let sink = HttpSink::new(1, config);
+        assert_eq!(sink.retry_backoff_multiplier, 1);
+    }
+
+    #[test]
+    fn given_invalid_duration_string_should_fall_back_to_default() {
+        let mut config = given_default_config();
+        config.timeout = Some("not_a_duration".to_string());
+        config.retry_delay = Some("xyz".to_string());
+        let sink = HttpSink::new(1, config);
+        assert_eq!(sink.timeout, Duration::from_secs(30));
+        assert_eq!(sink.retry_delay, Duration::from_secs(1));
+    }
+
+    #[test]
+    fn given_valid_duration_strings_should_parse_correctly() {
+        let cases = [
+            ("30s", Duration::from_secs(30)),
+            ("500ms", Duration::from_millis(500)),
+            ("2m", Duration::from_secs(120)),
+            ("1h", Duration::from_secs(3600)),
+        ];
+
+        for (input, expected) in cases {
+            assert_eq!(
+                parse_duration(Some(input), "1s"),
+                expected,
+                "input: {}",
+                input
+            );
+        }
+    }
+
+    #[test]
+    fn given_none_duration_should_use_default() {
+        assert_eq!(parse_duration(None, "5s"), Duration::from_secs(5));
+    }
+
+    #[test]
+    fn given_http_method_should_serialize_as_uppercase() {
+        let cases = [
+            (HttpMethod::Get, "\"GET\""),
+            (HttpMethod::Head, "\"HEAD\""),
+            (HttpMethod::Post, "\"POST\""),
+            (HttpMethod::Put, "\"PUT\""),
+            (HttpMethod::Patch, "\"PATCH\""),
+            (HttpMethod::Delete, "\"DELETE\""),
+        ];
+
+        for (method, expected_json) in cases {
+            let json = serde_json::to_string(&method).unwrap();
+            assert_eq!(json, expected_json);
+        }
+    }
+
+    #[test]
+    fn given_uppercase_json_should_deserialize_to_method() {
+        let cases = [
+            ("\"GET\"", HttpMethod::Get),
+            ("\"POST\"", HttpMethod::Post),
+            ("\"DELETE\"", HttpMethod::Delete),
+        ];
+
+        for (json, expected) in cases {
+            let method: HttpMethod = serde_json::from_str(json).unwrap();
+            assert_eq!(method, expected);
+        }
+    }
+
+    #[test]
+    fn given_invalid_method_string_should_fail_deserialization() {
+        let result: Result<HttpMethod, _> = serde_json::from_str("\"DELEET\"");
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn given_batch_mode_should_serialize_as_snake_case() {
+        let cases = [
+            (BatchMode::Individual, "\"individual\""),
+            (BatchMode::NdJson, "\"nd_json\""),
+            (BatchMode::JsonArray, "\"json_array\""),
+            (BatchMode::Raw, "\"raw\""),
+        ];
+
+        for (mode, expected_json) in cases {
+            let json = serde_json::to_string(&mode).unwrap();
+            assert_eq!(json, expected_json);
+        }
+    }
+
+    #[test]
+    fn given_batch_mode_display_should_return_human_readable_name() {
+        assert_eq!(BatchMode::Individual.to_string(), "individual");
+        assert_eq!(BatchMode::NdJson.to_string(), "NDJSON");
+        assert_eq!(BatchMode::JsonArray.to_string(), "JSON array");
+        assert_eq!(BatchMode::Raw.to_string(), "raw");
+    }
+
+    #[test]
+    fn given_batch_mode_should_return_correct_content_type() {
+        let cases = [
+            (BatchMode::Individual, "application/json"),
+            (BatchMode::NdJson, "application/x-ndjson"),
+            (BatchMode::JsonArray, "application/json"),
+            (BatchMode::Raw, "application/octet-stream"),
+        ];
+
+        for (mode, expected) in cases {
+            assert_eq!(mode.content_type(), expected);
+        }
+    }
+
+    #[test]
+    fn given_zero_id_should_format_as_32_char_hex() {
+        let result = format_u128_as_hex(0);
+        assert_eq!(result.len(), 32);
+        assert_eq!(result, "00000000000000000000000000000000");
+    }
+
+    #[test]
+    fn given_max_u128_should_format_as_32_char_hex()
{ + let result = format_u128_as_hex(u128::MAX); + assert_eq!(result.len(), 32); + assert_eq!(result, "ffffffffffffffffffffffffffffffff"); + } + + #[test] + fn given_specific_id_should_produce_correct_hex() { + let id: u128 = 0x0123456789abcdef0123456789abcdef; + let result = format_u128_as_hex(id); + assert_eq!(result.len(), 32); + assert_eq!(result, "0123456789abcdef0123456789abcdef"); + } + + #[test] + fn given_short_string_should_return_unchanged() { + assert_eq!(truncate_response("hello", 10), "hello"); + } + + #[test] + fn given_long_string_should_truncate_at_boundary() { + let result = truncate_response("hello world", 5); + assert_eq!(result, "hello"); + } + + #[test] + fn given_multibyte_string_should_truncate_at_char_boundary() { + // "héllo" — 'é' is 2 bytes in UTF-8, so bytes are: h(1) é(2) l(1) l(1) o(1) + // floor_char_boundary(2) can't include the 2-byte 'é', returns 1 → "h" + let result = truncate_response("héllo", 2); + assert_eq!(result, "h"); + } + + #[test] + fn given_json_payload_should_convert_to_serde_json() { + let sink = given_sink_with_defaults(); + let payload = Payload::Json(simd_json_from_str(r#"{"name":"test","count":42}"#)); + + let result = sink.payload_to_json(payload).unwrap(); + assert_eq!(result["name"], "test"); + assert_eq!(result["count"], 42); + } + + #[test] + fn given_text_payload_should_convert_to_string_value() { + let sink = given_sink_with_defaults(); + let result = sink + .payload_to_json(Payload::Text("hello".to_string())) + .unwrap(); + assert_eq!(result, serde_json::Value::String("hello".to_string())); + } + + #[test] + fn given_raw_payload_should_base64_encode() { + let sink = given_sink_with_defaults(); + let result = sink.payload_to_json(Payload::Raw(vec![1, 2, 3])).unwrap(); + assert_eq!(result[FIELD_PAYLOAD_ENCODING], "base64"); + assert_eq!( + result[FIELD_DATA], + general_purpose::STANDARD.encode([1, 2, 3]) + ); + } + + #[test] + fn given_flatbuffer_payload_should_base64_encode() { + let sink = given_sink_with_defaults(); + let result = sink + .payload_to_json(Payload::FlatBuffer(vec![4, 5, 6])) + .unwrap(); + assert_eq!(result[FIELD_PAYLOAD_ENCODING], "base64"); + assert_eq!( + result[FIELD_DATA], + general_purpose::STANDARD.encode([4, 5, 6]) + ); + } + + #[test] + fn given_proto_payload_should_base64_encode_string_bytes() { + let sink = given_sink_with_defaults(); + let result = sink + .payload_to_json(Payload::Proto("proto_data".to_string())) + .unwrap(); + assert_eq!(result[FIELD_PAYLOAD_ENCODING], "base64"); + assert_eq!( + result[FIELD_DATA], + general_purpose::STANDARD.encode(b"proto_data") + ); + } + + #[test] + fn given_include_metadata_true_should_wrap_payload() { + let sink = given_sink_with_defaults(); + let message = given_json_message(42, 10); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json) + .unwrap(); + + assert!(envelope.get(FIELD_METADATA).is_some()); + assert!(envelope.get(FIELD_PAYLOAD).is_some()); + + let metadata = &envelope[FIELD_METADATA]; + assert_eq!(metadata[FIELD_OFFSET], 10); + assert_eq!(metadata[FIELD_TIMESTAMP], 1710064800000000u64); + assert_eq!(metadata[FIELD_STREAM], "test_stream"); + assert_eq!(metadata[FIELD_TOPIC], "test_topic"); + assert_eq!(metadata[FIELD_PARTITION_ID], 0); + assert_eq!(metadata[FIELD_ID], format_u128_as_hex(42)); + // Verify conditional fields are absent by default + 
assert!(metadata.get(FIELD_CHECKSUM).is_none()); + assert!(metadata.get(FIELD_ORIGIN_TIMESTAMP).is_none()); + } + + #[test] + fn given_include_metadata_false_should_return_raw_payload() { + let mut config = given_default_config(); + config.include_metadata = Some(false); + let sink = HttpSink::new(1, config); + + let message = given_json_message(1, 0); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json.clone()) + .unwrap(); + + // Should be the payload itself, not wrapped + assert_eq!(envelope, payload_json); + assert!(envelope.get(FIELD_METADATA).is_none()); + } + + #[test] + fn given_include_checksum_should_add_checksum_to_metadata() { + let mut config = given_default_config(); + config.include_checksum = Some(true); + let sink = HttpSink::new(1, config); + + let message = given_json_message(1, 0); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json) + .unwrap(); + assert_eq!(envelope[FIELD_METADATA][FIELD_CHECKSUM], 12345); + } + + #[test] + fn given_include_origin_timestamp_should_add_to_metadata() { + let mut config = given_default_config(); + config.include_origin_timestamp = Some(true); + let sink = HttpSink::new(1, config); + + let message = given_json_message(1, 0); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json) + .unwrap(); + assert_eq!( + envelope[FIELD_METADATA][FIELD_ORIGIN_TIMESTAMP], + 1710064799000000u64 + ); + } + + #[test] + fn given_message_with_headers_should_include_iggy_headers_in_metadata() { + use iggy_connector_sdk::ConsumedMessage; + + let sink = given_sink_with_defaults(); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + + let mut headers = HashMap::new(); + headers.insert( + "x-correlation-id".parse().unwrap(), + "abc-123".parse().unwrap(), + ); + + let message = ConsumedMessage { + id: 1, + offset: 0, + checksum: 0, + timestamp: 1710064800000000, + origin_timestamp: 0, + headers: Some(headers), + payload: Payload::Json(simd_json_from_str(r#"{"key":"value"}"#)), + }; + + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json) + .unwrap(); + + let iggy_headers = &envelope[FIELD_METADATA][FIELD_HEADERS]; + assert!( + !iggy_headers.is_null(), + "Expected iggy_headers in metadata when message has headers" + ); + assert!( + iggy_headers.get("x-correlation-id").is_some(), + "Expected header key in iggy_headers, got: {iggy_headers}" + ); + } + + #[test] + fn given_message_without_headers_should_not_include_iggy_headers() { + let sink = given_sink_with_defaults(); + let message = given_json_message(1, 0); + let topic_meta = given_topic_metadata(); + let msg_meta = given_messages_metadata(); + let payload_json = sink.payload_to_json(message.payload.clone()).unwrap(); + + let envelope = sink + .build_envelope(&message, &topic_meta, &msg_meta, payload_json) + .unwrap(); + assert!( + 
envelope[FIELD_METADATA].get(FIELD_HEADERS).is_none(), + "Expected no iggy_headers when message has no headers" + ); + } + + #[test] + fn given_null_value_should_convert_to_null() { + let v = simd_json::OwnedValue::Static(simd_json::StaticNode::Null); + assert_eq!(owned_value_to_serde_json(&v), serde_json::Value::Null); + } + + #[test] + fn given_bool_value_should_convert_correctly() { + let v = simd_json::OwnedValue::Static(simd_json::StaticNode::Bool(true)); + assert_eq!(owned_value_to_serde_json(&v), serde_json::Value::Bool(true)); + } + + #[test] + fn given_integer_values_should_convert_correctly() { + let i64_val = simd_json::OwnedValue::Static(simd_json::StaticNode::I64(-42)); + assert_eq!(owned_value_to_serde_json(&i64_val), serde_json::json!(-42)); + + let u64_val = simd_json::OwnedValue::Static(simd_json::StaticNode::U64(42)); + assert_eq!(owned_value_to_serde_json(&u64_val), serde_json::json!(42)); + } + + #[test] + fn given_f64_value_should_convert_correctly() { + let v = simd_json::OwnedValue::Static(simd_json::StaticNode::F64(3.54)); + let result = owned_value_to_serde_json(&v); + assert_eq!(result.as_f64().unwrap(), 3.54); + } + + #[test] + fn given_nan_f64_should_convert_to_null() { + let v = simd_json::OwnedValue::Static(simd_json::StaticNode::F64(f64::NAN)); + assert_eq!(owned_value_to_serde_json(&v), serde_json::Value::Null); + } + + #[test] + fn given_infinity_f64_should_convert_to_null() { + let v = simd_json::OwnedValue::Static(simd_json::StaticNode::F64(f64::INFINITY)); + assert_eq!(owned_value_to_serde_json(&v), serde_json::Value::Null); + } + + #[test] + fn given_nested_object_should_convert_recursively() { + let v = simd_json_from_str(r#"{"nested":{"key":"val"},"arr":[1,2]}"#); + + let result = owned_value_to_serde_json(&v); + assert_eq!(result["nested"]["key"], "val"); + assert_eq!(result["arr"][0], 1); + assert_eq!(result["arr"][1], 2); + } + + #[test] + fn given_minimal_toml_config_should_deserialize() { + let toml_str = r#"url = "https://example.com""#; + let config: HttpSinkConfig = toml::from_str(toml_str).unwrap(); + assert_eq!(config.url, "https://example.com"); + assert!(config.method.is_none()); + assert!(config.headers.is_none()); + assert!(config.batch_mode.is_none()); + } + + #[test] + fn given_full_toml_config_should_deserialize_all_fields() { + let toml_str = r#" + url = "https://example.com/api" + method = "PUT" + timeout = "10s" + max_payload_size_bytes = 5000 + batch_mode = "nd_json" + include_metadata = false + include_checksum = true + include_origin_timestamp = true + health_check_enabled = true + health_check_method = "GET" + max_retries = 5 + retry_delay = "2s" + retry_backoff_multiplier = 3 + max_retry_delay = "60s" + success_status_codes = [200, 201] + tls_danger_accept_invalid_certs = true + max_connections = 20 + verbose_logging = true + + [headers] + Authorization = "Bearer token" + X-Custom = "value" + "#; + + let config: HttpSinkConfig = toml::from_str(toml_str).unwrap(); + assert_eq!(config.url, "https://example.com/api"); + assert_eq!(config.method, Some(HttpMethod::Put)); + assert_eq!(config.batch_mode, Some(BatchMode::NdJson)); + assert_eq!(config.max_retries, Some(5)); + assert_eq!(config.success_status_codes, Some(vec![200, 201])); + let headers = config.headers.unwrap(); + assert_eq!(headers["Authorization"], "Bearer token"); + assert_eq!(headers["X-Custom"], "value"); + } + + #[test] + fn given_invalid_method_in_toml_should_fail() { + let toml_str = r#" + url = "https://example.com" + method = "DELEET" + "#; + let result: 
Result<HttpSinkConfig, _> = toml::from_str(toml_str);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn given_invalid_batch_mode_in_toml_should_fail() {
+        let toml_str = r#"
+            url = "https://example.com"
+            batch_mode = "xml"
+        "#;
+        let result: Result<HttpSinkConfig, _> = toml::from_str(toml_str);
+        assert!(result.is_err());
+    }
+
+    #[tokio::test]
+    async fn given_empty_url_should_fail_open() {
+        let mut config = given_default_config();
+        config.url = String::new();
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(
+            err.contains("empty"),
+            "Error should mention empty URL: {}",
+            err
+        );
+    }
+
+    #[tokio::test]
+    async fn given_invalid_url_should_fail_open() {
+        let mut config = given_default_config();
+        config.url = "not a url".to_string();
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(
+            err.contains("not a valid URL"),
+            "Error should mention invalid URL: {}",
+            err
+        );
+    }
+
+    #[tokio::test]
+    async fn given_empty_success_status_codes_should_fail_open() {
+        let mut config = given_default_config();
+        config.success_status_codes = Some(vec![]);
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(
+            err.contains("success_status_codes"),
+            "Error should mention success_status_codes: {}",
+            err
+        );
+    }
+
+    #[tokio::test]
+    async fn given_valid_config_should_build_client_in_open() {
+        let mut sink = given_sink_with_defaults();
+        // Disable health check so open() doesn't try to connect
+        sink.health_check_enabled = false;
+        let result = sink.open().await;
+        assert!(result.is_ok());
+        assert!(sink.client.is_some());
+    }
+
+    #[test]
+    fn given_raw_mode_with_include_metadata_should_still_use_raw_content_type() {
+        let mut config = given_default_config();
+        config.batch_mode = Some(BatchMode::Raw);
+        config.include_metadata = Some(true);
+        let sink = HttpSink::new(1, config);
+        // Raw mode uses octet-stream regardless of include_metadata
+        assert_eq!(sink.batch_mode.content_type(), "application/octet-stream");
+        assert_eq!(sink.batch_mode, BatchMode::Raw);
+        // include_metadata is set but irrelevant in raw mode (warned at construction)
+        assert!(sink.include_metadata);
+    }
+
+    #[tokio::test]
+    async fn given_file_scheme_url_should_fail_open() {
+        let mut config = given_default_config();
+        config.url = "file:///etc/passwd".to_string();
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(
+            err.contains("not allowed"),
+            "Expected scheme rejection: {}",
+            err
+        );
+    }
+
+    #[tokio::test]
+    async fn given_ftp_scheme_url_should_fail_open() {
+        let mut config = given_default_config();
+        config.url = "ftp://fileserver.local/data".to_string();
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(
+            err.contains("not allowed"),
+            "Expected scheme rejection: {}",
+            err
+        );
+    }
+
+    #[tokio::test]
+    async fn given_http_scheme_url_should_pass_open() {
+        let mut config = given_default_config();
+        config.url = "http://localhost:8080/ingest".to_string();
+        let mut sink = HttpSink::new(1, config);
+        sink.health_check_enabled = false;
+        let result = sink.open().await;
+        assert!(result.is_ok());
+    }
+
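+    // Sketch of one more scheme probe, assuming open() rejects every
+    // non-http(s) scheme (mirrors the file/ftp cases above).
+    #[tokio::test]
+    async fn given_mailto_scheme_url_should_fail_open() {
+        let mut config = given_default_config();
+        config.url = "mailto:ops@example.com".to_string();
+        let mut sink = HttpSink::new(1, config);
+        let result = sink.open().await;
+        assert!(result.is_err());
+    }
+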
#[tokio::test] + async fn given_https_scheme_url_should_pass_open() { + let mut sink = given_sink_with_defaults(); // default URL is https + sink.health_check_enabled = false; + let result = sink.open().await; + assert!(result.is_ok()); + } + + #[tokio::test] + async fn given_invalid_header_name_should_fail_open() { + let mut config = given_default_config(); + config.headers = Some(HashMap::from([( + "Invalid Header\r\n".to_string(), + "value".to_string(), + )])); + let mut sink = HttpSink::new(1, config); + let result = sink.open().await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("Invalid header name"), + "Expected header name error: {}", + err + ); + } + + #[tokio::test] + async fn given_invalid_header_value_should_fail_open() { + let mut config = given_default_config(); + config.headers = Some(HashMap::from([( + "X-Good-Name".to_string(), + "bad\r\nvalue".to_string(), + )])); + let mut sink = HttpSink::new(1, config); + let result = sink.open().await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("Invalid header value"), + "Expected header value error: {}", + err + ); + } + + #[tokio::test] + async fn given_valid_headers_should_pass_open() { + let mut config = given_default_config(); + config.headers = Some(HashMap::from([ + ("Authorization".to_string(), "Bearer token123".to_string()), + ("X-Custom-ID".to_string(), "abc-def".to_string()), + ])); + let mut sink = HttpSink::new(1, config); + sink.health_check_enabled = false; + let result = sink.open().await; + assert!(result.is_ok()); + } + + #[test] + fn given_user_content_type_header_should_be_filtered_in_open() { + // Note: This test validates the Content-Type filter used when building + // request_headers in open(). We verify the predicate matches what open() uses. 
+ let mut config = given_default_config(); + config.headers = Some(HashMap::from([ + ("Content-Type".to_string(), "text/plain".to_string()), + ("content-type".to_string(), "text/xml".to_string()), + ("X-Custom".to_string(), "keep-me".to_string()), + ])); + let sink = HttpSink::new(1, config); + // Count how many headers survive the Content-Type filter + let surviving: Vec<&String> = sink + .headers + .keys() + .filter(|k| !k.eq_ignore_ascii_case("content-type")) + .collect(); + assert_eq!( + surviving.len(), + 1, + "Only non-Content-Type headers should survive, got: {:?}", + surviving + ); + assert!( + surviving.iter().any(|k| *k == "X-Custom"), + "X-Custom should survive the filter, got: {:?}", + surviving + ); + } + + #[tokio::test] + async fn given_invalid_status_code_should_fail_open() { + let mut config = given_default_config(); + config.success_status_codes = Some(vec![200, 999]); + let mut sink = HttpSink::new(1, config); + sink.health_check_enabled = false; + let result = sink.open().await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("999"), + "Expected invalid code in error: {}", + err + ); + } + + #[tokio::test] + async fn given_zero_status_code_should_fail_open() { + let mut config = given_default_config(); + config.success_status_codes = Some(vec![0]); + let mut sink = HttpSink::new(1, config); + sink.health_check_enabled = false; + let result = sink.open().await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn given_informational_status_code_should_fail_open() { + let mut config = given_default_config(); + config.success_status_codes = Some(vec![100]); + let mut sink = HttpSink::new(1, config); + sink.health_check_enabled = false; + let result = sink.open().await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn given_consume_called_before_open_should_return_init_error() { + let sink = given_sink_with_defaults(); + let topic_metadata = given_topic_metadata(); + let messages_metadata = given_messages_metadata(); + let messages = vec![given_json_message(1, 0)]; + let result = sink + .consume(&topic_metadata, messages_metadata, messages) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("not initialized") || err.contains("open()"), + "Expected init error: {}", + err + ); + } + + fn simd_json_from_str(s: &str) -> simd_json::OwnedValue { + let mut bytes = s.as_bytes().to_vec(); + simd_json::to_owned_value(&mut bytes).expect("valid JSON for test") + } + + fn given_default_config() -> HttpSinkConfig { + HttpSinkConfig { + url: "https://api.example.com/ingest".to_string(), + method: None, + timeout: None, + max_payload_size_bytes: None, + headers: None, + batch_mode: None, + include_metadata: None, + include_checksum: None, + include_origin_timestamp: None, + health_check_enabled: None, + health_check_method: None, + max_retries: None, + retry_delay: None, + retry_backoff_multiplier: None, + max_retry_delay: None, + success_status_codes: None, + tls_danger_accept_invalid_certs: None, + max_connections: None, + verbose_logging: None, + } + } + + fn given_sink_with_defaults() -> HttpSink { + HttpSink::new(1, given_default_config()) + } + + fn given_topic_metadata() -> TopicMetadata { + TopicMetadata { + stream: "test_stream".to_string(), + topic: "test_topic".to_string(), + } + } + + fn given_messages_metadata() -> MessagesMetadata { + MessagesMetadata { + partition_id: 0, + current_offset: 0, + schema: Schema::Json, + } + } + + fn 
given_json_message(id: u128, offset: u64) -> ConsumedMessage { + ConsumedMessage { + id, + offset, + checksum: 12345, + timestamp: 1710064800000000, + origin_timestamp: 1710064799000000, + headers: None, + payload: Payload::Json(simd_json_from_str(r#"{"key":"value"}"#)), + } + } +} diff --git a/core/integration/src/harness/seeds.rs b/core/integration/src/harness/seeds.rs index dd91d7fec0..aa5554c704 100644 --- a/core/integration/src/harness/seeds.rs +++ b/core/integration/src/harness/seeds.rs @@ -36,6 +36,7 @@ pub mod names { pub const STREAM: &str = "test_stream"; pub const TOPIC: &str = "test_topic"; + pub const TOPIC_2: &str = "test_topic_2"; pub const MESSAGE_PAYLOAD: &str = "test_message"; pub const CONSUMER_GROUP: &str = "test_consumer_group"; pub const CONSUMER: &str = "mcp"; @@ -87,6 +88,41 @@ pub async fn connector_stream(client: &IggyClient) -> Result<(), SeedError> { Ok(()) } +/// Seed for multi-topic connector tests: creates stream with two topics. +/// Both topics must exist before connector runtime starts (runtime health check +/// validates all configured topics). +pub async fn connector_multi_topic_stream(client: &IggyClient) -> Result<(), SeedError> { + let stream_id: Identifier = names::STREAM.try_into()?; + + client.create_stream(names::STREAM).await?; + + client + .create_topic( + &stream_id, + names::TOPIC, + 1, + CompressionAlgorithm::None, + None, + IggyExpiry::ServerDefault, + MaxTopicSize::ServerDefault, + ) + .await?; + + client + .create_topic( + &stream_id, + names::TOPIC_2, + 1, + CompressionAlgorithm::None, + None, + IggyExpiry::ServerDefault, + MaxTopicSize::ServerDefault, + ) + .await?; + + Ok(()) +} + /// Standard MCP test data: stream, topic, message, consumer group, consumer offset, user, PAT. pub async fn mcp_standard(client: &IggyClient) -> Result<(), SeedError> { let stream_id: Identifier = names::STREAM.try_into()?; diff --git a/core/integration/tests/connectors/fixtures/http/container.rs b/core/integration/tests/connectors/fixtures/http/container.rs new file mode 100644 index 0000000000..1f52048716 --- /dev/null +++ b/core/integration/tests/connectors/fixtures/http/container.rs @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+use integration::harness::TestBinaryError;
+use std::time::Duration;
+use testcontainers_modules::testcontainers::core::WaitFor::Healthcheck;
+use testcontainers_modules::testcontainers::core::wait::HealthWaitStrategy;
+use testcontainers_modules::testcontainers::core::{IntoContainerPort, Mount};
+use testcontainers_modules::testcontainers::runners::AsyncRunner;
+use testcontainers_modules::testcontainers::{ContainerAsync, GenericImage, ImageExt};
+use tokio::time::sleep;
+use tracing::info;
+
+const WIREMOCK_IMAGE: &str = "wiremock/wiremock";
+const WIREMOCK_TAG: &str = "3.13.2";
+const WIREMOCK_PORT: u16 = 8080;
+
+pub(super) const DEFAULT_TEST_STREAM: &str = "test_stream";
+pub(super) const DEFAULT_TEST_TOPIC: &str = "test_topic";
+pub(super) const DEFAULT_TEST_TOPIC_2: &str = "test_topic_2";
+
+pub(super) const DEFAULT_POLL_ATTEMPTS: usize = 100;
+pub(super) const DEFAULT_POLL_INTERVAL_MS: u64 = 100;
+
+// HTTP sink env vars follow the convention: IGGY_CONNECTORS_SINK_HTTP_<SECTION>_<FIELD>.
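+// e.g. `batch_mode` under `[plugin_config]` in config.toml becomes
+// IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_BATCH_MODE.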
+pub(super) const ENV_SINK_PATH: &str = "IGGY_CONNECTORS_SINK_HTTP_PATH";
+pub(super) const ENV_SINK_STREAMS_0_STREAM: &str = "IGGY_CONNECTORS_SINK_HTTP_STREAMS_0_STREAM";
+pub(super) const ENV_SINK_STREAMS_0_TOPICS: &str = "IGGY_CONNECTORS_SINK_HTTP_STREAMS_0_TOPICS";
+pub(super) const ENV_SINK_STREAMS_0_SCHEMA: &str = "IGGY_CONNECTORS_SINK_HTTP_STREAMS_0_SCHEMA";
+pub(super) const ENV_SINK_STREAMS_0_CONSUMER_GROUP: &str =
+    "IGGY_CONNECTORS_SINK_HTTP_STREAMS_0_CONSUMER_GROUP";
+
+// plugin_config fields
+pub(super) const ENV_SINK_URL: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_URL";
+pub(super) const ENV_SINK_BATCH_MODE: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_BATCH_MODE";
+pub(super) const ENV_SINK_INCLUDE_METADATA: &str =
+    "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_INCLUDE_METADATA";
+pub(super) const ENV_SINK_METHOD: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_METHOD";
+pub(super) const ENV_SINK_TIMEOUT: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_TIMEOUT";
+pub(super) const ENV_SINK_MAX_RETRIES: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_MAX_RETRIES";
+pub(super) const ENV_SINK_RETRY_DELAY: &str = "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_RETRY_DELAY";
+pub(super) const ENV_SINK_VERBOSE_LOGGING: &str =
+    "IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_VERBOSE_LOGGING";
+
+/// WireMock container for HTTP sink integration tests.
+///
+/// Provides a real HTTP endpoint that accepts requests and exposes an admin API
+/// for verifying received requests at `/__admin/requests`.
+pub struct HttpSinkWireMockContainer {
+    #[allow(dead_code)]
+    container: ContainerAsync<GenericImage>,
+    /// Base URL of the WireMock container (e.g., `http://localhost:32768`).
+    pub(super) base_url: String,
+}
+
+impl HttpSinkWireMockContainer {
+    pub(super) async fn start() -> Result<Self, TestBinaryError> {
+        let current_dir = std::env::current_dir().map_err(|e| TestBinaryError::FixtureSetup {
+            fixture_type: "HttpSinkWireMockContainer".to_string(),
+            message: format!("Failed to get current dir: {e}"),
+        })?;
+
+        let container = GenericImage::new(WIREMOCK_IMAGE, WIREMOCK_TAG)
+            .with_exposed_port(WIREMOCK_PORT.tcp())
+            .with_wait_for(Healthcheck(HealthWaitStrategy::default()))
+            .with_mount(Mount::bind_mount(
+                current_dir
+                    .join("tests/connectors/http/wiremock/mappings")
+                    .to_string_lossy()
+                    .to_string(),
+                "/home/wiremock/mappings",
+            ))
+            .start()
+            .await
+            .map_err(|e| TestBinaryError::FixtureSetup {
+                fixture_type: "HttpSinkWireMockContainer".to_string(),
+                message: format!("Failed to start container: {e}"),
+            })?;
+
+        let host = container
+            .get_host()
+            .await
+            .map_err(|e| TestBinaryError::FixtureSetup {
+                fixture_type: "HttpSinkWireMockContainer".to_string(),
+                message: format!("Failed to get host: {e}"),
+            })?;
+
+        let host_port = container
+            .get_host_port_ipv4(WIREMOCK_PORT)
+            .await
+            .map_err(|e| TestBinaryError::FixtureSetup {
+                fixture_type: "HttpSinkWireMockContainer".to_string(),
+                message: format!("Failed to get port: {e}"),
+            })?;
+
+        let base_url = format!("http://{host}:{host_port}");
+        info!("HTTP sink WireMock container available at {base_url}");
+
+        Ok(Self {
+            container,
+            base_url,
+        })
+    }
+
+    /// Query WireMock's admin API and return all received requests.
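+    /// Issues a GET against `{base_url}/__admin/requests` and maps each entry into
+    /// a `WireMockRequest` (method, URL, body, headers).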
+    pub async fn get_received_requests(&self) -> Result<Vec<WireMockRequest>, TestBinaryError> {
+        let url = format!("{}/__admin/requests", self.base_url);
+        let response = reqwest::get(&url)
+            .await
+            .map_err(|e| TestBinaryError::InvalidState {
+                message: format!("Failed to query WireMock admin API: {e}"),
+            })?;
+
+        let body: serde_json::Value =
+            response
+                .json()
+                .await
+                .map_err(|e| TestBinaryError::InvalidState {
+                    message: format!("Failed to parse WireMock admin response: {e}"),
+                })?;
+
+        let empty = vec![];
+        let requests = body["requests"]
+            .as_array()
+            .unwrap_or(&empty)
+            .iter()
+            .map(|r| WireMockRequest {
+                method: r["request"]["method"].as_str().unwrap_or("").to_string(),
+                url: r["request"]["url"].as_str().unwrap_or("").to_string(),
+                body: r["request"]["body"].as_str().unwrap_or("").to_string(),
+                headers: r["request"]["headers"].clone(),
+            })
+            .collect();
+
+        Ok(requests)
+    }
+
+    /// Poll WireMock until the expected number of requests have been received.
+    pub async fn wait_for_requests(
+        &self,
+        expected: usize,
+    ) -> Result<Vec<WireMockRequest>, TestBinaryError> {
+        for _ in 0..DEFAULT_POLL_ATTEMPTS {
+            let requests = self.get_received_requests().await?;
+            if requests.len() >= expected {
+                info!(
+                    "WireMock received {} requests (expected {})",
+                    requests.len(),
+                    expected
+                );
+                return Ok(requests);
+            }
+            sleep(Duration::from_millis(DEFAULT_POLL_INTERVAL_MS)).await;
+        }
+
+        let actual = self.get_received_requests().await?.len();
+        Err(TestBinaryError::InvalidState {
+            message: format!(
+                "Expected at least {expected} requests in WireMock after {} attempts, got {actual}",
+                DEFAULT_POLL_ATTEMPTS
+            ),
+        })
+    }
+
+    /// Reset WireMock's request journal (clear received requests).
+    #[allow(dead_code)]
+    pub async fn reset_requests(&self) -> Result<(), TestBinaryError> {
+        let url = format!("{}/__admin/requests", self.base_url);
+        let client = reqwest::Client::new();
+        client
+            .delete(&url)
+            .send()
+            .await
+            .map_err(|e| TestBinaryError::InvalidState {
+                message: format!("Failed to reset WireMock requests: {e}"),
+            })?;
+        Ok(())
+    }
+}
+
+/// A request captured by WireMock's admin API.
+#[derive(Debug, Clone)]
+pub struct WireMockRequest {
+    pub method: String,
+    pub url: String,
+    pub body: String,
+    pub headers: serde_json::Value,
+}
+
+impl WireMockRequest {
+    /// Parse the body as JSON.
+    pub fn body_as_json(&self) -> Result<serde_json::Value, TestBinaryError> {
+        serde_json::from_str(&self.body).map_err(|e| TestBinaryError::InvalidState {
+            message: format!("Failed to parse request body as JSON: {e}"),
+        })
+    }
+
+    /// Get a header value by name (case-insensitive per RFC 7230).
+    /// WireMock may return header keys in any case, so we iterate and compare.
+    pub fn header(&self, name: &str) -> Option<String> {
+        // WireMock returns headers as {"Header-Name": {"values": ["value"]}}
+        // or just as a direct string value depending on version.
+        let obj = self.headers.as_object()?;
+        for (key, value) in obj {
+            if key.eq_ignore_ascii_case(name) {
+                if let Some(values) = value.get("values") {
+                    return values.get(0).and_then(|v| v.as_str()).map(String::from);
+                }
+                return value.as_str().map(String::from);
+            }
+        }
+        None
+    }
+}
diff --git a/core/integration/tests/connectors/fixtures/http/mod.rs b/core/integration/tests/connectors/fixtures/http/mod.rs
new file mode 100644
index 0000000000..c0e5f2c406
--- /dev/null
+++ b/core/integration/tests/connectors/fixtures/http/mod.rs
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +mod container; +mod sink; + +pub use sink::{ + HttpSinkIndividualFixture, HttpSinkJsonArrayFixture, HttpSinkMultiTopicFixture, + HttpSinkNdjsonFixture, HttpSinkNoMetadataFixture, HttpSinkRawFixture, +}; diff --git a/core/integration/tests/connectors/fixtures/http/sink.rs b/core/integration/tests/connectors/fixtures/http/sink.rs new file mode 100644 index 0000000000..28ba025731 --- /dev/null +++ b/core/integration/tests/connectors/fixtures/http/sink.rs @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use super::container::{ + DEFAULT_TEST_STREAM, DEFAULT_TEST_TOPIC, DEFAULT_TEST_TOPIC_2, ENV_SINK_BATCH_MODE, + ENV_SINK_INCLUDE_METADATA, ENV_SINK_MAX_RETRIES, ENV_SINK_METHOD, ENV_SINK_PATH, + ENV_SINK_RETRY_DELAY, ENV_SINK_STREAMS_0_CONSUMER_GROUP, ENV_SINK_STREAMS_0_SCHEMA, + ENV_SINK_STREAMS_0_STREAM, ENV_SINK_STREAMS_0_TOPICS, ENV_SINK_TIMEOUT, ENV_SINK_URL, + ENV_SINK_VERBOSE_LOGGING, HttpSinkWireMockContainer, +}; +use async_trait::async_trait; +use integration::harness::{TestBinaryError, TestFixture}; +use std::collections::HashMap; + +/// Base HTTP sink fixture — individual batch mode with metadata enabled. 
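+/// The other fixtures reuse `HttpSinkIndividualFixture::base_envs()` and override
+/// only the fields they change (batch mode, schema, metadata flag, topics).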
+pub struct HttpSinkIndividualFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkIndividualFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+
+    fn base_envs(container: &HttpSinkWireMockContainer) -> HashMap<String, String> {
+        let mut envs = HashMap::new();
+        envs.insert(
+            ENV_SINK_URL.to_string(),
+            format!("{}/ingest", container.base_url),
+        );
+        envs.insert(ENV_SINK_METHOD.to_string(), "POST".to_string());
+        envs.insert(ENV_SINK_BATCH_MODE.to_string(), "individual".to_string());
+        envs.insert(ENV_SINK_INCLUDE_METADATA.to_string(), "true".to_string());
+        envs.insert(ENV_SINK_TIMEOUT.to_string(), "10s".to_string());
+        envs.insert(ENV_SINK_MAX_RETRIES.to_string(), "1".to_string());
+        envs.insert(ENV_SINK_RETRY_DELAY.to_string(), "100ms".to_string());
+        envs.insert(ENV_SINK_VERBOSE_LOGGING.to_string(), "true".to_string());
+        envs.insert(
+            ENV_SINK_STREAMS_0_STREAM.to_string(),
+            DEFAULT_TEST_STREAM.to_string(),
+        );
+        envs.insert(
+            ENV_SINK_STREAMS_0_TOPICS.to_string(),
+            format!("[{}]", DEFAULT_TEST_TOPIC),
+        );
+        envs.insert(ENV_SINK_STREAMS_0_SCHEMA.to_string(), "json".to_string());
+        envs.insert(
+            ENV_SINK_STREAMS_0_CONSUMER_GROUP.to_string(),
+            "http_sink_cg".to_string(),
+        );
+        envs.insert(
+            ENV_SINK_PATH.to_string(),
+            "../../target/debug/libiggy_connector_http_sink".to_string(),
+        );
+        envs
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkIndividualFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        Self::base_envs(&self.container)
+    }
+}
+
+/// HTTP sink fixture with NDJSON batch mode.
+pub struct HttpSinkNdjsonFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkNdjsonFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkNdjsonFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        let mut envs = HttpSinkIndividualFixture::base_envs(&self.container);
+        envs.insert(ENV_SINK_BATCH_MODE.to_string(), "ndjson".to_string());
+        envs
+    }
+}
+
+/// HTTP sink fixture with JSON array batch mode.
+pub struct HttpSinkJsonArrayFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkJsonArrayFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkJsonArrayFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        let mut envs = HttpSinkIndividualFixture::base_envs(&self.container);
+        envs.insert(ENV_SINK_BATCH_MODE.to_string(), "json_array".to_string());
+        envs
+    }
+}
+
+/// HTTP sink fixture with raw batch mode (binary payloads).
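+/// Overrides both `batch_mode` and the stream `schema` to `raw` (see
+/// `connectors_runtime_envs` below), so payloads are delivered as raw bytes
+/// rather than decoded JSON.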
+pub struct HttpSinkRawFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkRawFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkRawFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        let mut envs = HttpSinkIndividualFixture::base_envs(&self.container);
+        envs.insert(ENV_SINK_BATCH_MODE.to_string(), "raw".to_string());
+        envs.insert(ENV_SINK_STREAMS_0_SCHEMA.to_string(), "raw".to_string());
+        envs
+    }
+}
+
+/// HTTP sink fixture with metadata disabled.
+pub struct HttpSinkNoMetadataFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkNoMetadataFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkNoMetadataFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        let mut envs = HttpSinkIndividualFixture::base_envs(&self.container);
+        envs.insert(ENV_SINK_INCLUDE_METADATA.to_string(), "false".to_string());
+        envs
+    }
+}
+
+/// HTTP sink fixture subscribed to two topics on the same stream.
+/// Demonstrates the multi-topic single-connector deployment pattern.
+pub struct HttpSinkMultiTopicFixture {
+    container: HttpSinkWireMockContainer,
+}
+
+impl HttpSinkMultiTopicFixture {
+    pub fn container(&self) -> &HttpSinkWireMockContainer {
+        &self.container
+    }
+}
+
+#[async_trait]
+impl TestFixture for HttpSinkMultiTopicFixture {
+    async fn setup() -> Result<Self, TestBinaryError> {
+        let container = HttpSinkWireMockContainer::start().await?;
+        Ok(Self { container })
+    }
+
+    fn connectors_runtime_envs(&self) -> HashMap<String, String> {
+        let mut envs = HttpSinkIndividualFixture::base_envs(&self.container);
+        // Subscribe to both topics — runtime spawns one task per topic
+        envs.insert(
+            ENV_SINK_STREAMS_0_TOPICS.to_string(),
+            format!("[{},{}]", DEFAULT_TEST_TOPIC, DEFAULT_TEST_TOPIC_2),
+        );
+        envs
+    }
+}
diff --git a/core/integration/tests/connectors/fixtures/mod.rs b/core/integration/tests/connectors/fixtures/mod.rs
index 6deae48664..ddde216631 100644
--- a/core/integration/tests/connectors/fixtures/mod.rs
+++ b/core/integration/tests/connectors/fixtures/mod.rs
@@ -18,6 +18,7 @@
  */
 
 mod elasticsearch;
+mod http;
 mod iceberg;
 mod mongodb;
@@ -25,6 +26,10 @@ mod quickwit;
 mod wiremock;
 
 pub use elasticsearch::{ElasticsearchSinkFixture, ElasticsearchSourcePreCreatedFixture};
+pub use http::{
+    HttpSinkIndividualFixture, HttpSinkJsonArrayFixture, HttpSinkMultiTopicFixture,
+    HttpSinkNdjsonFixture, HttpSinkNoMetadataFixture, HttpSinkRawFixture,
+};
 pub use iceberg::{DEFAULT_NAMESPACE, DEFAULT_TABLE, IcebergOps, IcebergPreCreatedFixture};
 pub use mongodb::{
     MongoDbOps, MongoDbSinkAutoCreateFixture, MongoDbSinkBatchFixture, MongoDbSinkFailpointFixture,
diff --git a/core/integration/tests/connectors/http/http_sink.rs b/core/integration/tests/connectors/http/http_sink.rs
new file mode 100644
index 0000000000..e41c59ab61
--- /dev/null
+++ b/core/integration/tests/connectors/http/http_sink.rs
@@ -0,0 +1,854 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! HTTP Sink Connector: Integration Tests +//! +//! **Purpose**: End-to-end validation of the HTTP sink connector — messages flow from +//! Iggy streams through the connector runtime, get transformed by the sink plugin, and +//! arrive at a real HTTP endpoint where we verify format, headers, metadata, and content. +//! +//! ## Connector Architecture +//! +//! The HTTP sink runs inside the Iggy connector runtime as a dynamically loaded plugin: +//! +//! ```text +//! ┌──────────────┐ ┌──────────────────────┐ ┌──────────────────┐ +//! │ Test Code │ │ Connector Runtime │ │ WireMock │ +//! │ │ │ │ │ │ +//! │ send_messages├───►│ iggy-server (poll) │ │ /__admin/ │ +//! │ │ │ │ │ │ (verify reqs) │ +//! │ │ │ ┌─────▼──────────┐ │ │ │ +//! │ wait_for_ │ │ │ HTTP Sink │ │ │ /ingest │ +//! │ requests ◄───┼────┤ │ (.so/.dylib) ├──┼───►│ (accept POST) │ +//! │ │ │ └────────────────┘ │ │ │ +//! └──────────────┘ └──────────────────────┘ └──────────────────┘ +//! ``` +//! +//! **Key components**: +//! 1. **iggy-server**: Stores messages in streams/topics, serves them to consumers +//! 2. **Connector runtime**: `iggy-connectors` binary, loads the HTTP sink `.so`/`.dylib` +//! plugin via FFI, polls topics, calls `iggy_sink_consume()` per batch +//! 3. **HTTP sink plugin**: Transforms messages into HTTP requests (4 batch modes), +//! applies metadata envelope, retries on failure +//! 4. **WireMock**: Docker container accepting all POSTs to `/ingest`, recording +//! requests for later verification via `/__admin/requests` +//! +//! **Runtime model**: 1 process = 1 config = 1 plugin. The runtime reads `config.toml`, +//! loads the plugin binary, iterates `for topic in stream.topics`, and spawns one +//! `tokio::spawn` task per topic. Each task creates an `IggyConsumer` and polls +//! sequentially — `consume()` is awaited before the next poll. +//! +//! See `setup_sink_consumers()` and `spawn_consume_tasks()` in `runtime/src/sink.rs`. +//! +//! ## What These Tests Validate +//! +//! **Test 1 — Individual Mode**: Each message becomes a separate HTTP POST with +//! metadata envelope (`{metadata: {...}, payload: {...}}`). Validates envelope +//! structure, content type, and per-message delivery. +//! +//! **Test 2 — NDJSON Batch Mode**: All messages arrive in one HTTP request as +//! newline-delimited JSON. Validates line count, per-line envelope structure, +//! and `application/x-ndjson` content type. +//! +//! **Test 3 — JSON Array Batch Mode**: All messages arrive in one HTTP request +//! as a JSON array. Validates array length, per-item envelope structure, and +//! `application/json` content type. +//! +//! **Test 4 — Raw Mode**: Each message sent as raw bytes without metadata envelope. +//! Validates `application/octet-stream` content type and absence of envelope wrapper. +//! +//! **Test 5 — Metadata Disabled**: Individual mode with `include_metadata=false`. +//! 
Validates that the bare payload arrives without the `{metadata, payload}` wrapper.
+//!
+//! **Test 6 — Sequential Offsets**: Sends 5 messages and verifies `iggy_offset` values
+//! in metadata are contiguous (each offset = previous + 1). Validates that the
+//! connector preserves Iggy's offset ordering through the HTTP delivery pipeline.
+//!
+//! **Test 7 — Multi-Topic**: One connector consuming from two topics on the same
+//! stream. Validates that `iggy_topic` metadata correctly identifies the source topic,
+//! and that messages from both topics arrive at the shared endpoint. Exercises the
+//! runtime's per-topic task spawning (`spawn_consume_tasks()` in `runtime/src/sink.rs`).
+//!
+//! ## Test Infrastructure
+//!
+//! **Full-Stack Integration** (all components are real — no mocks):
+//! - **iggy-server**: Started by `#[iggy_harness]` macro, in-process
+//! - **Connector runtime**: Started by harness with `connectors_runtime(config_path = ...)`
+//! - **HTTP sink plugin**: Built from `core/connectors/sinks/http_sink/` (must be compiled)
+//! - **WireMock**: Docker container (`wiremock/wiremock:3.13.2`) via testcontainers
+//! - **Test fixtures**: `HttpSink*Fixture` structs configure batch mode, metadata, topics
+//!   via environment variables that override `config.toml` fields
+//!
+//! **Fixture Architecture**:
+//! Each fixture implements the `TestFixture` trait; its `connectors_runtime_envs()` returns
+//! env overrides for the plugin config. The base configuration
+//! (`HttpSinkIndividualFixture::base_envs`) sets URL, method, timeout, retries,
+//! stream/topic, and schema. Specialized fixtures (NDJSON, JSON array, raw,
+//! no-metadata, multi-topic) override specific fields.
+//!
+//! **WireMock Container**:
+//! Accepts all POSTs to `/ingest` (via the `accept-ingest.json` mapping). Exposes
+//! `/__admin/requests` for polling received requests. The container uses a bind mount
+//! for mappings and a health check wait strategy for readiness.
+//!
+//! **Seed Data**:
+//! `seeds::connector_stream` creates the stream (`test_stream`) and first topic
+//! (`test_topic`). The multi-topic test instead seeds with
+//! `seeds::connector_multi_topic_stream`, which also creates `test_topic_2`; both
+//! topics must exist before the connector runtime starts, since its health check
+//! validates every configured topic.
+//!
+//! **Configuration** (`tests/connectors/http/sink.toml`):
+//! ```toml
+//! [connectors]
+//! config_type = "local"
+//! config_dir = "../connectors/sinks/http_sink"
+//! ```
+//! Environment variables override `config.toml` fields at runtime. Convention:
+//! `IGGY_CONNECTORS_SINK_HTTP_PLUGIN_CONFIG_<FIELD>` (e.g., `..._BATCH_MODE=ndjson`).
+//!
+//! ## Running Tests
+//!
+//! ```bash
+//! # Prerequisites: Docker running, HTTP sink plugin compiled
+//! cargo build -p iggy_connector_http_sink
+//!
+//! # Run all HTTP sink integration tests
+//! cargo test -p integration --test connectors -- http_sink --nocapture
+//!
+//! # Run a specific test
+//! cargo test -p integration --test connectors -- individual_json_messages --nocapture
+//!
+//! # Run with test isolation (sequential)
+//! cargo test -p integration --test connectors -- http_sink --test-threads=1 --nocapture
+//! ```
+//!
+//! ## Success Criteria
+//!
+//! - **All 4 batch modes**: Messages arrive in correct format (individual, ndjson, json_array, raw)
+//! - **Metadata envelope**: Present when `include_metadata=true`, absent when `false`
+//! - **Content types**: `application/json` (individual/json_array), `application/x-ndjson`,
+//!   `application/octet-stream` (raw)
+//! - **Offset ordering**: Sequential, contiguous offsets in metadata
+//!
- **Multi-topic routing**: `iggy_topic` metadata matches source topic for each message +//! - **Message counts**: Exact match between sent and received message counts +//! +//! ## Related Documentation +//! +//! - **HTTP Sink README**: `core/connectors/sinks/http_sink/README.md` — Config reference, +//! deployment patterns, retry strategy, connection pooling, message flow +//! - **Connector Runtime**: `runtime/src/sink.rs` — `setup_sink_consumers()`, +//! `spawn_consume_tasks()`, `consume_messages()`, FFI boundary +//! - **SDK Macro**: `sdk/src/sink.rs` — `sink_connector!` macro, `SinkContainer`, DashMap +//! - **Fixtures**: `tests/connectors/fixtures/http/` — WireMock container, fixture structs +//! - **PR**: https://github.com/apache/iggy/pull/2925 +//! - **Discussion**: https://github.com/apache/iggy/discussions/2919 +//! +//! ## Known Limitations +//! +//! 1. **FFI return value ignored**: The runtime's `process_messages()` discards `consume()`'s +//! `i32` return code. Errors are logged by the sink but invisible to the runtime. +//! See [#2927](https://github.com/apache/iggy/issues/2927). +//! 2. **Offsets committed before processing**: `PollingMessages` auto-commit strategy commits +//! offsets before `consume()`. Combined with (1), effective guarantee is at-most-once. +//! See [#2928](https://github.com/apache/iggy/issues/2928). +//! +//! ## Test History +//! +//! - **2026-03-10**: Initial test suite — 6 tests covering all batch modes, metadata toggle, +//! and sequential offset verification. +//! - **2026-03-11**: Added multi-topic test (Test 7) using `seeds::connector_multi_topic_stream` +//! and `seeds::names::TOPIC_2`. Connector runtime requires all configured topics to exist +//! before startup, so the seed creates both topics. +//! - **2026-03-12**: Code review rounds 3+4 (double-review protocol). Fixed: magic string +//! match arms replaced with constants (M9). +//! - **2026-03-20**: Maintainer review (hubcio). Addressed 13 items: consuming iterator, +//! DRY refactor, pre-built HeaderMap, HashSet status codes, UUID v8, iggy headers forwarding, +//! overlap warning, latency docs, test doc trimming, config cleanup. + +use super::TEST_MESSAGE_COUNT; +use crate::connectors::fixtures::{ + HttpSinkIndividualFixture, HttpSinkJsonArrayFixture, HttpSinkMultiTopicFixture, + HttpSinkNdjsonFixture, HttpSinkNoMetadataFixture, HttpSinkRawFixture, +}; +use bytes::Bytes; +use iggy_common::{Identifier, IggyMessage, MessageClient, Partitioning}; +use integration::harness::seeds; +use integration::iggy_harness; + +// ============================================================================ +// Test 1: Individual Batch Mode +// ============================================================================ + +/// Validates `batch_mode=individual`: one HTTP POST per message, each with metadata envelope. +/// Checks request count = message count, envelope structure, and `application/json` content type. 
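+///
+/// A delivered body should look roughly like the sketch below (values illustrative;
+/// extra metadata fields depend on the sink config):
+/// ```json
+/// {
+///   "metadata": { "iggy_stream": "test_stream", "iggy_topic": "test_topic", "iggy_offset": 0 },
+///   "payload": { "name": "Alice", "age": 30 }
+/// }
+/// ```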
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn individual_json_messages_delivered_as_separate_posts(
+    harness: &TestHarness,
+    fixture: HttpSinkIndividualFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 3 JSON messages with distinct payloads
+    let json_payloads: Vec<serde_json::Value> = vec![
+        serde_json::json!({"name": "Alice", "age": 30}),
+        serde_json::json!({"name": "Bob", "score": 99}),
+        serde_json::json!({"name": "Carol", "active": true}),
+    ];
+
+    let mut messages: Vec<IggyMessage> = json_payloads
+        .iter()
+        .enumerate()
+        .map(|(i, payload)| {
+            let bytes = serde_json::to_vec(payload).expect("Failed to serialize");
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(bytes))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for WireMock to receive all 3 individual HTTP requests
+    // In individual mode, each message becomes a separate HTTP request.
+    let requests = fixture
+        .container()
+        .wait_for_requests(TEST_MESSAGE_COUNT)
+        .await
+        .expect("WireMock did not receive expected number of requests");
+
+    assert_eq!(
+        requests.len(),
+        TEST_MESSAGE_COUNT,
+        "Expected exactly {TEST_MESSAGE_COUNT} individual requests, got {}",
+        requests.len()
+    );
+
+    // Step 4: Verify each request has correct method, URL, and envelope structure
+    for req in &requests {
+        assert_eq!(req.method, "POST", "Expected POST method");
+        assert_eq!(req.url, "/ingest", "Expected /ingest URL");
+
+        let body = req.body_as_json().expect("Body should be valid JSON");
+
+        // Metadata envelope: {metadata: {...}, payload: {...}}
+        assert!(
+            body.get("metadata").is_some(),
+            "Expected metadata envelope in individual mode, got: {body}"
+        );
+        assert!(
+            body.get("payload").is_some(),
+            "Expected payload field in individual mode, got: {body}"
+        );
+
+        // Verify standard metadata fields from Iggy context
+        let metadata = &body["metadata"];
+        assert!(
+            metadata.get("iggy_stream").is_some(),
+            "Expected iggy_stream in metadata"
+        );
+        assert!(
+            metadata.get("iggy_topic").is_some(),
+            "Expected iggy_topic in metadata"
+        );
+        assert!(
+            metadata.get("iggy_offset").is_some(),
+            "Expected iggy_offset in metadata"
+        );
+    }
+
+    // Step 5: Verify content type header
+    let ct = requests[0]
+        .header("Content-Type")
+        .expect("Content-Type header must be present");
+    assert!(
+        ct.contains("application/json"),
+        "Expected application/json content type, got: {ct}"
+    );
+}
+
+// ============================================================================
+// Test 2: NDJSON Batch Mode
+// ============================================================================
+
+/// Validates `batch_mode=ndjson`: all messages in one request as newline-delimited JSON.
+/// Checks single request, line count = message count, per-line envelope, `application/x-ndjson`.
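+///
+/// Expected body shape, one envelope per line (sketch; values illustrative):
+/// ```text
+/// {"metadata":{"iggy_offset":0},"payload":{"event":"login","user":1}}
+/// {"metadata":{"iggy_offset":1},"payload":{"event":"click","user":2}}
+/// ```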
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn ndjson_messages_delivered_as_single_request(
+    harness: &TestHarness,
+    fixture: HttpSinkNdjsonFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 3 JSON event messages
+    let json_payloads: Vec<serde_json::Value> = vec![
+        serde_json::json!({"event": "login", "user": 1}),
+        serde_json::json!({"event": "click", "user": 2}),
+        serde_json::json!({"event": "logout", "user": 3}),
+    ];
+
+    let mut messages: Vec<IggyMessage> = json_payloads
+        .iter()
+        .enumerate()
+        .map(|(i, payload)| {
+            let bytes = serde_json::to_vec(payload).expect("Failed to serialize");
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(bytes))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for single NDJSON request (all messages batched into one)
+    let requests = fixture
+        .container()
+        .wait_for_requests(1)
+        .await
+        .expect("WireMock did not receive NDJSON request");
+
+    let req = &requests[0];
+    assert_eq!(req.method, "POST", "Expected POST method");
+    assert_eq!(req.url, "/ingest", "Expected /ingest URL");
+
+    // Step 4: Parse NDJSON body — each line is a separate JSON envelope
+    let lines: Vec<&str> = req.body.trim().lines().collect();
+    assert_eq!(
+        lines.len(),
+        TEST_MESSAGE_COUNT,
+        "Expected {TEST_MESSAGE_COUNT} NDJSON lines, got {}",
+        lines.len()
+    );
+
+    for (i, line) in lines.iter().enumerate() {
+        let parsed: serde_json::Value =
+            serde_json::from_str(line).unwrap_or_else(|e| panic!("NDJSON line {i} invalid: {e}"));
+        assert!(
+            parsed.get("metadata").is_some(),
+            "Expected metadata in NDJSON line {i}"
+        );
+        assert!(
+            parsed.get("payload").is_some(),
+            "Expected payload in NDJSON line {i}"
+        );
+    }
+
+    // Step 5: Verify NDJSON content type
+    let ct = req
+        .header("Content-Type")
+        .expect("Content-Type header must be present");
+    assert!(
+        ct.contains("application/x-ndjson"),
+        "Expected application/x-ndjson content type, got: {ct}"
+    );
+}
+
+// ============================================================================
+// Test 3: JSON Array Batch Mode
+// ============================================================================
+
+/// Validates `batch_mode=json_array`: all messages as a single JSON array in one request.
+/// Checks single request, array length = message count, per-item envelope, `application/json`.
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn json_array_messages_delivered_as_single_request(
+    harness: &TestHarness,
+    fixture: HttpSinkJsonArrayFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 3 JSON messages representing different event types
+    let json_payloads: Vec<serde_json::Value> = vec![
+        serde_json::json!({"id": 1, "type": "order"}),
+        serde_json::json!({"id": 2, "type": "payment"}),
+        serde_json::json!({"id": 3, "type": "refund"}),
+    ];
+
+    let mut messages: Vec<IggyMessage> = json_payloads
+        .iter()
+        .enumerate()
+        .map(|(i, payload)| {
+            let bytes = serde_json::to_vec(payload).expect("Failed to serialize");
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(bytes))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for single JSON array request (all messages in one body)
+    let requests = fixture
+        .container()
+        .wait_for_requests(1)
+        .await
+        .expect("WireMock did not receive JSON array request");
+
+    let req = &requests[0];
+    assert_eq!(req.method, "POST", "Expected POST method");
+    assert_eq!(req.url, "/ingest", "Expected /ingest URL");
+
+    // Step 4: Parse body as JSON array and verify structure
+    let body = req.body_as_json().expect("Body should be valid JSON");
+    assert!(body.is_array(), "Expected JSON array body, got: {body}");
+
+    let arr = body.as_array().unwrap();
+    assert_eq!(
+        arr.len(),
+        TEST_MESSAGE_COUNT,
+        "Expected {TEST_MESSAGE_COUNT} items in JSON array, got {}",
+        arr.len()
+    );
+
+    for (i, item) in arr.iter().enumerate() {
+        assert!(
+            item.get("metadata").is_some(),
+            "Expected metadata in array item {i}"
+        );
+        assert!(
+            item.get("payload").is_some(),
+            "Expected payload in array item {i}"
+        );
+    }
+
+    // Step 5: Verify JSON content type
+    let ct = req
+        .header("Content-Type")
+        .expect("Content-Type header must be present");
+    assert!(
+        ct.contains("application/json"),
+        "Expected application/json content type, got: {ct}"
+    );
+}
+
+// ============================================================================
+// Test 4: Raw Batch Mode
+// ============================================================================
+
+/// Validates `batch_mode=raw`: each message as raw bytes without metadata envelope.
+/// Checks request count = message count, no envelope wrapper, `application/octet-stream`.
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn raw_binary_messages_delivered_without_envelope(
+    harness: &TestHarness,
+    fixture: HttpSinkRawFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 3 raw byte messages
+    let raw_payloads: Vec<Vec<u8>> = vec![
+        b"plain text message".to_vec(),
+        b"another raw payload".to_vec(),
+        b"third raw message".to_vec(),
+    ];
+
+    let mut messages: Vec<IggyMessage> = raw_payloads
+        .iter()
+        .enumerate()
+        .map(|(i, payload)| {
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(payload.clone()))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for all 3 raw HTTP requests (raw mode is always 1:1)
+    let requests = fixture
+        .container()
+        .wait_for_requests(TEST_MESSAGE_COUNT)
+        .await
+        .expect("WireMock did not receive expected raw requests");
+
+    assert_eq!(
+        requests.len(),
+        TEST_MESSAGE_COUNT,
+        "Expected exactly {TEST_MESSAGE_COUNT} raw requests, got {}",
+        requests.len()
+    );
+
+    // Step 4: Verify raw mode — no metadata envelope
+    for req in &requests {
+        assert_eq!(req.method, "POST", "Expected POST method");
+        assert_eq!(req.url, "/ingest", "Expected /ingest URL");
+
+        // Raw mode: body is raw bytes, NOT a JSON envelope.
+        // If the body happens to parse as JSON, it must NOT have a "metadata" key.
+        if let Ok(json) = req.body_as_json() {
+            assert!(
+                json.get("metadata").is_none(),
+                "Raw mode should not include metadata envelope"
+            );
+        }
+    }
+
+    // Step 5: Verify raw content type
+    let ct = requests[0]
+        .header("Content-Type")
+        .expect("Content-Type header must be present");
+    assert!(
+        ct.contains("application/octet-stream"),
+        "Expected application/octet-stream for raw mode, got: {ct}"
+    );
+}
+
+// ============================================================================
+// Test 5: Metadata Disabled
+// ============================================================================
+
+/// Validates `include_metadata=false`: bare payload without `{metadata, payload}` envelope.
+/// Checks no metadata field in body, original payload fields at top level.
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn metadata_disabled_sends_bare_payload(
+    harness: &TestHarness,
+    fixture: HttpSinkNoMetadataFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 3 simple JSON messages
+    let json_payloads: Vec<serde_json::Value> = vec![
+        serde_json::json!({"key": "value1"}),
+        serde_json::json!({"key": "value2"}),
+        serde_json::json!({"key": "value3"}),
+    ];
+
+    let mut messages: Vec<IggyMessage> = json_payloads
+        .iter()
+        .enumerate()
+        .map(|(i, payload)| {
+            let bytes = serde_json::to_vec(payload).expect("Failed to serialize");
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(bytes))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for WireMock to receive all requests
+    let requests = fixture
+        .container()
+        .wait_for_requests(TEST_MESSAGE_COUNT)
+        .await
+        .expect("WireMock did not receive requests");
+
+    // Step 4: Verify bare payload — no metadata wrapper
+    for (i, req) in requests.iter().enumerate() {
+        let body = req
+            .body_as_json()
+            .unwrap_or_else(|e| panic!("Request {i} body should be valid JSON: {e}"));
+
+        // Without metadata, the body IS the payload — no wrapping
+        assert!(
+            body.get("metadata").is_none(),
+            "Expected no metadata envelope when include_metadata=false, got: {body}"
+        );
+
+        // The original payload fields should be at the top level
+        assert!(
+            body.get("key").is_some(),
+            "Expected bare payload with 'key' field, got: {body}"
+        );
+    }
+}
+
+// ============================================================================
+// Test 6: Sequential Offset Verification
+// ============================================================================
+
+/// Validates sequential offset integrity: `iggy_offset` values are contiguous across
+/// 5 delivered messages. Sorts by offset and checks each = previous + 1.
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_stream
+)]
+async fn individual_messages_have_sequential_offsets(
+    harness: &TestHarness,
+    fixture: HttpSinkIndividualFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Build 5 messages (more than default 3 to better test ordering)
+    let mut messages: Vec<IggyMessage> = (0..5)
+        .map(|i| {
+            let payload =
+                serde_json::to_vec(&serde_json::json!({"idx": i})).expect("Failed to serialize");
+            IggyMessage::builder()
+                .id((i + 1) as u128)
+                .payload(Bytes::from(payload))
+                .build()
+                .expect("Failed to build message")
+        })
+        .collect();
+
+    // Step 2: Publish messages to Iggy
+    client
+        .send_messages(
+            &stream_id,
+            &topic_id,
+            &Partitioning::partition_id(0),
+            &mut messages,
+        )
+        .await
+        .expect("Failed to send messages");
+
+    // Step 3: Wait for all 5 requests
+    let requests = fixture
+        .container()
+        .wait_for_requests(5)
+        .await
+        .expect("WireMock did not receive all 5 requests");
+
+    // Step 4: Extract offsets from metadata
+    // Note: offsets may not start at 0 if the seed already published messages.
+    let mut offsets: Vec<i64> = requests
+        .iter()
+        .enumerate()
+        .map(|(i, r)| {
+            let body = r
+                .body_as_json()
+                .unwrap_or_else(|e| panic!("Request {i} body is not valid JSON: {e}"));
+            body["metadata"]["iggy_offset"].as_i64().unwrap_or_else(|| {
+                panic!(
+                    "Request {i} missing or non-integer iggy_offset in metadata: {}",
+                    body["metadata"]
+                )
+            })
+        })
+        .collect();
+
+    // Step 5: Sort and verify contiguous offsets (delivery order may vary)
+    offsets.sort();
+    assert_eq!(
+        offsets.len(),
+        5,
+        "Expected 5 offsets, got {}",
+        offsets.len()
+    );
+
+    for window in offsets.windows(2) {
+        assert_eq!(
+            window[1],
+            window[0] + 1,
+            "Offsets must be contiguous: got {} then {}",
+            window[0],
+            window[1]
+        );
+    }
+}
+
+// ============================================================================
+// Test 7: Multi-Topic Delivery
+// ============================================================================
+
+/// Validates multi-topic delivery: one connector consuming two topics on the same stream.
+/// Sends 2 messages to topic 1, 1 to topic 2, verifies `iggy_topic` metadata matches source.
+#[iggy_harness(
+    server(connectors_runtime(config_path = "tests/connectors/http/sink.toml")),
+    seed = seeds::connector_multi_topic_stream
+)]
+async fn multi_topic_messages_delivered_with_correct_topic_metadata(
+    harness: &TestHarness,
+    fixture: HttpSinkMultiTopicFixture,
+) {
+    let client = harness.root_client().await.unwrap();
+    let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap();
+    let topic_1_id: Identifier = seeds::names::TOPIC.try_into().unwrap();
+
+    // Step 1: Both topics created by connector_multi_topic_stream seed (runs before
+    // connector runtime starts — runtime health check requires all configured topics).
+    let topic_2_id: Identifier = seeds::names::TOPIC_2.try_into().unwrap();
+
+    // Step 2: Send 2 messages to topic 1 with source identifier in payload
+    let mut topic_1_messages: Vec<IggyMessage> = vec![
+        IggyMessage::builder()
+            .payload(Bytes::from(
+                serde_json::to_vec(&serde_json::json!({"source": "topic_1", "idx": 0})).unwrap(),
+            ))
+            .build()
+            .unwrap(),
+        IggyMessage::builder()
+            .payload(Bytes::from(
+                serde_json::to_vec(&serde_json::json!({"source": "topic_1", "idx": 1})).unwrap(),
+            ))
+            .build()
+            .unwrap(),
+    ];
+
+    client
+        .send_messages(
+            &stream_id,
+            &topic_1_id,
+            &Partitioning::partition_id(0),
+            &mut topic_1_messages,
+        )
+        .await
+        .expect("Failed to send messages to topic 1");
+
+    // Step 3: Send 1 message to topic 2 with different source identifier
+    let mut topic_2_messages: Vec<IggyMessage> = vec![
+        IggyMessage::builder()
+            .payload(Bytes::from(
+                serde_json::to_vec(&serde_json::json!({"source": "topic_2", "idx": 0})).unwrap(),
+            ))
+            .build()
+            .unwrap(),
+    ];
+
+    client
+        .send_messages(
+            &stream_id,
+            &topic_2_id,
+            &Partitioning::partition_id(0),
+            &mut topic_2_messages,
+        )
+        .await
+        .expect("Failed to send messages to topic 2");
+
+    // Step 4: Wait for all 3 messages (2 from topic 1 + 1 from topic 2)
+    let requests = fixture
+        .container()
+        .wait_for_requests(3)
+        .await
+        .expect("WireMock did not receive all 3 requests");
+
+    // Step 5: Group by iggy_topic metadata and verify counts + payload content
+    let mut topic_1_count = 0usize;
+    let mut topic_2_count = 0usize;
+
+    for (i, req) in requests.iter().enumerate() {
+        let body = req
+            .body_as_json()
+            .unwrap_or_else(|e| panic!("Request {i} body is not valid JSON: {e}"));
+
+        let iggy_topic = body["metadata"]["iggy_topic"].as_str().unwrap_or_else(|| {
+            panic!(
+                "Request {i} missing iggy_topic in metadata: {}",
+                body["metadata"]
+            )
+        });
+
+        // Match against constants — not magic strings (code review M9)
+        match iggy_topic {
+            t if t == seeds::names::TOPIC => {
+                topic_1_count += 1;
+                let source = body["payload"]["source"]
+                    .as_str()
+                    .expect("Missing source field");
+                assert_eq!(source, "topic_1", "Topic 1 message has wrong source");
+            }
+            t if t == seeds::names::TOPIC_2 => {
+                topic_2_count += 1;
+                let source = body["payload"]["source"]
+                    .as_str()
+                    .expect("Missing source field");
+                assert_eq!(source, "topic_2", "Topic 2 message has wrong source");
+            }
+            other => panic!("Unexpected iggy_topic value: {other}"),
+        }
+    }
+
+    // Step 6: Verify exact message counts per topic
+    assert_eq!(
+        topic_1_count, 2,
+        "Expected 2 messages from topic 1, got {topic_1_count}"
+    );
+    assert_eq!(
+        topic_2_count, 1,
+        "Expected 1 message from topic 2, got {topic_2_count}"
+    );
+}
diff --git a/core/integration/tests/connectors/http/mod.rs b/core/integration/tests/connectors/http/mod.rs
new file mode 100644
index 0000000000..637ce349f2
--- /dev/null
+++ b/core/integration/tests/connectors/http/mod.rs
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +mod http_sink; + +const TEST_MESSAGE_COUNT: usize = 3; diff --git a/core/integration/tests/connectors/http/sink.toml b/core/integration/tests/connectors/http/sink.toml new file mode 100644 index 0000000000..0d8fa9b2c8 --- /dev/null +++ b/core/integration/tests/connectors/http/sink.toml @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[connectors] +config_type = "local" +config_dir = "../connectors/sinks/http_sink" diff --git a/core/integration/tests/connectors/http/wiremock/mappings/accept-ingest.json b/core/integration/tests/connectors/http/wiremock/mappings/accept-ingest.json new file mode 100644 index 0000000000..52378fe12d --- /dev/null +++ b/core/integration/tests/connectors/http/wiremock/mappings/accept-ingest.json @@ -0,0 +1,13 @@ +{ + "request": { + "method": "POST", + "urlPattern": "/ingest.*" + }, + "response": { + "status": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": "{\"status\":\"ok\"}" + } +} diff --git a/core/integration/tests/connectors/mod.rs b/core/integration/tests/connectors/mod.rs index 0d93529049..3fc9e9ac1f 100644 --- a/core/integration/tests/connectors/mod.rs +++ b/core/integration/tests/connectors/mod.rs @@ -20,6 +20,7 @@ mod api; mod elasticsearch; mod fixtures; +mod http; mod http_config_provider; mod iceberg; mod mongodb;