-
Notifications
You must be signed in to change notification settings - Fork 4.6k
[GSoC 2026] Kafka Streams runner — KafkaStreamsTestRunner test harness #39211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feat/18479-kafka-streams-runner-skeleton
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,190 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.beam.runners.kafka.streams; | ||
|
|
||
| import java.time.Duration; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Properties; | ||
| import java.util.Set; | ||
| import java.util.UUID; | ||
| import org.apache.beam.model.pipeline.v1.RunnerApi; | ||
| import org.apache.beam.runners.fnexecution.provisioning.JobInfo; | ||
| import org.apache.beam.runners.kafka.streams.translation.KafkaStreamsPipelineTranslator; | ||
| import org.apache.beam.runners.kafka.streams.translation.KafkaStreamsTranslationContext; | ||
| import org.apache.beam.sdk.Pipeline; | ||
| import org.apache.beam.sdk.options.PipelineOptions; | ||
| import org.apache.beam.sdk.options.PipelineOptionsFactory; | ||
| import org.apache.beam.sdk.options.PortablePipelineOptions; | ||
| import org.apache.beam.sdk.testing.CrashingRunner; | ||
| import org.apache.beam.sdk.util.construction.Environments; | ||
| import org.apache.beam.sdk.util.construction.PipelineOptionsTranslation; | ||
| import org.apache.beam.sdk.util.construction.PipelineTranslation; | ||
| import org.apache.kafka.common.serialization.ByteArrayDeserializer; | ||
| import org.apache.kafka.common.serialization.ByteArraySerializer; | ||
| import org.apache.kafka.common.serialization.Serdes; | ||
| import org.apache.kafka.streams.StreamsConfig; | ||
| import org.apache.kafka.streams.TestInputTopic; | ||
| import org.apache.kafka.streams.TestOutputTopic; | ||
| import org.apache.kafka.streams.Topology; | ||
| import org.apache.kafka.streams.TopologyDescription; | ||
| import org.apache.kafka.streams.TopologyTestDriver; | ||
| import org.apache.kafka.streams.test.TestRecord; | ||
|
|
||
| /** | ||
| * Test harness that runs a Beam {@link Pipeline} through the Kafka Streams runner's translation and | ||
| * a {@link TopologyTestDriver}, so tests do not repeat the translate + drive boilerplate. | ||
| * | ||
| * <p>Usage: build a pipeline with {@link #testOptions()}, then call {@link #run(Pipeline)}. Side | ||
| * effects (e.g. a {@code SharedTestCollector} written by a recording DoFn) have completed when it | ||
| * returns. | ||
| * | ||
| * <p>{@link TopologyTestDriver} does not loop a low-level sink topic back into its source, so an | ||
| * internal repartition topic (one that is both a sink and a source in the topology — e.g. the one | ||
| * GroupByKey introduces) would otherwise dead-end. {@link #run(Pipeline)} discovers those topics | ||
| * from the {@link TopologyDescription} and round-trips them until no more records flow, standing in | ||
| * for the broker. | ||
| */ | ||
| public final class KafkaStreamsTestRunner { | ||
|
|
||
/** Upper bound on drain-and-replay rounds; exceeding it makes roundTripInternalTopics throw. */
| private static final int MAX_ROUND_TRIPS = 100; | ||
|
|
||
/** Not meant to be instantiated; the visible members of this class are all static. */
| private KafkaStreamsTestRunner() {} | ||
|
|
||
| /** Pipeline options for a Kafka Streams runner test: the EMBEDDED harness and a unique app id. */ | ||
| public static PipelineOptions testOptions() { | ||
| String applicationId = "ks-test-" + UUID.randomUUID(); | ||
| PipelineOptions options = | ||
| PipelineOptionsFactory.fromArgs("--applicationId=" + applicationId).create(); | ||
| options.setRunner(CrashingRunner.class); | ||
| options.as(KafkaStreamsPipelineOptions.class).setApplicationId(applicationId); | ||
| options | ||
| .as(PortablePipelineOptions.class) | ||
| .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED); | ||
| return options; | ||
| } | ||
|
|
||
| /** | ||
| * Translates the pipeline into a Kafka Streams {@link KafkaStreamsTranslationContext}. Tests that | ||
| * need the {@link Topology} (e.g. to attach a capture processor before driving) use this and | ||
| * build their own {@link TopologyTestDriver}; simpler tests use {@link #run(Pipeline)}. | ||
| */ | ||
| public static KafkaStreamsTranslationContext translate(Pipeline pipeline) { | ||
| KafkaStreamsPipelineOptions options = | ||
| pipeline.getOptions().as(KafkaStreamsPipelineOptions.class); | ||
| RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline); | ||
| KafkaStreamsPipelineTranslator translator = new KafkaStreamsPipelineTranslator(); | ||
| JobInfo jobInfo = | ||
| JobInfo.create( | ||
| options.getApplicationId(), | ||
| options.getJobName(), | ||
| "", | ||
| PipelineOptionsTranslation.toProto(options)); | ||
| KafkaStreamsTranslationContext context = translator.createTranslationContext(jobInfo, options); | ||
| translator.translate(context, translator.prepareForTranslation(pipelineProto)); | ||
| return context; | ||
| } | ||
|
|
||
| /** Translates and drives the pipeline to quiescence through a {@link TopologyTestDriver}. */ | ||
| public static void run(Pipeline pipeline) { | ||
| KafkaStreamsTranslationContext context = translate(pipeline); | ||
| Topology topology = context.getTopology(); | ||
| try (TopologyTestDriver driver = new TopologyTestDriver(topology, streamsConfig(pipeline))) { | ||
| // Fire the Impulse wall-clock punctuator and let the initial records flow. | ||
| driver.advanceWallClockTime(Duration.ofSeconds(1)); | ||
| driver.advanceWallClockTime(Duration.ofSeconds(1)); | ||
| roundTripInternalTopics(driver, internalTopics(topology)); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * The name of the single processor node with no successors (the topology leaf). Tests attach a | ||
| * capture processor here to observe what the last stage forwards. | ||
| */ | ||
| public static String leafProcessorName(Topology topology) { | ||
| for (TopologyDescription.Subtopology subtopology : topology.describe().subtopologies()) { | ||
| for (TopologyDescription.Node node : subtopology.nodes()) { | ||
| if (node instanceof TopologyDescription.Processor && node.successors().isEmpty()) { | ||
| return node.name(); | ||
| } | ||
| } | ||
| } | ||
| throw new IllegalStateException("no leaf processor found in topology"); | ||
| } | ||
|
|
||
| /** Repartition/internal topics are the ones that appear as both a sink and a source. */ | ||
| private static Set<String> internalTopics(Topology topology) { | ||
| Set<String> sinkTopics = new HashSet<>(); | ||
| Set<String> sourceTopics = new HashSet<>(); | ||
| for (TopologyDescription.Subtopology subtopology : topology.describe().subtopologies()) { | ||
| for (TopologyDescription.Node node : subtopology.nodes()) { | ||
| if (node instanceof TopologyDescription.Sink) { | ||
| String topic = ((TopologyDescription.Sink) node).topic(); | ||
| if (topic != null) { | ||
| sinkTopics.add(topic); | ||
| } | ||
| } else if (node instanceof TopologyDescription.Source) { | ||
| sourceTopics.addAll(((TopologyDescription.Source) node).topicSet()); | ||
| } | ||
|
Comment on lines
+141
to
+143
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If a source node in the topology is configured using a } else if (node instanceof TopologyDescription.Source) {
Set<String> topics = ((TopologyDescription.Source) node).topicSet();
if (topics != null) {
sourceTopics.addAll(topics);
}
} |
||
| } | ||
| } | ||
| sinkTopics.retainAll(sourceTopics); | ||
| return sinkTopics; | ||
| } | ||
|
|
||
| /** Drains each internal topic's output and feeds it back into the source until nothing flows. */ | ||
| private static void roundTripInternalTopics(TopologyTestDriver driver, Set<String> topics) { | ||
| for (int round = 0; round < MAX_ROUND_TRIPS; round++) { | ||
| boolean progressed = false; | ||
| for (String topic : topics) { | ||
| TestOutputTopic<byte[], byte[]> out = | ||
| driver.createOutputTopic( | ||
| topic, new ByteArrayDeserializer(), new ByteArrayDeserializer()); | ||
| List<TestRecord<byte[], byte[]>> records = out.readRecordsToList(); | ||
| if (records.isEmpty()) { | ||
| continue; | ||
| } | ||
| progressed = true; | ||
| TestInputTopic<byte[], byte[]> in = | ||
| driver.createInputTopic(topic, new ByteArraySerializer(), new ByteArraySerializer()); | ||
| for (TestRecord<byte[], byte[]> record : records) { | ||
| in.pipeInput(record); | ||
| } | ||
| } | ||
| if (!progressed) { | ||
| return; | ||
| } | ||
| } | ||
| throw new IllegalStateException( | ||
| "Internal topics did not reach quiescence after " + MAX_ROUND_TRIPS + " round trips"); | ||
| } | ||
|
Comment on lines
+151
to
+175
Contributor
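There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Creating the `TestOutputTopic` and `TestInputTopic` instances inside the round-trip loop repeats the same work on every round. Create them once per topic before the loop and reuse them:
private static void roundTripInternalTopics(TopologyTestDriver driver, Set<String> topics) {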
java.util.Map<String, TestOutputTopic<byte[], byte[]>> outputs = new java.util.HashMap<>();
java.util.Map<String, TestInputTopic<byte[], byte[]>> inputs = new java.util.HashMap<>();
for (String topic : topics) {
outputs.put(
topic,
driver.createOutputTopic(
topic, new ByteArrayDeserializer(), new ByteArrayDeserializer()));
inputs.put(
topic,
driver.createInputTopic(topic, new ByteArraySerializer(), new ByteArraySerializer()));
}
for (int round = 0; round < MAX_ROUND_TRIPS; round++) {
boolean progressed = false;
for (String topic : topics) {
TestOutputTopic<byte[], byte[]> out = outputs.get(topic);
List<TestRecord<byte[], byte[]>> records = out.readRecordsToList();
if (records.isEmpty()) {
continue;
}
progressed = true;
TestInputTopic<byte[], byte[]> in = inputs.get(topic);
for (TestRecord<byte[], byte[]> record : records) {
in.pipeInput(record);
}
}
if (!progressed) {
return;
}
}
throw new IllegalStateException(
"Internal topics did not reach quiescence after " + MAX_ROUND_TRIPS + " round trips");
} |
||
|
|
||
| /** Kafka Streams config for a {@link TopologyTestDriver} built from the pipeline's app id. */ | ||
| public static Properties streamsConfig(Pipeline pipeline) { | ||
| String applicationId = | ||
| pipeline.getOptions().as(KafkaStreamsPipelineOptions.class).getApplicationId(); | ||
| Properties props = new Properties(); | ||
| props.put(StreamsConfig.APPLICATION_ID_CONFIG, applicationId); | ||
| props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); | ||
| props.put( | ||
| StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName()); | ||
| props.put( | ||
| StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName()); | ||
| return props; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The iteration order of `subtopology.nodes()` (which returns a `Set`) is not guaranteed to be deterministic. If a topology contains multiple leaf processors (processors with no successors), `leafProcessorName` will return whichever leaf it encounters first, which could lead to non-deterministic test failures. Collecting all leaf processors and asserting that exactly one exists makes the test harness more robust and deterministic.