diff --git a/dev/archery/archery/benchmark/jmh.py b/dev/archery/archery/benchmark/jmh.py new file mode 100644 index 00000000000..f531b6de163 --- /dev/null +++ b/dev/archery/archery/benchmark/jmh.py @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from itertools import filterfalse, groupby, tee +import json +import subprocess +from tempfile import NamedTemporaryFile + +from .core import Benchmark +from ..utils.command import Command +from ..utils.maven import Maven + + +def partition(pred, iterable): + # adapted from python's examples + t1, t2 = tee(iterable) + return list(filter(pred, t1)), list(filterfalse(pred, t2)) + + +class JavaMicrobenchmarkHarnessCommand(Command): + """ Run a Java Micro Benchmark Harness + + This assumes the binary supports the standard command line options, + notably `-Dbenchmark.filter` + """ + + def __init__(self, build, benchmark_filter=None): + self.benchmark_filter = benchmark_filter + self.build = build + self.maven = Maven() + + """ Extract benchmark names from output between "Benchmarks:" and "[INFO]". + Assume the following output: + ... + Benchmarks: + org.apache.arrow.vector.IntBenchmarks.setIntDirectly + ... 
+ org.apache.arrow.vector.IntBenchmarks.setWithValueHolder + org.apache.arrow.vector.IntBenchmarks.setWithWriter + ... + [INFO] + """ + + def list_benchmarks(self): + argv = [] + if self.benchmark_filter: + argv.append("-Dbenchmark.filter={}".format(self.benchmark_filter)) + result = self.build.list( + *argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + lists = [] + benchmarks = False + for line in str.splitlines(result.stdout.decode("utf-8")): + if not benchmarks: + if line.startswith("Benchmarks:"): + benchmarks = True + else: + if line.startswith("org.apache.arrow"): + lists.append(line) + if line.startswith("[INFO]"): + break + return lists + + def results(self, repetitions): + with NamedTemporaryFile(suffix=".json") as out: + argv = ["-Dbenchmark.runs={}".format(repetitions), + "-Dbenchmark.resultfile={}".format(out.name), + "-Dbenchmark.resultformat=json"] + if self.benchmark_filter: + argv.append( + "-Dbenchmark.filter={}".format(self.benchmark_filter) + ) + + self.build.benchmark(*argv, check=True) + return json.load(out) + + +class JavaMicrobenchmarkHarnessObservation: + """ Represents one run of a single Java Microbenchmark Harness + """ + + def __init__(self, benchmark, primaryMetric, + forks, warmupIterations, measurementIterations, **counters): + self.name = benchmark + self.primaryMetric = primaryMetric + self.score = primaryMetric["score"] + self.score_unit = primaryMetric["scoreUnit"] + self.forks = forks + self.warmups = warmupIterations + self.runs = measurementIterations + self.counters = { + "mode": counters["mode"], + "threads": counters["threads"], + "warmups": warmupIterations, + "warmupTime": counters["warmupTime"], + "measurements": measurementIterations, + "measurementTime": counters["measurementTime"], + "jvmArgs": counters["jvmArgs"] + } + self.reciprocal_value = True if self.score_unit.endswith( + "/op") else False + if self.score_unit.startswith("ops/"): + idx = self.score_unit.find("/") + 
self.normalizePerSec(self.score_unit[idx+1:]) + elif self.score_unit.endswith("/op"): + idx = self.score_unit.find("/") + self.normalizePerSec(self.score_unit[:idx]) + else: + self.normalizeFactor = 1 + + @property + def value(self): + """ Return the benchmark value.""" + val = 1 / self.score if self.reciprocal_value else self.score + return val * self.normalizeFactor + + def normalizePerSec(self, unit): + if unit == "ns": + self.normalizeFactor = 1000 * 1000 * 1000 + elif unit == "us": + self.normalizeFactor = 1000 * 1000 + elif unit == "ms": + self.normalizeFactor = 1000 + elif unit == "min": + self.normalizeFactor = 1 / 60 + elif unit == "hr": + self.normalizeFactor = 1 / (60 * 60) + elif unit == "day": + self.normalizeFactor = 1 / (60 * 60 * 24) + else: + self.normalizeFactor = 1 + + @property + def unit(self): + if self.score_unit.startswith("ops/"): + return "items_per_second" + elif self.score_unit.endswith("/op"): + return "items_per_second" + else: + return "?" + + def __repr__(self): + return str(self.value) + + +class JavaMicrobenchmarkHarness(Benchmark): + """ A set of JavaMicrobenchmarkHarnessObservations. """ + + def __init__(self, name, runs): + """ Initialize a JavaMicrobenchmarkHarness. + + Parameters + ---------- + name: str + Name of the benchmark + forks: int + warmups: int + runs: int + runs: list(JavaMicrobenchmarkHarnessObservation) + Repetitions of JavaMicrobenchmarkHarnessObservation run. 
+ + """ + self.name = name + self.runs = sorted(runs, key=lambda b: b.value) + unit = self.runs[0].unit + time_unit = "N/A" + less_is_better = not unit.endswith("per_second") + values = [b.value for b in self.runs] + times = [] + # Slight kludge to extract the UserCounters for each benchmark + counters = self.runs[0].counters + super().__init__(name, unit, less_is_better, values, time_unit, times, + counters) + + def __repr__(self): + return "JavaMicrobenchmark[name={},runs={}]".format( + self.name, self.runs) + + @classmethod + def from_json(cls, payload): + def group_key(x): + return x.name + + benchmarks = map( + lambda x: JavaMicrobenchmarkHarnessObservation(**x), payload) + groups = groupby(sorted(benchmarks, key=group_key), group_key) + return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 5718bcaf108..fc6d354b180 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -22,8 +22,11 @@ from .core import BenchmarkSuite from .google import GoogleBenchmarkCommand, GoogleBenchmark +from .jmh import JavaMicrobenchmarkHarnessCommand, JavaMicrobenchmarkHarness from ..lang.cpp import CppCMakeDefinition, CppConfiguration +from ..lang.java import JavaMavenDefinition, JavaConfiguration from ..utils.cmake import CMakeBuild +from ..utils.maven import MavenBuild from ..utils.logger import logger @@ -50,40 +53,8 @@ def suites(self): @staticmethod def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): - """ Returns a BenchmarkRunner from a path or a git revision. - - First, it checks if `rev_or_path` is a valid path (or string) of a json - object that can deserialize to a BenchmarkRunner. If so, it initialize - a StaticBenchmarkRunner from it. This allows memoizing the result of a - run in a file or a string. - - Second, it checks if `rev_or_path` points to a valid CMake build - directory. 
If so, it creates a CppBenchmarkRunner with this existing - CMakeBuild. - - Otherwise, it assumes `rev_or_path` is a revision and clone/checkout - the given revision and create a fresh CMakeBuild. - """ - build = None - if StaticBenchmarkRunner.is_json_result(rev_or_path): - return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) - elif CMakeBuild.is_build_dir(rev_or_path): - build = CMakeBuild.from_path(rev_or_path) - return CppBenchmarkRunner(build, **kwargs) - else: - # Revisions can references remote via the `/` character, ensure - # that the revision is path friendly - path_rev = rev_or_path.replace("/", "_") - root_rev = os.path.join(root, path_rev) - os.mkdir(root_rev) - - clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to - # perform cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) - cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) - build_dir = os.path.join(root_rev, "build") - return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs) + raise NotImplementedError( + "BenchmarkRunner must implement from_rev_or_path") class StaticBenchmarkRunner(BenchmarkRunner): @@ -210,3 +181,133 @@ def suites(self): continue yield suite + + @staticmethod + def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): + """ Returns a BenchmarkRunner from a path or a git revision. + + First, it checks if `rev_or_path` is a valid path (or string) of a json + object that can deserialize to a BenchmarkRunner. If so, it initialize + a StaticBenchmarkRunner from it. This allows memoizing the result of a + run in a file or a string. + + Second, it checks if `rev_or_path` points to a valid CMake build + directory. If so, it creates a CppBenchmarkRunner with this existing + CMakeBuild. + + Otherwise, it assumes `rev_or_path` is a revision and clone/checkout + the given revision and create a fresh CMakeBuild. 
+ """ + build = None + if StaticBenchmarkRunner.is_json_result(rev_or_path): + return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) + elif CMakeBuild.is_build_dir(rev_or_path): + build = CMakeBuild.from_path(rev_or_path) + return CppBenchmarkRunner(build, **kwargs) + else: + # Revisions can references remote via the `/` character, ensure + # that the revision is path friendly + path_rev = rev_or_path.replace("/", "_") + root_rev = os.path.join(root, path_rev) + os.mkdir(root_rev) + + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to + # perform cleanup on cloned repository as root_rev is reclaimed. + src_rev, _ = src.at_revision(rev_or_path, clone_dir) + cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) + build_dir = os.path.join(root_rev, "build") + return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs) + + +class JavaBenchmarkRunner(BenchmarkRunner): + """ Run suites for Java. """ + + # default repetitions is 5 for Java microbenchmark harness + def __init__(self, build, **kwargs): + """ Initialize a JavaBenchmarkRunner. """ + self.build = build + super().__init__(**kwargs) + + @staticmethod + def default_configuration(**kwargs): + """ Returns the default benchmark configuration. """ + return JavaConfiguration(**kwargs) + + def suite(self, name): + """ Returns the resulting benchmarks for a given suite. 
""" + # update .m2 directory, which installs target jars + self.build.build() + + suite_cmd = JavaMicrobenchmarkHarnessCommand( + self.build, self.benchmark_filter) + + # Ensure there will be data + benchmark_names = suite_cmd.list_benchmarks() + if not benchmark_names: + return None + + results = suite_cmd.results(repetitions=self.repetitions) + benchmarks = JavaMicrobenchmarkHarness.from_json(results) + return BenchmarkSuite(name, benchmarks) + + @property + def list_benchmarks(self): + """ Yields all benchmark names """ + # Ensure build is up-to-date to run benchmarks + self.build.build() + + suite_cmd = JavaMicrobenchmarkHarnessCommand(self.build) + benchmark_names = suite_cmd.list_benchmarks() + for benchmark_name in benchmark_names: + yield "{}".format(benchmark_name) + + @property + def suites(self): + """ Yields all suites for a runner. """ + suite_name = "JavaBenchmark" + suite = self.suite(suite_name) + + # Filter may exclude all benchmarks + if not suite: + logger.debug("Suite {} executed but no results" + .format(suite_name)) + return + + yield suite + + @staticmethod + def from_rev_or_path(src, root, rev_or_path, maven_conf, **kwargs): + """ Returns a BenchmarkRunner from a path or a git revision. + + First, it checks if `rev_or_path` is a valid path (or string) of a json + object that can deserialize to a BenchmarkRunner. If so, it initializes + a StaticBenchmarkRunner from it. This allows memoizing the result of a + run in a file or a string. + + Second, it checks if `rev_or_path` points to a valid Maven build + directory. If so, it creates a JavaBenchmarkRunner with this existing + MavenBuild. + + Otherwise, it assumes `rev_or_path` is a revision, clones/checks out + the given revision and creates a fresh MavenBuild. 
+ """ + if StaticBenchmarkRunner.is_json_result(rev_or_path): + return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) + elif MavenBuild.is_build_dir(rev_or_path): + maven_def = JavaMavenDefinition(rev_or_path, maven_conf) + return JavaBenchmarkRunner(maven_def.build(rev_or_path), **kwargs) + else: + # Revisions can references remote via the `/` character, ensure + # that the revision is path friendly + path_rev = rev_or_path.replace("/", "_") + root_rev = os.path.join(root, path_rev) + os.mkdir(root_rev) + + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to + # perform cleanup on cloned repository as root_rev is reclaimed. + src_rev, _ = src.at_revision(rev_or_path, clone_dir) + maven_def = JavaMavenDefinition(src_rev.java, maven_conf) + build_dir = os.path.join(root_rev, "arrow/java") + return JavaBenchmarkRunner(maven_def.build(build_dir), **kwargs) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index bcaddf1c795..06dd6b60370 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -27,7 +27,7 @@ from .benchmark.codec import JsonEncoder from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD -from .benchmark.runner import BenchmarkRunner, CppBenchmarkRunner +from .benchmark.runner import CppBenchmarkRunner, JavaBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.lint import linter, python_numpydoc, LintValidationException from .utils.logger import logger, ctx as log_ctx @@ -120,6 +120,15 @@ def cpp_toolchain_options(cmd): return _apply_options(cmd, options) +def java_toolchain_options(cmd): + options = [ + click.option("--java-home", metavar="", + help="Path to Java Developers Kit."), + click.option("--java-options", help="java compiler options."), + ] + return _apply_options(cmd, options) + + def _apply_options(cmd, options): for option in options: cmd = option(cmd) @@ -132,6 +141,7 @@ def _apply_options(cmd, 
options): help="Specify Arrow source directory") # toolchain @cpp_toolchain_options +@java_toolchain_options @click.option("--build-type", default=None, type=build_type, help="CMake's CMAKE_BUILD_TYPE") @click.option("--warn-level", default="production", type=warn_level_type, @@ -357,6 +367,11 @@ def benchmark(ctx): def benchmark_common_options(cmd): + def check_language(ctx, param, value): + if value not in {"cpp", "java"}: + raise click.BadParameter("cpp or java is supported now") + return value + options = [ click.option("--src", metavar="", show_default=True, default=None, callback=validate_arrow_sources, @@ -367,11 +382,21 @@ def benchmark_common_options(cmd): click.option("--output", metavar="", type=click.File("w", encoding="utf8"), default="-", help="Capture output result into file."), + click.option("--language", metavar="", type=str, default="cpp", + show_default=True, callback=check_language, + help="Specify target language for the benchmark"), + click.option("--build-extras", type=str, multiple=True, + help="Extra flags/options to pass to mvn build. " + "Can be stacked. For language=java"), + click.option("--benchmark-extras", type=str, multiple=True, + help="Extra flags/options to pass to mvn benchmark. " + "Can be stacked. For language=java"), click.option("--cmake-extras", type=str, multiple=True, help="Extra flags/options to pass to cmake invocation. " - "Can be stacked"), + "Can be stacked. For language=cpp") ] + cmd = java_toolchain_options(cmd) cmd = cpp_toolchain_options(cmd) return _apply_options(cmd, options) @@ -392,19 +417,33 @@ def benchmark_filter_options(cmd): @click.argument("rev_or_path", metavar="[]", default="WORKSPACE", required=False) @benchmark_common_options +@benchmark_filter_options @click.pass_context def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, - **kwargs): + java_home, java_options, build_extras, benchmark_extras, + language, **kwargs): """ List benchmark suite. 
""" with tmpdir(preserve=preserve) as root: logger.debug("Running benchmark {}".format(rev_or_path)) - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + if language == "cpp": + conf = CppBenchmarkRunner.default_configuration( + cmake_extras=cmake_extras, **kwargs) + + runner_base = CppBenchmarkRunner.from_rev_or_path( + src, root, rev_or_path, conf) + + elif language == "java": + for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + del kwargs[key] + conf = JavaBenchmarkRunner.default_configuration( + java_home=java_home, java_options=java_options, + build_extras=build_extras, benchmark_extras=benchmark_extras, + **kwargs) - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf) + runner_base = JavaBenchmarkRunner.from_rev_or_path( + src, root, rev_or_path, conf) for b in runner_base.list_benchmarks: click.echo(b, file=output) @@ -415,12 +454,15 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, default="WORKSPACE", required=False) @benchmark_common_options @benchmark_filter_options -@click.option("--repetitions", type=int, default=1, show_default=True, +@click.option("--repetitions", type=int, default=-1, help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision.")) + "may improve result precision. " + "[default: 1 for cpp, 5 for java")) @click.pass_context def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, - suite_filter, benchmark_filter, repetitions, **kwargs): + java_home, java_options, build_extras, benchmark_extras, + language, suite_filter, benchmark_filter, repetitions, + **kwargs): """ Run benchmark suite. This command will run the benchmark suite for a single build. 
This is @@ -456,13 +498,29 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, with tmpdir(preserve=preserve) as root: logger.debug("Running benchmark {}".format(rev_or_path)) - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) - - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf, - repetitions=repetitions, - suite_filter=suite_filter, benchmark_filter=benchmark_filter) + if language == "cpp": + conf = CppBenchmarkRunner.default_configuration( + cmake_extras=cmake_extras, **kwargs) + + repetitions = repetitions if repetitions != -1 else 1 + runner_base = CppBenchmarkRunner.from_rev_or_path( + src, root, rev_or_path, conf, + repetitions=repetitions, + suite_filter=suite_filter, benchmark_filter=benchmark_filter) + + elif language == "java": + for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + del kwargs[key] + conf = JavaBenchmarkRunner.default_configuration( + java_home=java_home, java_options=java_options, + build_extras=build_extras, benchmark_extras=benchmark_extras, + **kwargs) + + repetitions = repetitions if repetitions != -1 else 5 + runner_base = JavaBenchmarkRunner.from_rev_or_path( + src, root, rev_or_path, conf, + repetitions=repetitions, + benchmark_filter=benchmark_filter) json.dump(runner_base, output, cls=JsonEncoder) @@ -475,7 +533,8 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, help="Regression failure threshold in percentage.") @click.option("--repetitions", type=int, default=1, show_default=True, help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision.")) + "may improve result precision. 
" + "[default: 1 for cpp, 5 for java")) @click.option("--no-counters", type=BOOL, default=False, is_flag=True, help="Hide counters field in diff report.") @click.argument("contender", metavar="[", @@ -483,8 +542,9 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, @click.argument("baseline", metavar="[]]", default="origin/master", required=False) @click.pass_context -def benchmark_diff(ctx, src, preserve, output, cmake_extras, +def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, suite_filter, benchmark_filter, repetitions, no_counters, + java_home, java_options, build_extras, benchmark_extras, threshold, contender, baseline, **kwargs): """Compare (diff) benchmark runs. @@ -560,26 +620,47 @@ def benchmark_diff(ctx, src, preserve, output, cmake_extras, logger.debug("Comparing {} (contender) with {} (baseline)" .format(contender, baseline)) - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) - - runner_cont = BenchmarkRunner.from_rev_or_path( - src, root, contender, conf, - repetitions=repetitions, - suite_filter=suite_filter, - benchmark_filter=benchmark_filter) - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, baseline, conf, - repetitions=repetitions, - suite_filter=suite_filter, - benchmark_filter=benchmark_filter) + if language == "cpp": + conf = CppBenchmarkRunner.default_configuration( + cmake_extras=cmake_extras, **kwargs) + + repetitions = repetitions if repetitions != -1 else 1 + runner_cont = CppBenchmarkRunner.from_rev_or_path( + src, root, contender, conf, + repetitions=repetitions, + suite_filter=suite_filter, + benchmark_filter=benchmark_filter) + runner_base = CppBenchmarkRunner.from_rev_or_path( + src, root, baseline, conf, + repetitions=repetitions, + suite_filter=suite_filter, + benchmark_filter=benchmark_filter) + + elif language == "java": + for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + del kwargs[key] + conf = 
JavaBenchmarkRunner.default_configuration( + java_home=java_home, java_options=java_options, + build_extras=build_extras, benchmark_extras=benchmark_extras, + **kwargs) + + repetitions = repetitions if repetitions != -1 else 5 + runner_cont = JavaBenchmarkRunner.from_rev_or_path( + src, root, contender, conf, + repetitions=repetitions, + benchmark_filter=benchmark_filter) + runner_base = JavaBenchmarkRunner.from_rev_or_path( + src, root, baseline, conf, + repetitions=repetitions, + benchmark_filter=benchmark_filter) runner_comp = RunnerComparator(runner_cont, runner_base, threshold) # TODO(kszucs): test that the output is properly formatted jsonlines comparisons_json = _get_comparisons_as_json(runner_comp.comparisons) + ren_counters = language == "java" formatted = _format_comparisons_with_pandas(comparisons_json, - no_counters) + no_counters, ren_counters) output.write(formatted) output.write('\n') @@ -593,7 +674,8 @@ def _get_comparisons_as_json(comparisons): return buf.getvalue() -def _format_comparisons_with_pandas(comparisons_json, no_counters): +def _format_comparisons_with_pandas(comparisons_json, no_counters, + ren_counters): import pandas as pd df = pd.read_json(StringIO(comparisons_json), lines=True) # parse change % so we can sort by it @@ -604,7 +686,10 @@ def _format_comparisons_with_pandas(comparisons_json, no_counters): if not no_counters: fields += ['counters'] - df = df[fields].sort_values(by='change %', ascending=False) + df = df[fields] + if ren_counters: + df = df.rename(columns={'counters': 'configurations'}) + df = df.sort_values(by='change %', ascending=False) def labelled(title, df): if len(df) == 0: diff --git a/dev/archery/archery/lang/java.py b/dev/archery/archery/lang/java.py index 24743b67fd7..bc169adf647 100644 --- a/dev/archery/archery/lang/java.py +++ b/dev/archery/archery/lang/java.py @@ -15,7 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+import os + from ..utils.command import Command, CommandStackMixin, default_bin +from ..utils.maven import MavenDefinition class Java(Command): @@ -28,3 +31,47 @@ def __init__(self, jar, *args, **kwargs): self.jar = jar self.argv = ("-jar", jar) Java.__init__(self, *args, **kwargs) + + +class JavaConfiguration: + def __init__(self, + + # toolchain + java_home=None, java_options=None, + # build & benchmark + build_extras=None, benchmark_extras=None): + self.java_home = java_home + self.java_options = java_options + + self.build_extras = list(build_extras) if build_extras else [] + self.benchmark_extras = list( + benchmark_extras) if benchmark_extras else [] + + @property + def build_definitions(self): + return self.build_extras + + @property + def benchmark_definitions(self): + return self.benchmark_extras + + @property + def environment(self): + env = os.environ.copy() + + if self.java_home: + env["JAVA_HOME"] = self.java_home + + if self.java_options: + env["JAVA_OPTIONS"] = self.java_options + + return env + + +class JavaMavenDefinition(MavenDefinition): + def __init__(self, source, conf, **kwargs): + self.configuration = conf + super().__init__(source, **kwargs, + build_definitions=conf.build_definitions, + benchmark_definitions=conf.benchmark_definitions, + env=conf.environment) diff --git a/dev/archery/archery/utils/maven.py b/dev/archery/archery/utils/maven.py new file mode 100644 index 00000000000..96a3bf5bd99 --- /dev/null +++ b/dev/archery/archery/utils/maven.py @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .command import Command, default_bin + + +class Maven(Command): + def __init__(self, maven_bin=None): + self.bin = default_bin(maven_bin, "mvn") + + +maven = Maven() + + +class MavenDefinition: + """ MavenDefinition captures the maven invocation arguments. + + It allows creating build directories with the same definition, e.g. + ``` + build_1 = maven_def.build("/tmp/build-1") + build_2 = maven_def.build("/tmp/build-2") + + ... + + build_1.install() + build_2.install() + """ + + def __init__(self, source, build_definitions=None, + benchmark_definitions=None, env=None): + """ Initialize a MavenDefinition + + Parameters + ---------- + source : str + Source directory where the top-level pom.xml is + located. This is usually the root of the project. + build_definitions: list(str), optional + benchmark_definitions: list(str), optional + """ + self.source = os.path.abspath(source) + self.build_definitions = build_definitions if build_definitions else [] + self.benchmark_definitions =\ + benchmark_definitions if benchmark_definitions else [] + self.env = env + + @property + def build_arguments(self): + """ Return the arguments to maven invocation for build. """ + arguments = self.build_definitions + [ + "-B", "-DskipTests", "-Drat.skip=true", + "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer." + "Slf4jMavenTransferListener=warn", + "-T", "2C", "install" + ] + return arguments + + def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): + """ Invoke maven into a build directory. 
+ + Parameters + ---------- + build_dir : str + Directory in which the Maven build will be instantiated. + force : bool + Currently unused. + """ + if os.path.exists(build_dir): + # Extra safety: an existing directory must already be a Maven project. + if not MavenBuild.is_build_dir(build_dir): + raise FileExistsError( + "{} is not a maven build".format(build_dir) + ) + + cmd_kwargs = cmd_kwargs if cmd_kwargs else {} + assert MavenBuild.is_build_dir(build_dir) + maven(*self.build_arguments, cwd=build_dir, env=self.env, **cmd_kwargs) + return MavenBuild(build_dir, definition=self, **kwargs) + + @property + def list_arguments(self): + """ Return the arguments to maven invocation for list """ + arguments = [ + "-Dskip.perf.benchmarks=false", "-Dbenchmark.list=-lp", "install" + ] + return arguments + + @property + def benchmark_arguments(self): + """ Return the arguments to maven invocation for benchmark """ + arguments = self.benchmark_definitions + [ + "-Dskip.perf.benchmarks=false", "-Dbenchmark.fork=1", + "-Dbenchmark.jvmargs=\"-Darrow.enable_null_check_for_get=false " + "-Darrow.enable_unsafe_memory_access=true\"", + "install" + ] + return arguments + + def __repr__(self): + return "MavenDefinition[source={}]".format(self.source) + + +class MavenBuild(Maven): + """ MavenBuild represents a build directory initialized by maven. + + The build instance can be used to build the project and to list or run + its benchmarks from the build directory. + """ + + def __init__(self, build_dir, definition=None): + """ Initialize a MavenBuild. + + The caller must ensure that maven was invoked in the build directory. + + Parameters + ---------- + definition : MavenDefinition + The definition to build from. + build_dir : str + The build directory to setup into. 
+ """ + assert MavenBuild.is_build_dir(build_dir) + super().__init__() + self.build_dir = os.path.abspath(build_dir) + self.definition = definition + + @property + def binaries_dir(self): + return self.build_dir + + def run(self, *argv, verbose=False, cwd=None, **kwargs): + extra = [] + if verbose: + extra.append("-X") + if cwd is None: + cwd = self.build_dir + # Commands must be run from the directory where pom.xml exists + return super().run(*extra, *argv, **kwargs, cwd=cwd) + + def build(self, *argv, verbose=False, **kwargs): + definition_args = self.definition.build_arguments + cwd = self.binaries_dir + return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, + env=self.definition.env, **kwargs) + + def list(self, *argv, verbose=False, **kwargs): + definition_args = self.definition.list_arguments + cwd = self.binaries_dir + "/performance" + return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, + env=self.definition.env, **kwargs) + + def benchmark(self, *argv, verbose=False, **kwargs): + definition_args = self.definition.benchmark_arguments + cwd = self.binaries_dir + "/performance" + return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, + env=self.definition.env, **kwargs) + + @staticmethod + def is_build_dir(path): + """ Indicate if a path is a Maven top-level directory. + + This method only checks for the existence of paths and does not do any + validation whatsoever. + """ + pom_xml = os.path.join(path, "pom.xml") + performance_dir = os.path.join(path, "performance") + return os.path.exists(pom_xml) and os.path.isdir(performance_dir) + + @staticmethod + def from_path(path): + """ Instantiate a MavenBuild from a path. + + This is used to recover from an existing physical directory (created + with or without Maven). + + Note that the original definition is lost: the returned build has + `definition=None`. 
+ """ + if not MavenBuild.is_build_dir(path): + raise ValueError("Not a valid MavenBuild path: {}".format(path)) + + return MavenBuild(path, definition=None) + + def __repr__(self): + return ("MavenBuild[" + "build = {}," + "definition = {}]".format(self.build_dir, + self.definition)) diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index d30b4f152e5..f7e47a5a1b6 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -68,6 +68,11 @@ def dev(self): """ Returns the dev directory of an Arrow sources. """ return self.path / "dev" + @property + def java(self): + """ Returns the java directory of an Arrow sources. """ + return self.path / "java" + @property def python(self): """ Returns the python directory of an Arrow sources. """ diff --git a/java/performance/pom.xml b/java/performance/pom.xml index dffe7f2cbd2..81e268d9769 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -99,8 +99,12 @@ true .* 1 + 5 5 + + jmh-result.json + json @@ -169,10 +173,17 @@ ${benchmark.filter} -f ${benchmark.forks} + -jvmArgs + ${benchmark.jvmargs} -wi ${benchmark.warmups} -i ${benchmark.runs} + ${benchmark.list} + -rff + ${benchmark.resultfile} + -rf + ${benchmark.resultformat}