2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-08-31 06:25:31 +00:00

chg: test: Add stress tests with DoH and DoT

Validation pipeline: https://gitlab.isc.org/isc-projects/bind9/-/pipelines/160984

Prerequisites:
- [x] isc-private/devops!11
- [x] https://gitlab.isc.org/isc-projects/bind9-qa/-/merge_requests/9

Things to consider:
- FreeBSD DoH jobs are not added because Flamethrower queries always time out.
- This adds 15 more CI jobs:
  - Linux (AWS autoscaler): `(auth + recursive + RPZ) * (DoH + DoT) * (amd64 + arm64) = 12`
  - FreeBSD (one FreeBSD runner): `(auth + recursive + RPZ) * (DoT) * (amd64) = 3`
- Autoscaler is not yet present on FreeBSD. Adding 3 CI jobs (i.e., DoT) that run serially adds 3 hours to the pipeline runtime. Should we add just one FreeBSD DoT job to limit the runtime?
- DoH/DoT performance is slightly lower than pure TCP, so the threshold for the test to pass must be lowered by 5-10% (see isc-private/bind-qa!40).

Merge branch 'mnowak/stress-test-with-doh-dot' into 'main'

See merge request isc-projects/bind9!5800
This commit is contained in:
Michal Nowak
2025-01-27 20:09:49 +00:00
2 changed files with 158 additions and 273 deletions

View File

@@ -51,10 +51,6 @@ variables:
# cross-testrun files as there is no need to use that feature in CI.
PYTEST_ADDOPTS: "-p no:cacheprovider"
# Default platforms to run "stress" tests on
BIND_STRESS_TEST_OS: linux
BIND_STRESS_TEST_ARCH: amd64
HYPOTHESIS_PROFILE: "ci"
default:
@@ -1699,279 +1695,29 @@ shotgun:doh-get:
.stress-test: &stress_test
stage: performance
generate-stress-test-configs:
<<: *base_image
<<: *default_triggering_rules
stage: precheck
script:
- *configure
- *setup_interfaces
- make -j${BUILD_PARALLEL_JOBS:-1} -k all V=1
- make DESTDIR="${INSTALL_PATH}" install
- git clone --depth 1 https://gitlab.isc.org/isc-projects/bind9-qa.git
- cd bind9-qa/stress
- LD_LIBRARY_PATH="${INSTALL_PATH}/usr/local/lib" BIND_INSTALL_PATH="${INSTALL_PATH}/usr/local" WORKSPACE="${CI_PROJECT_DIR}" bash stress.sh
- util/generate-stress-test-configs.py > stress-test-configs.yml
artifacts:
paths:
- stress-test-configs.yml
needs: []
stress-test-child-pipeline:
<<: *default_triggering_rules
stage: performance
trigger:
include:
- artifact: stress-test-configs.yml
job: generate-stress-test-configs
needs:
- job: autoreconf
- job: generate-stress-test-configs
artifacts: true
.stress-test-long: &stress_test_long_job
<<: *stress_test
artifacts:
untracked: true
exclude:
- "output/ns4/*.dtq*"
- "output/ns4/large-delta-rpz*.local"
- "output/rpz_*"
expire_in: "1 week"
when: always
timeout: 2h
.stress-test-short: &stress_test_short_job
<<: *stress_test
only:
- merge_requests
artifacts:
untracked: true
exclude:
- "output/ns4/*.dtq*"
- "output/ns4/large-delta-rpz*.local"
- "output/rpz_*"
when: always
stress:short:authoritative:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 15
stress:short:recursive:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 15
stress:short:rpz:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 15
stress:short:authoritative:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 15
stress:short:recursive:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 15
stress:short:rpz:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_short_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 15
stress:short:authoritative:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_short_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 15
stress:short:recursive:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_short_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 15
stress:short:rpz:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_short_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 15
stress:authoritative:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /authoritative/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
stress:recursive:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /recursive/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
stress:rpz:fedora:41:amd64:
<<: *fedora_41_amd64_image
<<: *linux_amd64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /rpz/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
stress:authoritative:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /authoritative/i && $BIND_STRESS_TEST_ARCH =~ /arm64/i)
stress:recursive:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /recursive/i && $BIND_STRESS_TEST_ARCH =~ /arm64/i)
stress:rpz:fedora:41:arm64:
<<: *fedora_41_arm64_image
<<: *linux_arm64
<<: *stress_test_long_job
variables:
CC: gcc
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /linux/i && $BIND_STRESS_TEST_MODE =~ /rpz/i && $BIND_STRESS_TEST_ARCH =~ /arm64/i)
stress:authoritative:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_long_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: authoritative
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /freebsd/i && $BIND_STRESS_TEST_MODE =~ /authoritative/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
stress:recursive:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_long_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: recursive
RATE: 10000
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /freebsd/i && $BIND_STRESS_TEST_MODE =~ /recursive/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
stress:rpz:freebsd13:amd64:
<<: *freebsd_stress_amd64
<<: *stress_test_long_job
variables:
CC: clang
CFLAGS: "${CFLAGS_COMMON} -Og"
FLAME: /usr/local/bin/flame
MODE: rpz
RATE: 1500
RUN_TIME: 60
only:
variables:
- $CI_COMMIT_TAG || ($BIND_STRESS_TEST_OS =~ /freebsd/i && $BIND_STRESS_TEST_MODE =~ /rpz/i && $BIND_STRESS_TEST_ARCH =~ /amd64/i)
# git fsck operates over the whole repository, so it is sufficient to schedule
# it in only one branch, preferably "main". GitLab's clone strategy prevents us
# from using the "bind9" repo clone; we need to clone it ourselves.

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# Copyright (C) Internet Systems Consortium, Inc. ("ISC")
#
# SPDX-License-Identifier: MPL-2.0
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, you can obtain one at https://mozilla.org/MPL/2.0/.
#
# See the COPYRIGHT file distributed with this work for additional
# information regarding copyright ownership.
import itertools
import os
import random
import yaml
# Parse this repository's CI configuration so that the generated jobs can reuse
# its hidden templates (".configure", ".setup_interfaces" and the per-platform
# entries), which are looked up below by their top-level key.
#
# safe_load() is used on purpose: the file is expected to contain only plain
# YAML (scalars, sequences, mappings, anchors, aliases and merge keys), so the
# full Loader, which can construct arbitrary Python objects from tagged nodes,
# is not needed.
with open(".gitlab-ci.yml", encoding="utf-8") as gitlab_ci_yml:
    anchors = yaml.safe_load(gitlab_ci_yml)
# Mandatory environment variables; a missing one aborts the script with a
# KeyError.
ci_pipeline_source = os.environ["CI_PIPELINE_SOURCE"]
install_path = os.environ["INSTALL_PATH"]
project_directory = os.environ["CI_PROJECT_DIR"]

# Optional environment variables; None (or the given default) when unset.
all_bind_stress_tests = os.environ.get("ALL_BIND_STRESS_TESTS")
build_parallel_jobs = os.environ.get("BUILD_PARALLEL_JOBS", "1")
cflags_common = os.environ.get("CFLAGS_COMMON", "")
ci_commit_tag = os.environ.get("CI_COMMIT_TAG")

# Optional overrides for default test parameters
env_traffic_rate = os.environ.get("BIND_STRESS_TESTS_RATE")
env_run_time = os.environ.get("BIND_STRESS_TESTS_RUN_TIME")

# Tags and scheduled pipelines produce longer jobs and always run the full
# test matrix; everything else gets the short variant.
is_long_run = bool(ci_commit_tag) or ci_pipeline_source == "schedule"
if is_long_run:
    all_bind_stress_tests = True

# scenario name, default run time (passed to the jobs as RUN_TIME), and
# artifact expiration
scenario, default_runtime, expire_in = (
    ("long", 60, "1 week") if is_long_run else ("short", 15, "1 day")
)
# The three axes of the stress test matrix. Platform entries are top-level keys
# that are looked up in the parsed .gitlab-ci.yml.
ALL_MODES = ("recursive", "authoritative", "rpz")
ALL_PROTOCOLS = ("tcp", "doh", "dot")
ALL_PLATFORMS = (".fedora-41-amd64", ".fedora-41-arm64", ".freebsd-stress-amd64")

# The full "modes x protocols x platforms" matrix is used when either the
# ALL_BIND_STRESS_TESTS or the CI_COMMIT_TAG environment variable is set.
# Otherwise, only two randomly chosen items out of three are kept on each axis
# to make the matrix smaller.
run_full_matrix = all_bind_stress_tests is not None or ci_commit_tag is not None
if run_full_matrix:
    modes, protocols, platforms = ALL_MODES, ALL_PROTOCOLS, ALL_PLATFORMS
else:
    # The generator is consumed in order, so the axes are sampled as modes,
    # then protocols, then platforms.
    modes, protocols, platforms = (
        random.sample(axis, k=2)
        for axis in (ALL_MODES, ALL_PROTOCOLS, ALL_PLATFORMS)
    )
# Build one job definition per (mode, protocol, platform) combination and
# print the resulting mapping as YAML on standard output.
jobs = {}

for mode, protocol, platform in itertools.product(modes, protocols, platforms):
    on_freebsd = "freebsd" in platform

    # Flamethrower-produced DoH queries on FreeBSD always time out, so no
    # DoH-on-FreeBSD job is generated.
    if on_freebsd and protocol == "doh":
        continue

    # Platform-specific job name suffix, compiler, and Flamethrower location.
    if on_freebsd:
        job_platform = "freebsd:amd64"
        compiler_binary = "clang"
        flame_binary = "/usr/local/bin/flame"
    else:
        job_platform = "linux:amd64" if "amd64" in platform else "linux:arm64"
        compiler_binary = "gcc"
        flame_binary = "/usr/bin/flame"

    # RPZ jobs default to a lower query rate than the other modes; values from
    # the environment take precedence over the defaults.
    default_traffic_rate = 1500 if mode == "rpz" else 10000
    traffic_rate = int(env_traffic_rate or default_traffic_rate)
    runtime = int(env_run_time or default_runtime)

    # DoT jobs get a lower expected response rate than the other protocols.
    expected_tcp_response_rate = 80 if protocol == "dot" else 90

    job_variables = {
        "CC": compiler_binary,
        "CFLAGS": f"{cflags_common} -Og",
        "EXPECTED_TCP_RESPONSE_RATE": expected_tcp_response_rate,
        "FLAME": flame_binary,
        "MODE": mode,
        "PROTOCOL": f"{protocol} udp",
        "RATE": traffic_rate,
        "RUN_TIME": runtime,
    }

    # Build and install BIND 9, then run the stress test from bind9-qa against
    # that installation. The configure and interface setup steps are taken
    # from the parsed .gitlab-ci.yml.
    job_script = [
        "autoreconf -fi",
        *anchors[".configure"],
        *anchors[".setup_interfaces"],
        f"make -j{build_parallel_jobs} -k all V=1",
        f'make DESTDIR="{install_path}" install',
        "git clone --depth 1 https://gitlab.isc.org/isc-projects/bind9-qa.git",
        "cd bind9-qa/stress",
        f'export LD_LIBRARY_PATH="{install_path}/usr/local/lib"',
        f'export BIND_INSTALL_PATH="{install_path}/usr/local"',
        f'export WORKSPACE="{project_directory}"',
        "bash stress.sh",
    ]

    job_artifacts = {
        "untracked": True,
        "when": "always",
        "expire_in": expire_in,
        "exclude": [
            "output/ns4/*.dtq*",
            "output/ns4/large-delta-rpz*.local",
            "output/rpz_*",
        ],
    }

    job_definition = {
        "stage": "test",
        "variables": job_variables,
        "script": job_script,
        "rules": [{"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}],
        # The job timeout is the test run time plus 30 minutes.
        "timeout": f"{runtime + 30} minutes",
        "artifacts": job_artifacts,
    }

    # Merge in the platform template from .gitlab-ci.yml; any key it defines
    # replaces the same-named key set above.
    job_definition.update(anchors[platform])

    jobs[f"stress:{scenario}:{mode}:{protocol}+udp:{job_platform}"] = job_definition

print(yaml.dump(jobs, Dumper=yaml.Dumper))