From df994ace9195f7e8b7cb8cf5dea0ff75b35ebed3 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 27 May 2026 15:58:11 +0300 Subject: [PATCH 1/3] Refactor SLO workload for ydb-slo-action v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the new SLO Action's one-process-one-container model and push client-side latency aggregates instead of histograms. - metrics.py: compute p50/p95/p99 per push window via HdrHistogram and emit them as gauges; reset the histogram after each push so the values represent the last window only. Read ref/workload from the new WORKLOAD_REF/WORKLOAD_NAME env vars (with legacy REF/WORKLOAD fallback). Resolve the OTLP endpoint from the standard OTEL_EXPORTER_OTLP_* env vars before falling back to --otlp-endpoint. - docker-entrypoint.sh: tiny wrapper invoked by ydb-slo-action v2 that picks the *-create / *-run subcommand from WORKLOAD_NAME and forwards tuning flags from workload_current_command to *-run. - Dockerfile: install the entrypoint script; keep gcc/libc6-dev for the hdrhistogram sdist build. - slo.yml: use ydb-platform/ydb-slo-action/init@v2 with the workload matrix, fix workload_baseline_image (was pointing at current). - slo-report.yml: align with the Java SDK workflow — drop the JS-based label removal in favour of `gh pr edit --remove-label`. - requirements.txt: pin hdrhistogram for the percentile computation. --- .github/workflows/slo-report.yml | 43 ++---- .github/workflows/slo.yml | 228 ++++++------------------------- tests/slo/Dockerfile | 53 +++---- tests/slo/docker-entrypoint.sh | 40 ++++++ tests/slo/requirements.txt | 2 +- tests/slo/src/core/metrics.py | 214 +++++++++++++---------------- 6 files changed, 220 insertions(+), 360 deletions(-) create mode 100755 tests/slo/docker-entrypoint.sh diff --git a/.github/workflows/slo-report.yml b/.github/workflows/slo-report.yml index 07231d3bf..cd02c703f 100644 --- a/.github/workflows/slo-report.yml +++ b/.github/workflows/slo-report.yml @@ -7,7 +7,8 @@ on: - completed jobs: - ydb-slo-action-report: + publish-slo-report: + if: github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: @@ -16,42 +17,24 @@ jobs: pull-requests: write steps: - name: Publish YDB SLO Report - uses: ydb-platform/ydb-slo-action/report@13c687b7d4b2879da79dd12932dee0ed2b65dd1c + uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} github_run_id: ${{ github.event.workflow_run.id }} remove-slo-label: - if: always() && github.event.workflow_run.event == 'pull_request' - name: Remove SLO Label - needs: ydb-slo-action-report + if: github.event.workflow_run.event == 'pull_request' runs-on: ubuntu-latest + name: Remove SLO Label permissions: pull-requests: write steps: - name: Remove SLO label from PR - uses: actions/github-script@v7 - with: - script: | - const pullRequests = context.payload.workflow_run.pull_requests; - if (pullRequests && pullRequests.length > 0) { - for (const pr of pullRequests) { - try { - await github.rest.issues.removeLabel({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pr.number, - name: 'SLO' - }); - console.log(`Removed SLO label from PR #${pr.number}`); - } catch (error) { - if (error.status === 404) { - console.log(`SLO label not found on PR #${pr.number}, skipping`); - } else { - throw error; - } - } - } - } else { - console.log('No pull requests associated with this workflow run'); - } + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} + REPO: ${{ github.event.workflow_run.repository.full_name }} + run: | + set -euo pipefail + PR=$(jq -r '.[0].number' <<<"$PRS") + gh pr edit "$PR" --repo "$REPO" --remove-label SLO diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index fbbf66cc1..1669768b3 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -3,28 +3,6 @@ name: SLO on: pull_request: types: [opened, reopened, synchronize, labeled] - branches: - - main - workflow_dispatch: - inputs: - github_issue: - description: "GitHub issue / PR number where the SLO report will be posted (optional; will be inferred for PR runs)" - required: false - baseline_ref: - description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)" - required: false - slo_workload_read_max_rps: - description: "Maximum read RPS for the SLO workload" - required: false - default: "1000" - slo_workload_write_max_rps: - description: "Maximum write RPS for the SLO workload" - required: false - default: "100" - slo_workload_duration_seconds: - description: "Duration of the SLO workload in seconds" - required: false - default: "600" permissions: contents: read @@ -33,233 +11,109 @@ permissions: jobs: ydb-slo-action: + if: contains(github.event.pull_request.labels.*.name, 'SLO') + name: Run YDB SLO Tests runs-on: "large-runner-python-sdk" - # Run on PRs only when labeled "SLO"; allow manual runs via workflow_dispatch - if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'SLO') }} - strategy: fail-fast: false matrix: - include: - - id: sync-table - prefix: table - workload: sync-table - - id: sync-query - prefix: table - workload: sync-query + sdk: + - name: sync-table + command: "--read-rps 1000 --write-rps 100" + - name: sync-query + command: "--read-rps 1000 --write-rps 100" concurrency: - group: slo-${{ github.ref }}-${{ matrix.workload }} + group: slo-${{ github.ref }}-${{ matrix.sdk.name }} cancel-in-progress: true steps: - name: Install dependencies run: | + set -euxo pipefail YQ_VERSION=v4.48.2 BUILDX_VERSION=0.30.1 COMPOSE_VERSION=2.40.3 - sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \ - sudo chmod +x /usr/local/bin/yq + sudo curl -fLo /usr/local/bin/yq \ + "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" + sudo chmod +x /usr/local/bin/yq - echo "Updating Docker plugins..." sudo mkdir -p /usr/local/lib/docker/cli-plugins - echo "Installing Docker Buildx ${BUILDX_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \ "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx - echo "Installing Docker Compose ${COMPOSE_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \ "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose - echo "Installed versions:" yq --version docker --version docker buildx version docker compose version - - name: Checkout current version + - name: Checkout current SDK version uses: actions/checkout@v5 with: - path: current + path: sdk-current fetch-depth: 0 - name: Determine baseline commit id: baseline - shell: bash + working-directory: sdk-current run: | - cd current - if [[ -n "${{ inputs.baseline_ref }}" ]]; then - BASELINE="${{ inputs.baseline_ref }}" - else - BASELINE=$(git merge-base HEAD origin/main) - fi - echo "sha=$BASELINE" >> $GITHUB_OUTPUT + set -euo pipefail + BASELINE=$(git merge-base HEAD origin/main) + echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT" - # Try to determine a human-readable ref name for baseline - # Check if baseline is on main - if git merge-base --is-ancestor $BASELINE origin/main && \ - [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then - BASELINE_REF="main" + if git merge-base --is-ancestor "${BASELINE}" origin/main && \ + [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then + BASELINE_REF="main" else - # Try to find a branch containing this commit - BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "") - if [ -n "$BRANCH" ]; then + BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "") + if [ -n "${BRANCH}" ]; then BASELINE_REF="${BRANCH}@${BASELINE:0:7}" else BASELINE_REF="${BASELINE:0:7}" fi fi - echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT + echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT" - - name: Checkout baseline version + - name: Checkout baseline SDK version uses: actions/checkout@v5 with: ref: ${{ steps.baseline.outputs.sha }} - path: baseline + path: sdk-baseline fetch-depth: 1 - - name: Show Docker versions - run: | - docker --version - docker compose version - - name: Build workload images (current + baseline) run: | + set -euxo pipefail docker build \ - -f "$GITHUB_WORKSPACE/current/tests/slo/Dockerfile" \ + -f "$GITHUB_WORKSPACE/sdk-current/tests/slo/Dockerfile" \ -t "ydb-app-current" \ - "$GITHUB_WORKSPACE/current" + "$GITHUB_WORKSPACE/sdk-current" docker build \ - -f "$GITHUB_WORKSPACE/baseline/tests/slo/Dockerfile" \ + -f "$GITHUB_WORKSPACE/sdk-baseline/tests/slo/Dockerfile" \ -t "ydb-app-baseline" \ - "$GITHUB_WORKSPACE/baseline" + "$GITHUB_WORKSPACE/sdk-baseline" - - name: Initialize YDB SLO - id: ydb_slo - uses: ydb-platform/ydb-slo-action/init@13c687b7d4b2879da79dd12932dee0ed2b65dd1c + - name: Run SLO Tests + uses: ydb-platform/ydb-slo-action/init@v2 + timeout-minutes: 30 with: - github_issue: ${{ github.event.pull_request.number || inputs.github_issue }} + github_issue: ${{ github.event.pull_request.number }} github_token: ${{ secrets.GITHUB_TOKEN }} - workload_name: ydb-python-${{ matrix.workload }} + workload_name: ${{ matrix.sdk.name }} + workload_duration: "600" workload_current_ref: ${{ github.head_ref || github.ref_name }} + workload_current_image: ydb-app-current + workload_current_command: ${{ matrix.sdk.command }} workload_baseline_ref: ${{ steps.baseline.outputs.ref }} - - - name: Prepare SLO Database - run: | - docker run --rm \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - -e "WORKLOAD=${{ matrix.workload }}" \ - -e "REF=${{ github.head_ref || github.ref_name }}" \ - ydb-app-current \ - ${{ matrix.prefix }}-create grpc://ydb:2136 /Root/testdb - - - name: Run SLO Tests (current + baseline in parallel) - timeout-minutes: 15 - env: - WORKLOAD: ${{ matrix.workload }} - DURATION: ${{ inputs.slo_workload_duration_seconds || 600 }} - READ_RPS: ${{ inputs.slo_workload_read_max_rps || 1000 }} - WRITE_RPS: ${{ inputs.slo_workload_write_max_rps || 100 }} - CURRENT_REF: ${{ github.head_ref || github.ref_name }} - BASELINE_REF: ${{ steps.baseline.outputs.ref }} - run: | - ARGS="${{ matrix.prefix }}-run grpc://ydb:2136 /Root/testdb \ - --otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics \ - --report-period 250 \ - --time ${DURATION} \ - --read-rps ${READ_RPS} \ - --write-rps ${WRITE_RPS} \ - --read-timeout 1000 \ - --write-timeout 1000" - - echo "Starting current workload (ref=${CURRENT_REF}, workload=${WORKLOAD})..." - docker run -d \ - --name ydb-app-current \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - -e "REF=${CURRENT_REF}" \ - -e "WORKLOAD=${WORKLOAD}" \ - ydb-app-current \ - $ARGS - - echo "Starting baseline workload (ref=${BASELINE_REF}, workload=${WORKLOAD})..." - docker run -d \ - --name ydb-app-baseline \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - -e "REF=${BASELINE_REF}" \ - -e "WORKLOAD=${WORKLOAD}" \ - ydb-app-baseline \ - $ARGS - - echo "" - echo "==================== INITIAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== INITIAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - echo "Waiting for workloads to complete (${DURATION}s)..." - sleep ${DURATION} - - echo "Stopping containers after ${DURATION}s..." - docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true - - # Force kill if still running - docker kill ydb-app-current ydb-app-baseline 2>&1 || true - - # Check exit codes - CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1") - BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "1") - - echo "Current exit code: ${CURRENT_EXIT}" - echo "Baseline exit code: ${BASELINE_EXIT}" - - echo "" - echo "==================== FINAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== FINAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - if [[ "${CURRENT_EXIT}" != "0" || "${BASELINE_EXIT}" != "0" ]]; then - echo "One or both workloads failed." - exit 0 - fi - - echo "SUCCESS: Workloads completed successfully" - - - if: always() - name: Store logs - run: | - docker logs ydb-app-current > current.log 2>&1 || echo "No current container" > current.log - docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container" > baseline.log - - - if: always() - name: Upload logs - uses: actions/upload-artifact@v4 - with: - name: ydb-python-${{ matrix.workload }}-logs - path: | - ./current.log - ./baseline.log - retention-days: 1 + workload_baseline_image: ydb-app-baseline + workload_baseline_command: ${{ matrix.sdk.command }} diff --git a/tests/slo/Dockerfile b/tests/slo/Dockerfile index 4f18a2b3f..b45d26111 100644 --- a/tests/slo/Dockerfile +++ b/tests/slo/Dockerfile @@ -1,38 +1,45 @@ # syntax=docker/dockerfile:1 -# This image packages the Python SLO workload runner. -# It expects to be run with arguments like: -# docker run --rm table-run --otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics ... +# This image packages the Python SLO workload runner for ydb-slo-action v2. +# +# The action launches the same image for `current` and `baseline` workloads +# in parallel and injects these env vars (see tests/slo/docker-entrypoint.sh): +# +# WORKLOAD_NAME sync-table | sync-query | topic +# WORKLOAD_REF "current" / "main" — used as the `ref` metric label +# WORKLOAD_DURATION seconds +# YDB_ENDPOINT grpc://ydb:2136 +# YDB_DATABASE /Root/testdb +# OTEL_EXPORTER_OTLP_ENDPOINT / _METRICS_ENDPOINT +# +# `workload_current_command` from the action is appended to the `*-run` call +# as tuning flags (--read-rps, --write-rps, ...). # # Notes: # - OpenTelemetry 1.39.x requires Python >= 3.9. -# - The entrypoint is `python ./tests/slo/src`, i.e. it runs the `__main__.py` -# from that directory (same as `python tests/slo/src ...` in CI). +# - gcc/libc6-dev are needed to build hdrhistogram from sdist. -FROM python:3.11-slim AS build +FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 -WORKDIR /src -COPY . /src - -# Install runtime deps into an isolated venv so we can copy it into the final stage. -RUN python -m venv /opt/venv \ - && /opt/venv/bin/python -m pip install --no-cache-dir --upgrade pip \ - && /opt/venv/bin/pip install --no-cache-dir . \ - && /opt/venv/bin/pip install --no-cache-dir -r tests/slo/requirements.txt - +RUN apt-get update && apt-get install -y --no-install-recommends gcc libc6-dev && rm -rf /var/lib/apt/lists/* -FROM python:3.11-slim +WORKDIR /src -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PATH="/opt/venv/bin:${PATH}" +# 1. YDB SDK +COPY setup.py pyproject.toml README.md requirements.txt ./ +COPY ydb/ ydb/ +RUN pip install --no-cache-dir . -WORKDIR /app +# 2. SLO deps +COPY tests/slo/requirements.txt tests/slo/requirements.txt +RUN pip install --no-cache-dir -r tests/slo/requirements.txt -COPY --from=build /opt/venv /opt/venv -COPY --from=build /src/tests/slo/src /app/tests/slo/src +# 3. Workload source + entrypoint +COPY tests/slo/src /src/tests/slo/src +COPY tests/slo/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +RUN chmod +x /usr/local/bin/docker-entrypoint.sh -ENTRYPOINT ["python", "./tests/slo/src"] +ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] diff --git a/tests/slo/docker-entrypoint.sh b/tests/slo/docker-entrypoint.sh new file mode 100755 index 000000000..9b84b45dc --- /dev/null +++ b/tests/slo/docker-entrypoint.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# Workload entrypoint used by ydb-slo-action v2. +# +# The action launches the same image for both `current` and `baseline` workloads +# in parallel; both must be able to schema-prepare and then run. +# +# Inputs come from the env vars injected by the action: +# WORKLOAD_NAME sync-table | sync-query | topic +# WORKLOAD_DURATION run duration in seconds +# YDB_ENDPOINT grpc://ydb:2136 +# YDB_DATABASE /Root/testdb +# +# Anything passed after the script name is appended to the `*-run` command — +# this is how tuning flags from `workload_current_command` (e.g. --read-rps) +# reach the workload. + +set -e + +case "${WORKLOAD_NAME:-sync-query}" in + sync-table|sync-query) PREFIX=table ;; + topic) PREFIX=topic ;; + *) + echo "Unknown WORKLOAD_NAME: ${WORKLOAD_NAME}" >&2 + exit 1 + ;; +esac + +ENDPOINT="${YDB_ENDPOINT:-grpc://localhost:2136}" +DATABASE="${YDB_DATABASE:-/local}" +DURATION="${WORKLOAD_DURATION:-600}" + +# Schema prep is idempotent at the SDK level for topics; for tables, a parallel +# baseline container may race and fail with "already exists" — tolerate it. +python ./tests/slo/src "${PREFIX}-create" "$ENDPOINT" "$DATABASE" \ + || echo "WARN: ${PREFIX}-create exited non-zero (treated as already-prepared)" >&2 + +exec python ./tests/slo/src \ + "${PREFIX}-run" "$ENDPOINT" "$DATABASE" \ + --time "$DURATION" \ + "$@" diff --git a/tests/slo/requirements.txt b/tests/slo/requirements.txt index cd5cdfe1a..d9021cc5c 100644 --- a/tests/slo/requirements.txt +++ b/tests/slo/requirements.txt @@ -1,6 +1,6 @@ requests==2.33.0 aiolimiter==1.1.0 -quantile-estimator==0.1.2 +hdrhistogram==0.10.3 # OpenTelemetry (OTLP/HTTP exporter) # NOTE: OpenTelemetry 1.39.1 requires Python >= 3.9. diff --git a/tests/slo/src/core/metrics.py b/tests/slo/src/core/metrics.py index bff90eda8..51d648443 100644 --- a/tests/slo/src/core/metrics.py +++ b/tests/slo/src/core/metrics.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +import threading import time from abc import ABC, abstractmethod from collections.abc import Iterable @@ -10,36 +11,25 @@ from typing import Any, Optional, Tuple OP_TYPE_READ, OP_TYPE_WRITE = "read", "write" -OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err" +OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "error" -REF = environ.get("REF", "main") -WORKLOAD = environ.get("WORKLOAD", "sync-query") +REF = environ.get("WORKLOAD_REF") or environ.get("REF") or "main" +WORKLOAD = environ.get("WORKLOAD_NAME") or environ.get("WORKLOAD") or "sync-query" logger = logging.getLogger(__name__) def _normalize_labels(labels: Any) -> Tuple[Any, ...]: - """ - Convert labels into a tuple of label values. - - Important: - - `str` is an Iterable, but for our purposes it must be treated as a single label value. - """ if labels is None: return tuple() - if isinstance(labels, str): return (labels,) - if isinstance(labels, tuple): return labels - if isinstance(labels, list): return tuple(labels) - if isinstance(labels, Iterable): return tuple(labels) - return (labels,) @@ -101,30 +91,29 @@ def push(self) -> None: class OtlpMetrics(BaseMetrics): """ - Canonical OpenTelemetry metrics implementation. + Exports metrics via OTLP/HTTP to a Prometheus endpoint with OTLP receiver enabled. - This exports metrics via OTLP/HTTP to a Prometheus server with OTLP receiver enabled: - POST http(s)://:/api/v1/otlp/v1/metrics - - Naming notes: - - Metric names follow OpenTelemetry conventions (dot-separated namespaces, e.g. `sdk.operations.total`). - - Prometheus OTLP translation typically converts dots to underscores and may add suffixes like - `_total` for counters and `_bucket/_sum/_count` for histograms. + Latency percentiles (p50/p95/p99) are computed client-side per push window via + HdrHistogram and emitted as gauges; counters are emitted via OTel counters. + Histogram is reset after each push so each sample represents the last window only. """ + _HDR_MIN_US = 1 + _HDR_MAX_US = 60_000_000 # 60s + _HDR_SIG_FIGS = 3 + _PERCENTILES = (("p50", 50.0), ("p95", 95.0), ("p99", 99.0)) + def __init__(self, otlp_metrics_endpoint: str): + from hdrh.histogram import HdrHistogram from opentelemetry.exporter.otlp.proto.http.metric_exporter import ( OTLPMetricExporter, ) from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader - from opentelemetry.sdk.metrics.view import ( - ExplicitBucketHistogramAggregation, - View, - ) from opentelemetry.sdk.resources import Resource - # Resource attributes: Prometheus maps service.name -> job, service.instance.id -> instance. + self._HdrHistogram = HdrHistogram + resource = Resource.create( { "service.name": f"workload-{WORKLOAD}", @@ -138,79 +127,59 @@ def __init__(self, otlp_metrics_endpoint: str): ) exporter = OTLPMetricExporter(endpoint=otlp_metrics_endpoint) - reader = PeriodicExportingMetricReader(exporter) # we force_flush() explicitly in push() - - latency_view = View( - instrument_name="sdk.operation.latency", - aggregation=ExplicitBucketHistogramAggregation( - boundaries=( - 0.001, - 0.002, - 0.003, - 0.004, - 0.005, - 0.0075, - 0.010, - 0.020, - 0.050, - 0.100, - 0.200, - 0.500, - 1.000, - ) - ), - ) - - self._provider = MeterProvider( - resource=resource, - metric_readers=[reader], - views=[latency_view], - ) + reader = PeriodicExportingMetricReader(exporter) + self._provider = MeterProvider(resource=resource, metric_readers=[reader]) self._meter = self._provider.get_meter("ydb-slo") - # Instruments (sync) self._errors = self._meter.create_counter( name="sdk.errors.total", description="Total number of errors encountered, categorized by error type.", ) self._operations_total = self._meter.create_counter( name="sdk.operations.total", - description="Total number of operations, categorized by type attempted by the SDK.", + description="Total number of operations attempted by the SDK.", ) self._operations_success_total = self._meter.create_counter( name="sdk.operations.success.total", - description="Total number of successful operations, categorized by type.", + description="Total number of successful operations.", ) self._operations_failure_total = self._meter.create_counter( name="sdk.operations.failure.total", - description="Total number of failed operations, categorized by type.", + description="Total number of failed operations.", ) - self._latency = self._meter.create_histogram( - name="sdk.operation.latency", - unit="s", - description="Latency of operations performed by the SDK in seconds, categorized by type and status.", + self._retry_attempts_total = self._meter.create_counter( + name="sdk.retry.attempts.total", + description="Total number of retry attempts.", ) - self._pending = self._meter.create_up_down_counter( name="sdk.pending.operations", - description="Current number of pending operations, categorized by type.", + description="Current number of pending operations.", ) + self._latency_gauges = { + name: self._meter.create_gauge( + name=f"sdk.operation.latency.{name}.seconds", + unit="s", + description=f"Operation latency {name} computed over the last push window.", + ) + for name, _ in self._PERCENTILES + } - self._retry_attempts_total = self._meter.create_counter( - name="sdk.retry.attempts.total", - description="Total number of retry attempts, categorized by ref and operation type.", - ) + self._lock = threading.Lock() + self._hdr: dict = {} - self.reset() + def _get_hdr(self, op_type: str, op_status: str): + key = (op_type, op_status) + hist = self._hdr.get(key) + if hist is None: + hist = self._HdrHistogram(self._HDR_MIN_US, self._HDR_MAX_US, self._HDR_SIG_FIGS) + self._hdr[key] = hist + return hist def start(self, labels) -> float: labels_t = _normalize_labels(labels) self._pending.add( 1, - attributes={ - "ref": REF, - "operation_type": labels_t[0], - }, + attributes={"ref": REF, "operation_type": labels_t[0]}, ) return time.time() @@ -223,74 +192,81 @@ def stop( ) -> None: labels_t = _normalize_labels(labels) duration = time.time() - start_time + duration_us = min(max(int(duration * 1_000_000), self._HDR_MIN_US), self._HDR_MAX_US) op_type = labels_t[0] - base_attrs = { - "ref": REF, - "operation_type": op_type, - } + op_status = OP_STATUS_SUCCESS if error is None else OP_STATUS_FAILURE + base_attrs = {"ref": REF, "operation_type": op_type} + op_attrs = {**base_attrs, "operation_status": op_status} - # Update instruments self._retry_attempts_total.add(int(attempts), attributes=base_attrs) self._pending.add(-1, attributes=base_attrs) - - # Counters + latency - self._operations_total.add(1, attributes=base_attrs) + self._operations_total.add(1, attributes=op_attrs) if error is not None: - self._errors.add( - 1, - attributes={ - **base_attrs, - "error_type": type(error).__name__, - }, - ) + self._errors.add(1, attributes={**base_attrs, "error_type": type(error).__name__}) self._operations_failure_total.add(1, attributes=base_attrs) - self._latency.record( - duration, - attributes={ - **base_attrs, - "operation_status": OP_STATUS_FAILURE, - }, - ) - return - - self._operations_success_total.add(1, attributes=base_attrs) - self._latency.record( - duration, - attributes={ - **base_attrs, - "operation_status": OP_STATUS_SUCCESS, - }, - ) + else: + self._operations_success_total.add(1, attributes=base_attrs) + + with self._lock: + self._get_hdr(op_type, op_status).record_value(duration_us) def push(self) -> None: - # Metrics job calls push() with the cadence of --report-period. - # force_flush() makes the exporter send immediately. + with self._lock: + for (op_type, op_status), hist in self._hdr.items(): + if hist.get_total_count() == 0: + continue + attrs = {"ref": REF, "operation_type": op_type, "operation_status": op_status} + for name, percentile in self._PERCENTILES: + value_s = hist.get_value_at_percentile(percentile) / 1_000_000 + self._latency_gauges[name].set(value_s, attributes=attrs) + for hist in self._hdr.values(): + hist.reset() self._provider.force_flush() def reset(self) -> None: - # OpenTelemetry counters/histograms are cumulative and cannot be reset. - # Reset is implemented as an immediate push/flush. - self.push() + with self._lock: + for hist in self._hdr.values(): + hist.reset() + self._provider.force_flush() -def create_metrics(otlp_endpoint: Optional[str]) -> BaseMetrics: +def _resolve_metrics_endpoint(cli_endpoint: Optional[str]) -> str: """ - Factory used by SLO runners. + Resolution order: + 1. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT (used as-is) + 2. OTEL_EXPORTER_OTLP_ENDPOINT + /v1/metrics suffix + 3. CLI --otlp-endpoint + """ + metrics_env = environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "").strip() + if metrics_env: + return metrics_env + + base_env = environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "").strip() + if base_env: + base = base_env.rstrip("/") + if base.endswith("/v1/metrics"): + return base + return f"{base}/v1/metrics" - Metrics are enabled if either: - - OTLP_ENDPOINT env var is set, or - - `--otlp-endpoint` is provided (and non-empty) + return (cli_endpoint or "").strip() + + +def create_metrics(otlp_endpoint: Optional[str]) -> BaseMetrics: + """ + Build a metrics exporter. - If endpoint is empty, metrics are disabled (DummyMetrics). + Metrics are enabled if a non-empty endpoint can be derived from either the + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT / OTEL_EXPORTER_OTLP_ENDPOINT env vars + or the explicit `--otlp-endpoint` CLI flag. Otherwise DummyMetrics is used. """ - endpoint = (environ.get("OTLP_ENDPOINT") or (otlp_endpoint or "")).strip() + endpoint = _resolve_metrics_endpoint(otlp_endpoint) if not endpoint: logger.info("Creating dummy metrics (metrics disabled)") return DummyMetrics() - logger.info("Creating OTLP metrics exporter to Prometheus: %s", endpoint) + logger.info("Creating OTLP metrics exporter to: %s", endpoint) try: return OtlpMetrics(endpoint) except Exception: From 8d60d4dd80cac46aed99dd28aac663b53a529dfa Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Thu, 28 May 2026 12:36:47 +0300 Subject: [PATCH 2/3] inline slo post actions --- .github/workflows/slo-report.yml | 40 -------------------------------- .github/workflows/slo.yml | 33 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 40 deletions(-) delete mode 100644 .github/workflows/slo-report.yml diff --git a/.github/workflows/slo-report.yml b/.github/workflows/slo-report.yml deleted file mode 100644 index cd02c703f..000000000 --- a/.github/workflows/slo-report.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: slo-report - -on: - workflow_run: - workflows: ["SLO"] - types: - - completed - -jobs: - publish-slo-report: - if: github.event.workflow_run.conclusion == 'success' - runs-on: ubuntu-latest - name: Publish YDB SLO Report - permissions: - checks: write - contents: read - pull-requests: write - steps: - - name: Publish YDB SLO Report - uses: ydb-platform/ydb-slo-action/report@v2 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - github_run_id: ${{ github.event.workflow_run.id }} - - remove-slo-label: - if: github.event.workflow_run.event == 'pull_request' - runs-on: ubuntu-latest - name: Remove SLO Label - permissions: - pull-requests: write - steps: - - name: Remove SLO label from PR - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} - REPO: ${{ github.event.workflow_run.repository.full_name }} - run: | - set -euo pipefail - PR=$(jq -r '.[0].number' <<<"$PRS") - gh pr edit "$PR" --repo "$REPO" --remove-label SLO diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 1669768b3..f4bfd29d7 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -117,3 +117,36 @@ jobs: workload_baseline_ref: ${{ steps.baseline.outputs.ref }} workload_baseline_image: ydb-app-baseline workload_baseline_command: ${{ matrix.sdk.command }} + + publish-slo-report: + if: always() + name: Publish YDB SLO Report + needs: ydb-slo-action + runs-on: ubuntu-latest + permissions: + checks: write + contents: read + pull-requests: write + steps: + - name: Publish YDB SLO Report + uses: ydb-platform/ydb-slo-action/report@v2 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + github_run_id: ${{ github.event.workflow_run.id }} + + remove-slo-label: + name: Remove SLO Label + needs: ydb-slo-action + runs-on: ubuntu-latest + permissions: + pull-requests: write + steps: + - name: Remove SLO label from PR + env: + PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} + REPO: ${{ github.event.workflow_run.repository.full_name }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + PR=$(jq -r '.[0].number' <<<"$PRS") + gh pr edit "$PR" --repo "$REPO" --remove-label SLO From 3963d63ca59c95e4c27df580708ca8f7d73fcb90 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Thu, 28 May 2026 17:06:04 +0300 Subject: [PATCH 3/3] Build baseline workload from current Dockerfile + tests/slo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v2 SLO action runs both current and baseline images with the same command. The baseline checkout is on main, which still has the old `python ./tests/slo/src` entrypoint that expects a subcommand as the first arg — so passing tuning flags directly (e.g. --read-rps 1000) fails with 'argument subcommand: invalid choice'. Mirror the Java SDK setup: build the baseline image with the current PR's Dockerfile + tests/slo runner against the baseline ydb library. That way the runner-side contract (entrypoint script, metrics format, env vars) is identical for both images and only the SDK under test differs. Also fix the inline report/label-removal jobs to use pull_request context (github.run_id, github.event.pull_request.number) instead of workflow_run fields that are undefined for this trigger. --- .github/workflows/slo.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index f4bfd29d7..90ac4508b 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -93,11 +93,19 @@ jobs: - name: Build workload images (current + baseline) run: | set -euxo pipefail + + # Build current: SDK + workload runner both from this PR. docker build \ -f "$GITHUB_WORKSPACE/sdk-current/tests/slo/Dockerfile" \ -t "ydb-app-current" \ "$GITHUB_WORKSPACE/sdk-current" + # Build baseline: baseline SDK with the current workload runner + # (Dockerfile + tests/slo/), so the runner-side contract changes + # (entrypoint, metrics format) apply uniformly to both images. + rm -rf "$GITHUB_WORKSPACE/sdk-baseline/tests/slo" + cp -r "$GITHUB_WORKSPACE/sdk-current/tests/slo" \ + "$GITHUB_WORKSPACE/sdk-baseline/tests/slo" docker build \ -f "$GITHUB_WORKSPACE/sdk-baseline/tests/slo/Dockerfile" \ -t "ydb-app-baseline" \ @@ -119,7 +127,6 @@ jobs: workload_baseline_command: ${{ matrix.sdk.command }} publish-slo-report: - if: always() name: Publish YDB SLO Report needs: ydb-slo-action runs-on: ubuntu-latest @@ -132,9 +139,10 @@ jobs: uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} - github_run_id: ${{ github.event.workflow_run.id }} + github_run_id: ${{ github.run_id }} remove-slo-label: + if: always() name: Remove SLO Label needs: ydb-slo-action runs-on: ubuntu-latest @@ -143,10 +151,9 @@ jobs: steps: - name: Remove SLO label from PR env: - PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} - REPO: ${{ github.event.workflow_run.repository.full_name }} + PR: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -euo pipefail - PR=$(jq -r '.[0].number' <<<"$PRS") gh pr edit "$PR" --repo "$REPO" --remove-label SLO