From df994ace9195f7e8b7cb8cf5dea0ff75b35ebed3 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 27 May 2026 15:58:11 +0300
Subject: [PATCH 1/3] Refactor SLO workload for ydb-slo-action v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch to the new SLO Action's one-process-one-container model and
push client-side latency aggregates instead of histograms.

- metrics.py: compute p50/p95/p99 per push window via HdrHistogram and
  emit them as gauges; reset the histogram after each push so the values
  represent the last window only. Read ref/workload from the new
  WORKLOAD_REF/WORKLOAD_NAME env vars (with legacy REF/WORKLOAD fallback).
  Resolve the OTLP endpoint from the standard OTEL_EXPORTER_OTLP_*
  env vars before falling back to --otlp-endpoint.
- docker-entrypoint.sh: tiny wrapper invoked by ydb-slo-action v2 that
  picks the *-create / *-run subcommand from WORKLOAD_NAME and forwards
  tuning flags from workload_current_command to *-run.
- Dockerfile: install the entrypoint script; keep gcc/libc6-dev for the
  hdrhistogram sdist build.
- slo.yml: use ydb-platform/ydb-slo-action/init@v2 with the workload
  matrix, fix workload_baseline_image (was pointing at current).
- slo-report.yml: align with the Java SDK workflow — drop the JS-based
  label removal in favour of `gh pr edit --remove-label`.
- requirements.txt: pin hdrhistogram for the percentile computation.
---
 .github/workflows/slo-report.yml |  43 ++----
 .github/workflows/slo.yml        | 228 ++++++-------------------------
 tests/slo/Dockerfile             |  53 +++----
 tests/slo/docker-entrypoint.sh   |  40 ++++++
 tests/slo/requirements.txt       |   2 +-
 tests/slo/src/core/metrics.py    | 214 +++++++++++++----------------
 6 files changed, 220 insertions(+), 360 deletions(-)
 create mode 100755 tests/slo/docker-entrypoint.sh

diff --git a/.github/workflows/slo-report.yml b/.github/workflows/slo-report.yml
index 07231d3bf..cd02c703f 100644
--- a/.github/workflows/slo-report.yml
+++ b/.github/workflows/slo-report.yml
@@ -7,7 +7,8 @@ on:
       - completed
 
 jobs:
-  ydb-slo-action-report:
+  publish-slo-report:
+    if: github.event.workflow_run.conclusion == 'success'
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
@@ -16,42 +17,24 @@ jobs:
       pull-requests: write
     steps:
       - name: Publish YDB SLO Report
-        uses: ydb-platform/ydb-slo-action/report@13c687b7d4b2879da79dd12932dee0ed2b65dd1c
+        uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_run_id: ${{ github.event.workflow_run.id }}
 
   remove-slo-label:
-    if: always() && github.event.workflow_run.event == 'pull_request'
-    name: Remove SLO Label
-    needs: ydb-slo-action-report
+    if: github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-latest
+    name: Remove SLO Label
     permissions:
       pull-requests: write
     steps:
       - name: Remove SLO label from PR
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const pullRequests = context.payload.workflow_run.pull_requests;
-            if (pullRequests && pullRequests.length > 0) {
-              for (const pr of pullRequests) {
-                try {
-                  await github.rest.issues.removeLabel({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: pr.number,
-                    name: 'SLO'
-                  });
-                  console.log(`Removed SLO label from PR #${pr.number}`);
-                } catch (error) {
-                  if (error.status === 404) {
-                    console.log(`SLO label not found on PR #${pr.number}, skipping`);
-                  } else {
-                    throw error;
-                  }
-                }
-              }
-            } else {
-              console.log('No pull requests associated with this workflow run');
-            }
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          REPO: ${{ github.event.workflow_run.repository.full_name }}
+        run: |
+          set -euo pipefail
+          PR=$(jq -r '.[0].number' <<<"$PRS")
+          gh pr edit "$PR" --repo "$REPO" --remove-label SLO
diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index fbbf66cc1..1669768b3 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -3,28 +3,6 @@ name: SLO
 on:
   pull_request:
     types: [opened, reopened, synchronize, labeled]
-    branches:
-      - main
-  workflow_dispatch:
-    inputs:
-      github_issue:
-        description: "GitHub issue / PR number where the SLO report will be posted (optional; will be inferred for PR runs)"
-        required: false
-      baseline_ref:
-        description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)"
-        required: false
-      slo_workload_read_max_rps:
-        description: "Maximum read RPS for the SLO workload"
-        required: false
-        default: "1000"
-      slo_workload_write_max_rps:
-        description: "Maximum write RPS for the SLO workload"
-        required: false
-        default: "100"
-      slo_workload_duration_seconds:
-        description: "Duration of the SLO workload in seconds"
-        required: false
-        default: "600"
 
 permissions:
   contents: read
@@ -33,233 +11,109 @@ permissions:
 
 jobs:
   ydb-slo-action:
+    if: contains(github.event.pull_request.labels.*.name, 'SLO')
+
     name: Run YDB SLO Tests
     runs-on: "large-runner-python-sdk"
 
-    # Run on PRs only when labeled "SLO"; allow manual runs via workflow_dispatch
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'SLO') }}
-
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - id: sync-table
-            prefix: table
-            workload: sync-table
-          - id: sync-query
-            prefix: table
-            workload: sync-query
+        sdk:
+          - name: sync-table
+            command: "--read-rps 1000 --write-rps 100"
+          - name: sync-query
+            command: "--read-rps 1000 --write-rps 100"
 
     concurrency:
-      group: slo-${{ github.ref }}-${{ matrix.workload }}
+      group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
       cancel-in-progress: true
 
     steps:
       - name: Install dependencies
         run: |
+          set -euxo pipefail
           YQ_VERSION=v4.48.2
           BUILDX_VERSION=0.30.1
           COMPOSE_VERSION=2.40.3
 
-          sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \
-            sudo chmod +x /usr/local/bin/yq
+          sudo curl -fLo /usr/local/bin/yq \
+            "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64"
+          sudo chmod +x /usr/local/bin/yq
 
-          echo "Updating Docker plugins..."
           sudo mkdir -p /usr/local/lib/docker/cli-plugins
 
-          echo "Installing Docker Buildx ${BUILDX_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \
             "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx
 
-          echo "Installing Docker Compose ${COMPOSE_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \
             "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
 
-          echo "Installed versions:"
           yq --version
           docker --version
           docker buildx version
           docker compose version
 
-      - name: Checkout current version
+      - name: Checkout current SDK version
         uses: actions/checkout@v5
         with:
-          path: current
+          path: sdk-current
           fetch-depth: 0
 
       - name: Determine baseline commit
         id: baseline
-        shell: bash
+        working-directory: sdk-current
         run: |
-          cd current
-          if [[ -n "${{ inputs.baseline_ref }}" ]]; then
-            BASELINE="${{ inputs.baseline_ref }}"
-          else
-            BASELINE=$(git merge-base HEAD origin/main)
-          fi
-          echo "sha=$BASELINE" >> $GITHUB_OUTPUT
+          set -euo pipefail
+          BASELINE=$(git merge-base HEAD origin/main)
+          echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT"
 
-          # Try to determine a human-readable ref name for baseline
-          # Check if baseline is on main
-          if git merge-base --is-ancestor $BASELINE origin/main && \
-             [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then
-             BASELINE_REF="main"
+          if git merge-base --is-ancestor "${BASELINE}" origin/main && \
+             [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then
+            BASELINE_REF="main"
           else
-            # Try to find a branch containing this commit
-            BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "")
-            if [ -n "$BRANCH" ]; then
+            BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "")
+            if [ -n "${BRANCH}" ]; then
               BASELINE_REF="${BRANCH}@${BASELINE:0:7}"
             else
               BASELINE_REF="${BASELINE:0:7}"
             fi
           fi
-          echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT
+          echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT"
 
-      - name: Checkout baseline version
+      - name: Checkout baseline SDK version
         uses: actions/checkout@v5
         with:
           ref: ${{ steps.baseline.outputs.sha }}
-          path: baseline
+          path: sdk-baseline
           fetch-depth: 1
 
-      - name: Show Docker versions
-        run: |
-          docker --version
-          docker compose version
-
       - name: Build workload images (current + baseline)
         run: |
+          set -euxo pipefail
           docker build \
-            -f "$GITHUB_WORKSPACE/current/tests/slo/Dockerfile" \
+            -f "$GITHUB_WORKSPACE/sdk-current/tests/slo/Dockerfile" \
             -t "ydb-app-current" \
-            "$GITHUB_WORKSPACE/current"
+            "$GITHUB_WORKSPACE/sdk-current"
 
           docker build \
-            -f "$GITHUB_WORKSPACE/baseline/tests/slo/Dockerfile" \
+            -f "$GITHUB_WORKSPACE/sdk-baseline/tests/slo/Dockerfile" \
             -t "ydb-app-baseline" \
-            "$GITHUB_WORKSPACE/baseline"
+            "$GITHUB_WORKSPACE/sdk-baseline"
 
-      - name: Initialize YDB SLO
-        id: ydb_slo
-        uses: ydb-platform/ydb-slo-action/init@13c687b7d4b2879da79dd12932dee0ed2b65dd1c
+      - name: Run SLO Tests
+        uses: ydb-platform/ydb-slo-action/init@v2
+        timeout-minutes: 30
         with:
-          github_issue: ${{ github.event.pull_request.number || inputs.github_issue }}
+          github_issue: ${{ github.event.pull_request.number }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          workload_name: ydb-python-${{ matrix.workload }}
+          workload_name: ${{ matrix.sdk.name }}
+          workload_duration: "600"
           workload_current_ref: ${{ github.head_ref || github.ref_name }}
+          workload_current_image: ydb-app-current
+          workload_current_command: ${{ matrix.sdk.command }}
           workload_baseline_ref: ${{ steps.baseline.outputs.ref }}
-
-      - name: Prepare SLO Database
-        run: |
-          docker run --rm \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            -e "WORKLOAD=${{ matrix.workload }}" \
-            -e "REF=${{ github.head_ref || github.ref_name }}" \
-            ydb-app-current \
-            ${{ matrix.prefix }}-create grpc://ydb:2136 /Root/testdb
-
-      - name: Run SLO Tests (current + baseline in parallel)
-        timeout-minutes: 15
-        env:
-          WORKLOAD: ${{ matrix.workload }}
-          DURATION: ${{ inputs.slo_workload_duration_seconds || 600 }}
-          READ_RPS: ${{ inputs.slo_workload_read_max_rps || 1000 }}
-          WRITE_RPS: ${{ inputs.slo_workload_write_max_rps || 100 }}
-          CURRENT_REF: ${{ github.head_ref || github.ref_name }}
-          BASELINE_REF: ${{ steps.baseline.outputs.ref }}
-        run: |
-          ARGS="${{ matrix.prefix }}-run grpc://ydb:2136 /Root/testdb \
-            --otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics \
-            --report-period 250 \
-            --time ${DURATION} \
-            --read-rps ${READ_RPS} \
-            --write-rps ${WRITE_RPS} \
-            --read-timeout 1000 \
-            --write-timeout 1000"
-
-          echo "Starting current workload (ref=${CURRENT_REF}, workload=${WORKLOAD})..."
-          docker run -d \
-            --name ydb-app-current \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            -e "REF=${CURRENT_REF}" \
-            -e "WORKLOAD=${WORKLOAD}" \
-            ydb-app-current \
-            $ARGS
-
-          echo "Starting baseline workload (ref=${BASELINE_REF}, workload=${WORKLOAD})..."
-          docker run -d \
-            --name ydb-app-baseline \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            -e "REF=${BASELINE_REF}" \
-            -e "WORKLOAD=${WORKLOAD}" \
-            ydb-app-baseline \
-            $ARGS
-
-          echo ""
-          echo "==================== INITIAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== INITIAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          echo "Waiting for workloads to complete (${DURATION}s)..."
-          sleep ${DURATION}
-
-          echo "Stopping containers after ${DURATION}s..."
-          docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Force kill if still running
-          docker kill ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Check exit codes
-          CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
-          BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
-
-          echo "Current exit code: ${CURRENT_EXIT}"
-          echo "Baseline exit code: ${BASELINE_EXIT}"
-
-          echo ""
-          echo "==================== FINAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== FINAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          if [[ "${CURRENT_EXIT}" != "0" || "${BASELINE_EXIT}" != "0" ]]; then
-            echo "One or both workloads failed."
-            exit 0
-          fi
-
-          echo "SUCCESS: Workloads completed successfully"
-
-      - if: always()
-        name: Store logs
-        run: |
-          docker logs ydb-app-current > current.log 2>&1 || echo "No current container" > current.log
-          docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container" > baseline.log
-
-      - if: always()
-        name: Upload logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: ydb-python-${{ matrix.workload }}-logs
-          path: |
-            ./current.log
-            ./baseline.log
-          retention-days: 1
+          workload_baseline_image: ydb-app-baseline
+          workload_baseline_command: ${{ matrix.sdk.command }}
diff --git a/tests/slo/Dockerfile b/tests/slo/Dockerfile
index 4f18a2b3f..b45d26111 100644
--- a/tests/slo/Dockerfile
+++ b/tests/slo/Dockerfile
@@ -1,38 +1,45 @@
 # syntax=docker/dockerfile:1
 
-# This image packages the Python SLO workload runner.
-# It expects to be run with arguments like:
-#   docker run --rm <image> table-run <endpoint> <db> --otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics ...
+# This image packages the Python SLO workload runner for ydb-slo-action v2.
+#
+# The action launches the same image for `current` and `baseline` workloads
+# in parallel and injects these env vars (see tests/slo/docker-entrypoint.sh):
+#
+#   WORKLOAD_NAME             sync-table | sync-query | topic
+#   WORKLOAD_REF              "current" / "main" — used as the `ref` metric label
+#   WORKLOAD_DURATION         seconds
+#   YDB_ENDPOINT              grpc://ydb:2136
+#   YDB_DATABASE              /Root/testdb
+#   OTEL_EXPORTER_OTLP_ENDPOINT / _METRICS_ENDPOINT
+#
+# `workload_current_command` from the action is appended to the `*-run` call
+# as tuning flags (--read-rps, --write-rps, ...).
 #
 # Notes:
 # - OpenTelemetry 1.39.x requires Python >= 3.9.
-# - The entrypoint is `python ./tests/slo/src`, i.e. it runs the `__main__.py`
-#   from that directory (same as `python tests/slo/src ...` in CI).
+# - gcc/libc6-dev are needed to build hdrhistogram from sdist.
 
-FROM python:3.11-slim AS build
+FROM python:3.11-slim
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1
 
-WORKDIR /src
-COPY . /src
-
-# Install runtime deps into an isolated venv so we can copy it into the final stage.
-RUN python -m venv /opt/venv \
-    && /opt/venv/bin/python -m pip install --no-cache-dir --upgrade pip \
-    && /opt/venv/bin/pip install --no-cache-dir . \
-    && /opt/venv/bin/pip install --no-cache-dir -r tests/slo/requirements.txt
-
+RUN apt-get update && apt-get install -y --no-install-recommends gcc libc6-dev && rm -rf /var/lib/apt/lists/*
 
-FROM python:3.11-slim
+WORKDIR /src
 
-ENV PYTHONDONTWRITEBYTECODE=1 \
-    PYTHONUNBUFFERED=1 \
-    PATH="/opt/venv/bin:${PATH}"
+# 1. YDB SDK
+COPY setup.py pyproject.toml README.md requirements.txt ./
+COPY ydb/ ydb/
+RUN pip install --no-cache-dir .
 
-WORKDIR /app
+# 2. SLO deps
+COPY tests/slo/requirements.txt tests/slo/requirements.txt
+RUN pip install --no-cache-dir -r tests/slo/requirements.txt
 
-COPY --from=build /opt/venv /opt/venv
-COPY --from=build /src/tests/slo/src /app/tests/slo/src
+# 3. Workload source + entrypoint
+COPY tests/slo/src /src/tests/slo/src
+COPY tests/slo/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
 
-ENTRYPOINT ["python", "./tests/slo/src"]
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
diff --git a/tests/slo/docker-entrypoint.sh b/tests/slo/docker-entrypoint.sh
new file mode 100755
index 000000000..9b84b45dc
--- /dev/null
+++ b/tests/slo/docker-entrypoint.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+# Workload entrypoint used by ydb-slo-action v2.
+#
+# The action launches the same image for both `current` and `baseline` workloads
+# in parallel; both must be able to schema-prepare and then run.
+#
+# Inputs come from the env vars injected by the action:
+#   WORKLOAD_NAME       sync-table | sync-query | topic
+#   WORKLOAD_DURATION   run duration in seconds
+#   YDB_ENDPOINT        grpc://ydb:2136
+#   YDB_DATABASE        /Root/testdb
+#
+# Anything passed after the script name is appended to the `*-run` command —
+# this is how tuning flags from `workload_current_command` (e.g. --read-rps)
+# reach the workload.
+
+set -e
+
+case "${WORKLOAD_NAME:-sync-query}" in
+    sync-table|sync-query) PREFIX=table ;;
+    topic) PREFIX=topic ;;
+    *)
+        echo "Unknown WORKLOAD_NAME: ${WORKLOAD_NAME}" >&2
+        exit 1
+        ;;
+esac
+
+ENDPOINT="${YDB_ENDPOINT:-grpc://localhost:2136}"
+DATABASE="${YDB_DATABASE:-/local}"
+DURATION="${WORKLOAD_DURATION:-600}"
+
+# Schema prep is idempotent at the SDK level for topics; for tables, a parallel
+# baseline container may race and fail with "already exists" — tolerate it.
+python ./tests/slo/src "${PREFIX}-create" "$ENDPOINT" "$DATABASE" \
+    || echo "WARN: ${PREFIX}-create exited non-zero (treated as already-prepared)" >&2
+
+exec python ./tests/slo/src \
+    "${PREFIX}-run" "$ENDPOINT" "$DATABASE" \
+    --time "$DURATION" \
+    "$@"
diff --git a/tests/slo/requirements.txt b/tests/slo/requirements.txt
index cd5cdfe1a..d9021cc5c 100644
--- a/tests/slo/requirements.txt
+++ b/tests/slo/requirements.txt
@@ -1,6 +1,6 @@
 requests==2.33.0
 aiolimiter==1.1.0
-quantile-estimator==0.1.2
+hdrhistogram==0.10.3
 
 # OpenTelemetry (OTLP/HTTP exporter)
 # NOTE: OpenTelemetry 1.39.1 requires Python >= 3.9.
diff --git a/tests/slo/src/core/metrics.py b/tests/slo/src/core/metrics.py
index bff90eda8..51d648443 100644
--- a/tests/slo/src/core/metrics.py
+++ b/tests/slo/src/core/metrics.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+import threading
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
@@ -10,36 +11,25 @@
 from typing import Any, Optional, Tuple
 
 OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
-OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"
+OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "error"
 
-REF = environ.get("REF", "main")
-WORKLOAD = environ.get("WORKLOAD", "sync-query")
+REF = environ.get("WORKLOAD_REF") or environ.get("REF") or "main"
+WORKLOAD = environ.get("WORKLOAD_NAME") or environ.get("WORKLOAD") or "sync-query"
 
 logger = logging.getLogger(__name__)
 
 
 def _normalize_labels(labels: Any) -> Tuple[Any, ...]:
-    """
-    Convert labels into a tuple of label values.
-
-    Important:
-    - `str` is an Iterable, but for our purposes it must be treated as a single label value.
-    """
     if labels is None:
         return tuple()
-
     if isinstance(labels, str):
         return (labels,)
-
     if isinstance(labels, tuple):
         return labels
-
     if isinstance(labels, list):
         return tuple(labels)
-
     if isinstance(labels, Iterable):
         return tuple(labels)
-
     return (labels,)
 
 
@@ -101,30 +91,29 @@ def push(self) -> None:
 
 class OtlpMetrics(BaseMetrics):
     """
-    Canonical OpenTelemetry metrics implementation.
+    Exports metrics via OTLP/HTTP to a Prometheus endpoint with OTLP receiver enabled.
 
-    This exports metrics via OTLP/HTTP to a Prometheus server with OTLP receiver enabled:
-      POST http(s)://<host>:<port>/api/v1/otlp/v1/metrics
-
-    Naming notes:
-    - Metric names follow OpenTelemetry conventions (dot-separated namespaces, e.g. `sdk.operations.total`).
-    - Prometheus OTLP translation typically converts dots to underscores and may add suffixes like
-      `_total` for counters and `_bucket/_sum/_count` for histograms.
+    Latency percentiles (p50/p95/p99) are computed client-side per push window via
+    HdrHistogram and emitted as gauges; counters are emitted via OTel counters.
+    Histogram is reset after each push so each sample represents the last window only.
     """
 
+    _HDR_MIN_US = 1
+    _HDR_MAX_US = 60_000_000  # 60s
+    _HDR_SIG_FIGS = 3
+    _PERCENTILES = (("p50", 50.0), ("p95", 95.0), ("p99", 99.0))
+
     def __init__(self, otlp_metrics_endpoint: str):
+        from hdrh.histogram import HdrHistogram
         from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
             OTLPMetricExporter,
         )
         from opentelemetry.sdk.metrics import MeterProvider
         from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
-        from opentelemetry.sdk.metrics.view import (
-            ExplicitBucketHistogramAggregation,
-            View,
-        )
         from opentelemetry.sdk.resources import Resource
 
-        # Resource attributes: Prometheus maps service.name -> job, service.instance.id -> instance.
+        self._HdrHistogram = HdrHistogram
+
         resource = Resource.create(
             {
                 "service.name": f"workload-{WORKLOAD}",
@@ -138,79 +127,59 @@ def __init__(self, otlp_metrics_endpoint: str):
         )
 
         exporter = OTLPMetricExporter(endpoint=otlp_metrics_endpoint)
-        reader = PeriodicExportingMetricReader(exporter)  # we force_flush() explicitly in push()
-
-        latency_view = View(
-            instrument_name="sdk.operation.latency",
-            aggregation=ExplicitBucketHistogramAggregation(
-                boundaries=(
-                    0.001,
-                    0.002,
-                    0.003,
-                    0.004,
-                    0.005,
-                    0.0075,
-                    0.010,
-                    0.020,
-                    0.050,
-                    0.100,
-                    0.200,
-                    0.500,
-                    1.000,
-                )
-            ),
-        )
-
-        self._provider = MeterProvider(
-            resource=resource,
-            metric_readers=[reader],
-            views=[latency_view],
-        )
+        reader = PeriodicExportingMetricReader(exporter)
+        self._provider = MeterProvider(resource=resource, metric_readers=[reader])
         self._meter = self._provider.get_meter("ydb-slo")
 
-        # Instruments (sync)
         self._errors = self._meter.create_counter(
             name="sdk.errors.total",
             description="Total number of errors encountered, categorized by error type.",
         )
         self._operations_total = self._meter.create_counter(
             name="sdk.operations.total",
-            description="Total number of operations, categorized by type attempted by the SDK.",
+            description="Total number of operations attempted by the SDK.",
         )
         self._operations_success_total = self._meter.create_counter(
             name="sdk.operations.success.total",
-            description="Total number of successful operations, categorized by type.",
+            description="Total number of successful operations.",
         )
         self._operations_failure_total = self._meter.create_counter(
             name="sdk.operations.failure.total",
-            description="Total number of failed operations, categorized by type.",
+            description="Total number of failed operations.",
         )
-        self._latency = self._meter.create_histogram(
-            name="sdk.operation.latency",
-            unit="s",
-            description="Latency of operations performed by the SDK in seconds, categorized by type and status.",
+        self._retry_attempts_total = self._meter.create_counter(
+            name="sdk.retry.attempts.total",
+            description="Total number of retry attempts.",
         )
-
         self._pending = self._meter.create_up_down_counter(
             name="sdk.pending.operations",
-            description="Current number of pending operations, categorized by type.",
+            description="Current number of pending operations.",
         )
+        self._latency_gauges = {
+            name: self._meter.create_gauge(
+                name=f"sdk.operation.latency.{name}.seconds",
+                unit="s",
+                description=f"Operation latency {name} computed over the last push window.",
+            )
+            for name, _ in self._PERCENTILES
+        }
 
-        self._retry_attempts_total = self._meter.create_counter(
-            name="sdk.retry.attempts.total",
-            description="Total number of retry attempts, categorized by ref and operation type.",
-        )
+        self._lock = threading.Lock()
+        self._hdr: dict = {}
 
-        self.reset()
+    def _get_hdr(self, op_type: str, op_status: str):
+        key = (op_type, op_status)
+        hist = self._hdr.get(key)
+        if hist is None:
+            hist = self._HdrHistogram(self._HDR_MIN_US, self._HDR_MAX_US, self._HDR_SIG_FIGS)
+            self._hdr[key] = hist
+        return hist
 
     def start(self, labels) -> float:
         labels_t = _normalize_labels(labels)
         self._pending.add(
             1,
-            attributes={
-                "ref": REF,
-                "operation_type": labels_t[0],
-            },
+            attributes={"ref": REF, "operation_type": labels_t[0]},
         )
         return time.time()
 
@@ -223,74 +192,81 @@ def stop(
     ) -> None:
         labels_t = _normalize_labels(labels)
         duration = time.time() - start_time
+        duration_us = min(max(int(duration * 1_000_000), self._HDR_MIN_US), self._HDR_MAX_US)
 
         op_type = labels_t[0]
-        base_attrs = {
-            "ref": REF,
-            "operation_type": op_type,
-        }
+        op_status = OP_STATUS_SUCCESS if error is None else OP_STATUS_FAILURE
+        base_attrs = {"ref": REF, "operation_type": op_type}
+        op_attrs = {**base_attrs, "operation_status": op_status}
 
-        # Update instruments
         self._retry_attempts_total.add(int(attempts), attributes=base_attrs)
         self._pending.add(-1, attributes=base_attrs)
-
-        # Counters + latency
-        self._operations_total.add(1, attributes=base_attrs)
+        self._operations_total.add(1, attributes=op_attrs)
 
         if error is not None:
-            self._errors.add(
-                1,
-                attributes={
-                    **base_attrs,
-                    "error_type": type(error).__name__,
-                },
-            )
+            self._errors.add(1, attributes={**base_attrs, "error_type": type(error).__name__})
             self._operations_failure_total.add(1, attributes=base_attrs)
-            self._latency.record(
-                duration,
-                attributes={
-                    **base_attrs,
-                    "operation_status": OP_STATUS_FAILURE,
-                },
-            )
-            return
-
-        self._operations_success_total.add(1, attributes=base_attrs)
-        self._latency.record(
-            duration,
-            attributes={
-                **base_attrs,
-                "operation_status": OP_STATUS_SUCCESS,
-            },
-        )
+        else:
+            self._operations_success_total.add(1, attributes=base_attrs)
+
+        with self._lock:
+            self._get_hdr(op_type, op_status).record_value(duration_us)
 
     def push(self) -> None:
-        # Metrics job calls push() with the cadence of --report-period.
-        # force_flush() makes the exporter send immediately.
+        with self._lock:
+            for (op_type, op_status), hist in self._hdr.items():
+                if hist.get_total_count() == 0:
+                    continue
+                attrs = {"ref": REF, "operation_type": op_type, "operation_status": op_status}
+                for name, percentile in self._PERCENTILES:
+                    value_s = hist.get_value_at_percentile(percentile) / 1_000_000
+                    self._latency_gauges[name].set(value_s, attributes=attrs)
+            for hist in self._hdr.values():
+                hist.reset()
         self._provider.force_flush()
 
     def reset(self) -> None:
-        # OpenTelemetry counters/histograms are cumulative and cannot be reset.
-        # Reset is implemented as an immediate push/flush.
-        self.push()
+        with self._lock:
+            for hist in self._hdr.values():
+                hist.reset()
+        self._provider.force_flush()
 
 
-def create_metrics(otlp_endpoint: Optional[str]) -> BaseMetrics:
+def _resolve_metrics_endpoint(cli_endpoint: Optional[str]) -> str:
     """
-    Factory used by SLO runners.
+    Resolution order:
+      1. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT (used as-is)
+      2. OTEL_EXPORTER_OTLP_ENDPOINT + /v1/metrics suffix
+      3. CLI --otlp-endpoint
+    """
+    metrics_env = environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "").strip()
+    if metrics_env:
+        return metrics_env
+
+    base_env = environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "").strip()
+    if base_env:
+        base = base_env.rstrip("/")
+        if base.endswith("/v1/metrics"):
+            return base
+        return f"{base}/v1/metrics"
 
-    Metrics are enabled if either:
-    - OTLP_ENDPOINT env var is set, or
-    - `--otlp-endpoint` is provided (and non-empty)
+    return (cli_endpoint or "").strip()
+
+
+def create_metrics(otlp_endpoint: Optional[str]) -> BaseMetrics:
+    """
+    Build a metrics exporter.
 
-    If endpoint is empty, metrics are disabled (DummyMetrics).
+    Metrics are enabled if a non-empty endpoint can be derived from either the
+    OTEL_EXPORTER_OTLP_METRICS_ENDPOINT / OTEL_EXPORTER_OTLP_ENDPOINT env vars
+    or the explicit `--otlp-endpoint` CLI flag. Otherwise DummyMetrics is used.
     """
-    endpoint = (environ.get("OTLP_ENDPOINT") or (otlp_endpoint or "")).strip()
+    endpoint = _resolve_metrics_endpoint(otlp_endpoint)
     if not endpoint:
         logger.info("Creating dummy metrics (metrics disabled)")
         return DummyMetrics()
 
-    logger.info("Creating OTLP metrics exporter to Prometheus: %s", endpoint)
+    logger.info("Creating OTLP metrics exporter to: %s", endpoint)
     try:
         return OtlpMetrics(endpoint)
     except Exception:

From 8d60d4dd80cac46aed99dd28aac663b53a529dfa Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Thu, 28 May 2026 12:36:47 +0300
Subject: [PATCH 2/3] Inline SLO post-actions into slo.yml

---
 .github/workflows/slo-report.yml | 40 --------------------------------
 .github/workflows/slo.yml        | 33 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 40 deletions(-)
 delete mode 100644 .github/workflows/slo-report.yml

diff --git a/.github/workflows/slo-report.yml b/.github/workflows/slo-report.yml
deleted file mode 100644
index cd02c703f..000000000
--- a/.github/workflows/slo-report.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: slo-report
-
-on:
-  workflow_run:
-    workflows: ["SLO"]
-    types:
-      - completed
-
-jobs:
-  publish-slo-report:
-    if: github.event.workflow_run.conclusion == 'success'
-    runs-on: ubuntu-latest
-    name: Publish YDB SLO Report
-    permissions:
-      checks: write
-      contents: read
-      pull-requests: write
-    steps:
-      - name: Publish YDB SLO Report
-        uses: ydb-platform/ydb-slo-action/report@v2
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_run_id: ${{ github.event.workflow_run.id }}
-
-  remove-slo-label:
-    if: github.event.workflow_run.event == 'pull_request'
-    runs-on: ubuntu-latest
-    name: Remove SLO Label
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Remove SLO label from PR
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
-          REPO: ${{ github.event.workflow_run.repository.full_name }}
-        run: |
-          set -euo pipefail
-          PR=$(jq -r '.[0].number' <<<"$PRS")
-          gh pr edit "$PR" --repo "$REPO" --remove-label SLO
diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 1669768b3..f4bfd29d7 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -117,3 +117,36 @@ jobs:
           workload_baseline_ref: ${{ steps.baseline.outputs.ref }}
           workload_baseline_image: ydb-app-baseline
           workload_baseline_command: ${{ matrix.sdk.command }}
+
+  publish-slo-report:
+    if: always()
+    name: Publish YDB SLO Report
+    needs: ydb-slo-action
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      contents: read
+      pull-requests: write
+    steps:
+      - name: Publish YDB SLO Report
+        uses: ydb-platform/ydb-slo-action/report@v2
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_run_id: ${{ github.event.workflow_run.id }}
+
+  remove-slo-label:
+    name: Remove SLO Label
+    needs: ydb-slo-action
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Remove SLO label from PR
+        env:
+          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          REPO: ${{ github.event.workflow_run.repository.full_name }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          PR=$(jq -r '.[0].number' <<<"$PRS")
+          gh pr edit "$PR" --repo "$REPO" --remove-label SLO

From 3963d63ca59c95e4c27df580708ca8f7d73fcb90 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Thu, 28 May 2026 17:06:04 +0300
Subject: [PATCH 3/3] Build baseline workload from current Dockerfile +
 tests/slo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v2 SLO action runs both current and baseline images with the same
command. The baseline checkout is on main, which still has the old
`python ./tests/slo/src` entrypoint that expects a subcommand as
the first arg — so passing tuning flags directly (e.g. --read-rps 1000)
fails with 'argument subcommand: invalid choice'.

Mirror the Java SDK setup: build the baseline image with the current
PR's Dockerfile + tests/slo runner against the baseline ydb library.
That way the runner-side contract (entrypoint script, metrics format,
env vars) is identical for both images and only the SDK under test
differs.

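Also fix the inline report/label-removal jobs to use values available
to this workflow (github.run_id, github.event.pull_request.number,
github.repository) instead of workflow_run fields that are undefined
for this trigger.

Move `if: always()` from the report job to the label-removal job: the
SLO label is now removed even when the SLO run fails, while the report
is published only after ydb-slo-action succeeds.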
---
 .github/workflows/slo.yml | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index f4bfd29d7..90ac4508b 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -93,11 +93,19 @@ jobs:
       - name: Build workload images (current + baseline)
         run: |
           set -euxo pipefail
+
+          # Build current: SDK + workload runner both from this PR.
           docker build \
             -f "$GITHUB_WORKSPACE/sdk-current/tests/slo/Dockerfile" \
             -t "ydb-app-current" \
             "$GITHUB_WORKSPACE/sdk-current"
 
+          # Build baseline: baseline SDK with the current workload runner
+          # (Dockerfile + tests/slo/), so the runner-side contract changes
+          # (entrypoint, metrics format) apply uniformly to both images.
+          rm -rf "$GITHUB_WORKSPACE/sdk-baseline/tests/slo"
+          cp -r  "$GITHUB_WORKSPACE/sdk-current/tests/slo" \
+                 "$GITHUB_WORKSPACE/sdk-baseline/tests/slo"
           docker build \
             -f "$GITHUB_WORKSPACE/sdk-baseline/tests/slo/Dockerfile" \
             -t "ydb-app-baseline" \
@@ -119,7 +127,6 @@ jobs:
           workload_baseline_command: ${{ matrix.sdk.command }}
 
   publish-slo-report:
-    if: always()
     name: Publish YDB SLO Report
     needs: ydb-slo-action
     runs-on: ubuntu-latest
@@ -132,9 +139,10 @@ jobs:
         uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_run_id: ${{ github.event.workflow_run.id }}
+          github_run_id: ${{ github.run_id }}
 
   remove-slo-label:
+    if: always()
     name: Remove SLO Label
     needs: ydb-slo-action
     runs-on: ubuntu-latest
@@ -143,10 +151,9 @@ jobs:
     steps:
       - name: Remove SLO label from PR
         env:
-          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
-          REPO: ${{ github.event.workflow_run.repository.full_name }}
+          PR: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           set -euo pipefail
-          PR=$(jq -r '.[0].number' <<<"$PRS")
           gh pr edit "$PR" --repo "$REPO" --remove-label SLO