-
Notifications
You must be signed in to change notification settings - Fork 144
test: retry Dapr example port-bind flake #1030
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
70f0464
013f876
1c2b5f8
1ff3990
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,10 @@ | |
|
|
||
| REPO_ROOT = Path(__file__).resolve().parent.parent.parent | ||
| EXAMPLES_DIR = REPO_ROOT / 'examples' | ||
| DAPR_PORT_BIND_FAILURE_MARKERS = ( | ||
| 'bind: address already in use', | ||
| 'failed to start internal gRPC server: could not listen on any endpoint', | ||
| ) | ||
|
|
||
|
|
||
| def pytest_configure(config: pytest.Config) -> None: | ||
|
|
@@ -38,7 +42,18 @@ def _terminate(proc: subprocess.Popen[str]) -> None: | |
| terminate_process_group(proc, force=True) | ||
| proc.wait() | ||
|
|
||
| def run(self, args: str, *, timeout: int = 30, until: list[str] | None = None) -> str: | ||
| @staticmethod | ||
| def _is_dapr_port_bind_failure(output: str) -> bool: | ||
| return all(marker in output for marker in DAPR_PORT_BIND_FAILURE_MARKERS) | ||
|
|
||
| def run( | ||
| self, | ||
| args: str, | ||
| *, | ||
| timeout: int = 30, | ||
| until: list[str] | None = None, | ||
| port_bind_retries: int = 3, | ||
| ) -> str: | ||
| """Run a foreground command, block until it finishes, and return output. | ||
|
|
||
| Use this for short-lived processes (e.g. a publisher that exits on its | ||
|
|
@@ -49,7 +64,24 @@ def run(self, args: str, *, timeout: int = 30, until: list[str] | None = None) - | |
| timeout: Maximum seconds to wait before killing the process. | ||
| until: If provided, the process is terminated as soon as every | ||
| string in this list has appeared in the accumulated output. | ||
| port_bind_retries: Retry count for Dapr sidecar startup failures | ||
| caused by a transient random-port collision. | ||
| """ | ||
| attempts = max(1, port_bind_retries + 1) | ||
| for attempt in range(attempts): | ||
| output = self._run_once(args, timeout=timeout, until=until) | ||
| if attempt < attempts - 1 and self._is_dapr_port_bind_failure(output): | ||
| print( | ||
| 'Dapr sidecar failed to bind a random port; ' | ||
| f'retrying startup after {2**attempt}s ' | ||
| f'(attempt {attempt + 1}/{attempts})', | ||
| flush=True, | ||
| ) | ||
| time.sleep(2**attempt) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: I'd limit this sleep to something like 5 or 10 seconds. If a user sets 10 attempts, the last attempt will take a few minutes, and I don't think that's useful. I like that it has exponential backoff, but it grows too fast and the high end is just too much. What do you think? |
||
| continue | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You should print a warning message here so we know we got a sidecar retry because the port allocation failed. |
||
| return output | ||
|
|
||
| def _run_once(self, args: str, *, timeout: int, until: list[str] | None) -> str: | ||
| proc = subprocess.Popen( | ||
| args=('dapr', 'run', *shlex.split(args)), | ||
| cwd=self._cwd, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,67 @@ | ||
| import subprocess | ||
| import time | ||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
|
|
||
| from tests.examples.conftest import DaprRunner | ||
|
|
||
|
|
||
| class FakeProcess: | ||
| def __init__(self, output: str, returncode: int = 1) -> None: | ||
| self.stdout = iter(output.splitlines(keepends=True)) | ||
| self.returncode = returncode | ||
|
|
||
| def poll(self) -> int: | ||
| return self.returncode | ||
|
|
||
| def wait(self, timeout: int | None = None) -> int: | ||
| return self.returncode | ||
|
|
||
|
|
||
| def test_run_retries_transient_dapr_port_bind_failure( | ||
| monkeypatch, tmp_path: Path, capsys: pytest.CaptureFixture[str] | ||
| ) -> None: | ||
| outputs = [ | ||
| ( | ||
| 'level=error msg="Failed to listen for gRPC server on TCP address :33223 ' | ||
| 'with error: listen tcp :33223: bind: address already in use"\n' | ||
| 'level=fatal msg="Fatal error from runtime: failed to start internal gRPC ' | ||
| 'server: could not listen on any endpoint"\n' | ||
| ), | ||
| "{'secretKey': 'secretValue'}\n", | ||
| ] | ||
| popen_calls = [] | ||
|
|
||
| def fake_popen(*args, **kwargs) -> FakeProcess: | ||
| popen_calls.append((args, kwargs)) | ||
| return FakeProcess(outputs.pop(0)) | ||
|
|
||
| monkeypatch.setattr(subprocess, 'Popen', fake_popen) | ||
| sleeps: list[int] = [] | ||
| monkeypatch.setattr(time, 'sleep', sleeps.append) | ||
|
|
||
| output = DaprRunner(tmp_path).run('--app-id=secretsapp -- python3 example.py', timeout=1) | ||
|
|
||
| assert output == "{'secretKey': 'secretValue'}\n" | ||
| assert len(popen_calls) == 2 | ||
| assert sleeps == [1] | ||
| assert ( | ||
| 'Dapr sidecar failed to bind a random port; retrying startup after 1s' | ||
| in capsys.readouterr().out | ||
| ) | ||
|
|
||
|
|
||
| def test_run_does_not_retry_non_port_bind_failure(monkeypatch, tmp_path: Path) -> None: | ||
| popen_calls = [] | ||
|
|
||
| def fake_popen(*args, **kwargs) -> FakeProcess: | ||
| popen_calls.append((args, kwargs)) | ||
| return FakeProcess('application failed before printing expected output\n') | ||
|
|
||
| monkeypatch.setattr(subprocess, 'Popen', fake_popen) | ||
|
|
||
| output = DaprRunner(tmp_path).run('--app-id=secretsapp -- python3 example.py', timeout=1) | ||
|
|
||
| assert output == 'application failed before printing expected output\n' | ||
| assert len(popen_calls) == 1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd use a very basic backoff algorithm here. Something like
`time.sleep(2**attempt)` should be enough.