diff --git a/backend/app/services/openclaw/provisioning.py b/backend/app/services/openclaw/provisioning.py index d775e38d..3a1d76f2 100644 --- a/backend/app/services/openclaw/provisioning.py +++ b/backend/app/services/openclaw/provisioning.py @@ -601,7 +601,8 @@ class OpenClawGatewayControlPlane(GatewayControlPlane): if agent_just_created: await asyncio.sleep(0.75) - # Retry agents.update a few times to handle gateway hot-reload race. + # Retry agents.update only when this call just created the agent. + # If create reported "already exists", "not found" should fail fast. _update_retries = 5 _update_delay = 0.5 for _attempt in range(_update_retries): @@ -617,7 +618,12 @@ class OpenClawGatewayControlPlane(GatewayControlPlane): ) break except OpenClawGatewayError as exc: - if _is_missing_agent_error(exc) and _attempt < _update_retries - 1: + should_retry = ( + agent_just_created + and _is_missing_agent_error(exc) + and _attempt < _update_retries - 1 + ) + if should_retry: await asyncio.sleep(_update_delay) _update_delay = min(_update_delay * 2, 4.0) continue diff --git a/backend/tests/test_agent_provisioning_utils.py b/backend/tests/test_agent_provisioning_utils.py index a5fec525..871c62b8 100644 --- a/backend/tests/test_agent_provisioning_utils.py +++ b/backend/tests/test_agent_provisioning_utils.py @@ -530,6 +530,89 @@ async def test_control_plane_upsert_agent_handles_already_exists(monkeypatch): assert calls[1][0] == "agents.update" +@pytest.mark.asyncio +async def test_control_plane_upsert_agent_retries_update_after_create_race(monkeypatch): + calls: list[tuple[str, dict[str, object] | None]] = [] + sleeps: list[float] = [] + update_attempts = 0 + + async def _fake_sleep(seconds: float) -> None: + sleeps.append(seconds) + + async def _fake_openclaw_call(method, params=None, config=None): + nonlocal update_attempts + _ = config + calls.append((method, params)) + if method == "agents.create": + return {"ok": True} + if method == "agents.update": + update_attempts += 1 + if update_attempts < 3: + raise agent_provisioning.OpenClawGatewayError('agent "board-agent-a" not found') + return {"ok": True} + if method == "config.get": + return {"hash": None, "config": {"agents": {"list": []}}} + if method == "config.patch": + return {"ok": True} + raise AssertionError(f"Unexpected method: {method}") + + monkeypatch.setattr(agent_provisioning, "openclaw_call", _fake_openclaw_call) + monkeypatch.setattr(agent_provisioning.asyncio, "sleep", _fake_sleep) + cp = agent_provisioning.OpenClawGatewayControlPlane( + agent_provisioning.GatewayClientConfig(url="ws://gateway.example/ws", token=None), + ) + await cp.upsert_agent( + agent_provisioning.GatewayAgentRegistration( + agent_id="board-agent-a", + name="Board Agent A", + workspace_path="/tmp/workspace-board-agent-a", + heartbeat={"every": "10m", "target": "last", "includeReasoning": False}, + ), + ) + + update_calls = [method for method, _ in calls if method == "agents.update"] + assert len(update_calls) == 3 + assert sleeps == [0.75, 0.5, 1.0] + + +@pytest.mark.asyncio +async def test_control_plane_upsert_agent_missing_after_already_exists_fails_fast(monkeypatch): + calls: list[tuple[str, dict[str, object] | None]] = [] + sleeps: list[float] = [] + + async def _fake_sleep(seconds: float) -> None: + sleeps.append(seconds) + + async def _fake_openclaw_call(method, params=None, config=None): + _ = config + calls.append((method, params)) + if method == "agents.create": + raise agent_provisioning.OpenClawGatewayError("already exists") + if method == "agents.update": + raise agent_provisioning.OpenClawGatewayError('agent "board-agent-a" not found') + raise AssertionError(f"Unexpected method: {method}") + + monkeypatch.setattr(agent_provisioning, "openclaw_call", _fake_openclaw_call) + monkeypatch.setattr(agent_provisioning.asyncio, "sleep", _fake_sleep) + cp = agent_provisioning.OpenClawGatewayControlPlane( + agent_provisioning.GatewayClientConfig(url="ws://gateway.example/ws", token=None), + ) + + with pytest.raises(agent_provisioning.OpenClawGatewayError): + await cp.upsert_agent( + agent_provisioning.GatewayAgentRegistration( + agent_id="board-agent-a", + name="Board Agent A", + workspace_path="/tmp/workspace-board-agent-a", + heartbeat={"every": "10m", "target": "last", "includeReasoning": False}, + ), + ) + + update_calls = [method for method, _ in calls if method == "agents.update"] + assert len(update_calls) == 1 + assert sleeps == [] + + def test_is_missing_agent_error_matches_gateway_agent_not_found() -> None: assert agent_provisioning._is_missing_agent_error( agent_provisioning.OpenClawGatewayError('agent "mc-abc" not found'),