refactor: enhance transient error handling and improve session key validation in template_sync.py

This commit is contained in:
Abhimanyu Saharan
2026-02-09 02:56:43 +05:30
parent 3660aa44d6
commit e2af067c85

View File

@@ -24,6 +24,29 @@ from app.schemas.gateways import GatewayTemplatesSyncError, GatewayTemplatesSync
from app.services.agent_provisioning import provision_agent, provision_main_agent from app.services.agent_provisioning import provision_agent, provision_main_agent
_TOOLS_KV_RE = re.compile(r"^(?P<key>[A-Z0-9_]+)=(?P<value>.*)$") _TOOLS_KV_RE = re.compile(r"^(?P<key>[A-Z0-9_]+)=(?P<value>.*)$")
SESSION_KEY_PARTS_MIN = 2
_NON_TRANSIENT_GATEWAY_ERROR_MARKERS = ("unsupported file",)
_TRANSIENT_GATEWAY_ERROR_MARKERS = (
"connect call failed",
"connection refused",
"errno 111",
"econnrefused",
"did not receive a valid http response",
"no route to host",
"network is unreachable",
"host is down",
"name or service not known",
"received 1012",
"service restart",
"http 503",
"http 502",
"http 504",
"temporar",
"timeout",
"timed out",
"connection closed",
"connection reset",
)
T = TypeVar("T") T = TypeVar("T")
@@ -39,31 +62,15 @@ def _is_transient_gateway_error(exc: Exception) -> bool:
message = str(exc).lower() message = str(exc).lower()
if not message: if not message:
return False return False
if "unsupported file" in message: if any(marker in message for marker in _NON_TRANSIENT_GATEWAY_ERROR_MARKERS):
return False return False
if "connect call failed" in message or "connection refused" in message: return ("503" in message and "websocket" in message) or any(
return True marker in message for marker in _TRANSIENT_GATEWAY_ERROR_MARKERS
if "errno 111" in message or "econnrefused" in message: )
return True
if "did not receive a valid http response" in message:
return True def _gateway_timeout_message(exc: OpenClawGatewayError) -> str:
if "no route to host" in message or "network is unreachable" in message: return f"Gateway unreachable after 10 minutes (template sync timeout). Last error: {exc}"
return True
if "host is down" in message or "name or service not known" in message:
return True
if "received 1012" in message or "service restart" in message:
return True
if "http 503" in message or ("503" in message and "websocket" in message):
return True
if "http 502" in message or "http 504" in message:
return True
if "temporar" in message:
return True
if "timeout" in message or "timed out" in message:
return True
if "connection closed" in message or "connection reset" in message:
return True
return False
class _GatewayBackoff: class _GatewayBackoff:
@@ -91,18 +98,13 @@ class _GatewayBackoff:
while True: while True:
try: try:
value = await fn() value = await fn()
self.reset()
return value
except OpenClawGatewayError as exc: except OpenClawGatewayError as exc:
if not _is_transient_gateway_error(exc): if not _is_transient_gateway_error(exc):
raise raise
now = asyncio.get_running_loop().time() now = asyncio.get_running_loop().time()
remaining = deadline_s - now remaining = deadline_s - now
if remaining <= 0: if remaining <= 0:
raise TimeoutError( raise TimeoutError(_gateway_timeout_message(exc)) from exc
"Gateway unreachable after 10 minutes (template sync timeout). "
f"Last error: {exc}"
) from exc
sleep_s = min(self._delay_s, remaining) sleep_s = min(self._delay_s, remaining)
if self._jitter: if self._jitter:
@@ -110,6 +112,9 @@ class _GatewayBackoff:
sleep_s = max(0.0, min(sleep_s, remaining)) sleep_s = max(0.0, min(sleep_s, remaining))
await asyncio.sleep(sleep_s) await asyncio.sleep(sleep_s)
self._delay_s = min(self._delay_s * 2.0, self._max_delay_s) self._delay_s = min(self._delay_s * 2.0, self._max_delay_s)
else:
self.reset()
return value
async def _with_gateway_retry( async def _with_gateway_retry(
@@ -127,7 +132,7 @@ def _agent_id_from_session_key(session_key: str | None) -> str | None:
if not value.startswith("agent:"): if not value.startswith("agent:"):
return None return None
parts = value.split(":") parts = value.split(":")
if len(parts) < 2: if len(parts) < SESSION_KEY_PARTS_MIN:
return None return None
agent_id = parts[1].strip() agent_id = parts[1].strip()
return agent_id or None return agent_id or None
@@ -167,7 +172,7 @@ def _gateway_agent_id(agent: Agent) -> str:
session_key = agent.openclaw_session_id or "" session_key = agent.openclaw_session_id or ""
if session_key.startswith("agent:"): if session_key.startswith("agent:"):
parts = session_key.split(":") parts = session_key.split(":")
if len(parts) >= 2 and parts[1]: if len(parts) >= SESSION_KEY_PARTS_MIN and parts[1]:
return parts[1] return parts[1]
return _slugify(agent.name) return _slugify(agent.name)