refactor: enhance transient error handling and improve session key validation in template_sync.py
This commit is contained in:
@@ -24,6 +24,29 @@ from app.schemas.gateways import GatewayTemplatesSyncError, GatewayTemplatesSync
|
|||||||
from app.services.agent_provisioning import provision_agent, provision_main_agent
|
from app.services.agent_provisioning import provision_agent, provision_main_agent
|
||||||
|
|
||||||
_TOOLS_KV_RE = re.compile(r"^(?P<key>[A-Z0-9_]+)=(?P<value>.*)$")
|
_TOOLS_KV_RE = re.compile(r"^(?P<key>[A-Z0-9_]+)=(?P<value>.*)$")
|
||||||
|
SESSION_KEY_PARTS_MIN = 2
|
||||||
|
_NON_TRANSIENT_GATEWAY_ERROR_MARKERS = ("unsupported file",)
|
||||||
|
_TRANSIENT_GATEWAY_ERROR_MARKERS = (
|
||||||
|
"connect call failed",
|
||||||
|
"connection refused",
|
||||||
|
"errno 111",
|
||||||
|
"econnrefused",
|
||||||
|
"did not receive a valid http response",
|
||||||
|
"no route to host",
|
||||||
|
"network is unreachable",
|
||||||
|
"host is down",
|
||||||
|
"name or service not known",
|
||||||
|
"received 1012",
|
||||||
|
"service restart",
|
||||||
|
"http 503",
|
||||||
|
"http 502",
|
||||||
|
"http 504",
|
||||||
|
"temporar",
|
||||||
|
"timeout",
|
||||||
|
"timed out",
|
||||||
|
"connection closed",
|
||||||
|
"connection reset",
|
||||||
|
)
|
||||||
|
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
|
|
||||||
@@ -39,31 +62,15 @@ def _is_transient_gateway_error(exc: Exception) -> bool:
|
|||||||
message = str(exc).lower()
|
message = str(exc).lower()
|
||||||
if not message:
|
if not message:
|
||||||
return False
|
return False
|
||||||
if "unsupported file" in message:
|
if any(marker in message for marker in _NON_TRANSIENT_GATEWAY_ERROR_MARKERS):
|
||||||
return False
|
return False
|
||||||
if "connect call failed" in message or "connection refused" in message:
|
return ("503" in message and "websocket" in message) or any(
|
||||||
return True
|
marker in message for marker in _TRANSIENT_GATEWAY_ERROR_MARKERS
|
||||||
if "errno 111" in message or "econnrefused" in message:
|
)
|
||||||
return True
|
|
||||||
if "did not receive a valid http response" in message:
|
|
||||||
return True
|
def _gateway_timeout_message(exc: OpenClawGatewayError) -> str:
|
||||||
if "no route to host" in message or "network is unreachable" in message:
|
return f"Gateway unreachable after 10 minutes (template sync timeout). Last error: {exc}"
|
||||||
return True
|
|
||||||
if "host is down" in message or "name or service not known" in message:
|
|
||||||
return True
|
|
||||||
if "received 1012" in message or "service restart" in message:
|
|
||||||
return True
|
|
||||||
if "http 503" in message or ("503" in message and "websocket" in message):
|
|
||||||
return True
|
|
||||||
if "http 502" in message or "http 504" in message:
|
|
||||||
return True
|
|
||||||
if "temporar" in message:
|
|
||||||
return True
|
|
||||||
if "timeout" in message or "timed out" in message:
|
|
||||||
return True
|
|
||||||
if "connection closed" in message or "connection reset" in message:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class _GatewayBackoff:
|
class _GatewayBackoff:
|
||||||
@@ -91,18 +98,13 @@ class _GatewayBackoff:
|
|||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
value = await fn()
|
value = await fn()
|
||||||
self.reset()
|
|
||||||
return value
|
|
||||||
except OpenClawGatewayError as exc:
|
except OpenClawGatewayError as exc:
|
||||||
if not _is_transient_gateway_error(exc):
|
if not _is_transient_gateway_error(exc):
|
||||||
raise
|
raise
|
||||||
now = asyncio.get_running_loop().time()
|
now = asyncio.get_running_loop().time()
|
||||||
remaining = deadline_s - now
|
remaining = deadline_s - now
|
||||||
if remaining <= 0:
|
if remaining <= 0:
|
||||||
raise TimeoutError(
|
raise TimeoutError(_gateway_timeout_message(exc)) from exc
|
||||||
"Gateway unreachable after 10 minutes (template sync timeout). "
|
|
||||||
f"Last error: {exc}"
|
|
||||||
) from exc
|
|
||||||
|
|
||||||
sleep_s = min(self._delay_s, remaining)
|
sleep_s = min(self._delay_s, remaining)
|
||||||
if self._jitter:
|
if self._jitter:
|
||||||
@@ -110,6 +112,9 @@ class _GatewayBackoff:
|
|||||||
sleep_s = max(0.0, min(sleep_s, remaining))
|
sleep_s = max(0.0, min(sleep_s, remaining))
|
||||||
await asyncio.sleep(sleep_s)
|
await asyncio.sleep(sleep_s)
|
||||||
self._delay_s = min(self._delay_s * 2.0, self._max_delay_s)
|
self._delay_s = min(self._delay_s * 2.0, self._max_delay_s)
|
||||||
|
else:
|
||||||
|
self.reset()
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
async def _with_gateway_retry(
|
async def _with_gateway_retry(
|
||||||
@@ -127,7 +132,7 @@ def _agent_id_from_session_key(session_key: str | None) -> str | None:
|
|||||||
if not value.startswith("agent:"):
|
if not value.startswith("agent:"):
|
||||||
return None
|
return None
|
||||||
parts = value.split(":")
|
parts = value.split(":")
|
||||||
if len(parts) < 2:
|
if len(parts) < SESSION_KEY_PARTS_MIN:
|
||||||
return None
|
return None
|
||||||
agent_id = parts[1].strip()
|
agent_id = parts[1].strip()
|
||||||
return agent_id or None
|
return agent_id or None
|
||||||
@@ -167,7 +172,7 @@ def _gateway_agent_id(agent: Agent) -> str:
|
|||||||
session_key = agent.openclaw_session_id or ""
|
session_key = agent.openclaw_session_id or ""
|
||||||
if session_key.startswith("agent:"):
|
if session_key.startswith("agent:"):
|
||||||
parts = session_key.split(":")
|
parts = session_key.split(":")
|
||||||
if len(parts) >= 2 and parts[1]:
|
if len(parts) >= SESSION_KEY_PARTS_MIN and parts[1]:
|
||||||
return parts[1]
|
return parts[1]
|
||||||
return _slugify(agent.name)
|
return _slugify(agent.name)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user