feat: implement unified agent lifecycle orchestration and metadata tracking
This commit is contained in:
@@ -21,24 +21,18 @@ from app.models.gateways import Gateway
|
||||
from app.models.tasks import Task
|
||||
from app.schemas.gateways import GatewayTemplatesSyncResult
|
||||
from app.services.openclaw.constants import DEFAULT_HEARTBEAT_CONFIG
|
||||
from app.services.openclaw.db_agent_state import (
|
||||
mark_provision_complete,
|
||||
mark_provision_requested,
|
||||
mint_agent_token,
|
||||
)
|
||||
from app.services.openclaw.db_service import OpenClawDBService
|
||||
from app.services.openclaw.error_messages import normalize_gateway_error_message
|
||||
from app.services.openclaw.gateway_compat import check_gateway_version_compatibility
|
||||
from app.services.openclaw.gateway_rpc import GatewayConfig as GatewayClientConfig
|
||||
from app.services.openclaw.gateway_rpc import OpenClawGatewayError, openclaw_call
|
||||
from app.services.openclaw.provisioning import OpenClawGatewayProvisioner
|
||||
from app.services.openclaw.lifecycle_orchestrator import AgentLifecycleOrchestrator
|
||||
from app.services.openclaw.provisioning_db import (
|
||||
GatewayTemplateSyncOptions,
|
||||
OpenClawProvisioningService,
|
||||
)
|
||||
from app.services.openclaw.session_service import GatewayTemplateSyncQuery
|
||||
from app.services.openclaw.shared import GatewayAgentIdentity
|
||||
from app.services.organizations import get_org_owner_user
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sqlmodel.ext.asyncio.session import AsyncSession
|
||||
@@ -222,69 +216,38 @@ class GatewayAdminLifecycleService(OpenClawDBService):
|
||||
action: str,
|
||||
notify: bool,
|
||||
) -> Agent:
|
||||
template_user = user or await get_org_owner_user(
|
||||
self.session,
|
||||
organization_id=gateway.organization_id,
|
||||
)
|
||||
if template_user is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
|
||||
detail="Organization owner not found (required for gateway agent USER.md rendering).",
|
||||
)
|
||||
raw_token = mint_agent_token(agent)
|
||||
mark_provision_requested(
|
||||
agent,
|
||||
action=action,
|
||||
status="updating" if action == "update" else "provisioning",
|
||||
)
|
||||
await self.add_commit_refresh(agent)
|
||||
if not gateway.url:
|
||||
return agent
|
||||
|
||||
orchestrator = AgentLifecycleOrchestrator(self.session)
|
||||
try:
|
||||
await OpenClawGatewayProvisioner().apply_agent_lifecycle(
|
||||
agent=agent,
|
||||
provisioned = await orchestrator.run_lifecycle(
|
||||
gateway=gateway,
|
||||
agent_id=agent.id,
|
||||
board=None,
|
||||
auth_token=raw_token,
|
||||
user=template_user,
|
||||
user=user,
|
||||
action=action,
|
||||
auth_token=None,
|
||||
force_bootstrap=False,
|
||||
reset_session=False,
|
||||
wake=notify,
|
||||
deliver_wakeup=True,
|
||||
wakeup_verb=None,
|
||||
clear_confirm_token=False,
|
||||
raise_gateway_errors=True,
|
||||
)
|
||||
except OpenClawGatewayError as exc:
|
||||
except HTTPException:
|
||||
self.logger.error(
|
||||
"gateway.main_agent.provision_failed_gateway gateway_id=%s agent_id=%s error=%s",
|
||||
"gateway.main_agent.provision_failed gateway_id=%s agent_id=%s action=%s",
|
||||
gateway.id,
|
||||
agent.id,
|
||||
str(exc),
|
||||
action,
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Gateway {action} failed: {exc}",
|
||||
) from exc
|
||||
except (OSError, RuntimeError, ValueError) as exc:
|
||||
self.logger.error(
|
||||
"gateway.main_agent.provision_failed gateway_id=%s agent_id=%s error=%s",
|
||||
gateway.id,
|
||||
agent.id,
|
||||
str(exc),
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Unexpected error {action}ing gateway provisioning.",
|
||||
) from exc
|
||||
|
||||
mark_provision_complete(agent, status="online")
|
||||
await self.add_commit_refresh(agent)
|
||||
|
||||
raise
|
||||
self.logger.info(
|
||||
"gateway.main_agent.provision_success gateway_id=%s agent_id=%s action=%s",
|
||||
gateway.id,
|
||||
agent.id,
|
||||
provisioned.id,
|
||||
action,
|
||||
)
|
||||
return agent
|
||||
return provisioned
|
||||
|
||||
async def ensure_main_agent(
|
||||
self,
|
||||
|
||||
@@ -18,6 +18,11 @@ DEFAULT_HEARTBEAT_CONFIG: dict[str, Any] = {
|
||||
}
|
||||
|
||||
OFFLINE_AFTER = timedelta(minutes=10)
|
||||
# Provisioning convergence policy:
|
||||
# - require first heartbeat/check-in within 30s of wake
|
||||
# - allow up to 3 wake attempts before giving up
|
||||
CHECKIN_DEADLINE_AFTER_WAKE = timedelta(seconds=30)
|
||||
MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN = 3
|
||||
AGENT_SESSION_PREFIX = "agent"
|
||||
|
||||
DEFAULT_CHANNEL_HEARTBEAT_VISIBILITY: dict[str, bool] = {
|
||||
|
||||
167
backend/app/services/openclaw/lifecycle_orchestrator.py
Normal file
167
backend/app/services/openclaw/lifecycle_orchestrator.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Unified agent lifecycle orchestration.
|
||||
|
||||
This module centralizes DB-backed lifecycle transitions so call sites do not
|
||||
duplicate provisioning/wake/state logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import HTTPException, status
|
||||
from sqlmodel import col, select
|
||||
|
||||
from app.core.time import utcnow
|
||||
from app.models.agents import Agent
|
||||
from app.models.boards import Board
|
||||
from app.models.gateways import Gateway
|
||||
from app.services.openclaw.constants import CHECKIN_DEADLINE_AFTER_WAKE
|
||||
from app.services.openclaw.db_agent_state import (
|
||||
mark_provision_complete,
|
||||
mark_provision_requested,
|
||||
mint_agent_token,
|
||||
)
|
||||
from app.services.openclaw.db_service import OpenClawDBService
|
||||
from app.services.openclaw.gateway_rpc import OpenClawGatewayError
|
||||
from app.services.openclaw.lifecycle_queue import (
|
||||
QueuedAgentLifecycleReconcile,
|
||||
enqueue_lifecycle_reconcile,
|
||||
)
|
||||
from app.services.openclaw.provisioning import OpenClawGatewayProvisioner
|
||||
from app.services.organizations import get_org_owner_user
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sqlmodel.ext.asyncio.session import AsyncSession
|
||||
|
||||
from app.models.users import User
|
||||
|
||||
|
||||
class AgentLifecycleOrchestrator(OpenClawDBService):
|
||||
"""Single lifecycle writer for agent provision/update transitions."""
|
||||
|
||||
def __init__(self, session: AsyncSession) -> None:
|
||||
super().__init__(session)
|
||||
|
||||
async def _lock_agent(self, *, agent_id: UUID) -> Agent:
|
||||
statement = select(Agent).where(col(Agent.id) == agent_id).with_for_update()
|
||||
agent = (await self.session.exec(statement)).first()
|
||||
if agent is None:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Agent not found")
|
||||
return agent
|
||||
|
||||
async def run_lifecycle(
|
||||
self,
|
||||
*,
|
||||
gateway: Gateway,
|
||||
agent_id: UUID,
|
||||
board: Board | None,
|
||||
user: User | None,
|
||||
action: str,
|
||||
auth_token: str | None = None,
|
||||
force_bootstrap: bool = False,
|
||||
reset_session: bool = False,
|
||||
wake: bool = True,
|
||||
deliver_wakeup: bool = True,
|
||||
wakeup_verb: str | None = None,
|
||||
clear_confirm_token: bool = False,
|
||||
raise_gateway_errors: bool = True,
|
||||
) -> Agent:
|
||||
"""Provision or update any agent under a per-agent lock."""
|
||||
|
||||
locked = await self._lock_agent(agent_id=agent_id)
|
||||
template_user = user
|
||||
if board is None and template_user is None:
|
||||
template_user = await get_org_owner_user(
|
||||
self.session,
|
||||
organization_id=gateway.organization_id,
|
||||
)
|
||||
if template_user is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
|
||||
detail=(
|
||||
"Organization owner not found "
|
||||
"(required for gateway agent USER.md rendering)."
|
||||
),
|
||||
)
|
||||
|
||||
raw_token = auth_token or mint_agent_token(locked)
|
||||
mark_provision_requested(
|
||||
locked,
|
||||
action=action,
|
||||
status="updating" if action == "update" else "provisioning",
|
||||
)
|
||||
locked.lifecycle_generation += 1
|
||||
locked.last_provision_error = None
|
||||
locked.checkin_deadline_at = utcnow() + CHECKIN_DEADLINE_AFTER_WAKE if wake else None
|
||||
if wake:
|
||||
locked.wake_attempts += 1
|
||||
locked.last_wake_sent_at = utcnow()
|
||||
self.session.add(locked)
|
||||
await self.session.flush()
|
||||
|
||||
if not gateway.url:
|
||||
await self.session.commit()
|
||||
await self.session.refresh(locked)
|
||||
return locked
|
||||
|
||||
try:
|
||||
await OpenClawGatewayProvisioner().apply_agent_lifecycle(
|
||||
agent=locked,
|
||||
gateway=gateway,
|
||||
board=board,
|
||||
auth_token=raw_token,
|
||||
user=template_user,
|
||||
action=action,
|
||||
force_bootstrap=force_bootstrap,
|
||||
reset_session=reset_session,
|
||||
wake=wake,
|
||||
deliver_wakeup=deliver_wakeup,
|
||||
wakeup_verb=wakeup_verb,
|
||||
)
|
||||
except OpenClawGatewayError as exc:
|
||||
locked.last_provision_error = str(exc)
|
||||
locked.updated_at = utcnow()
|
||||
self.session.add(locked)
|
||||
await self.session.commit()
|
||||
await self.session.refresh(locked)
|
||||
if raise_gateway_errors:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Gateway {action} failed: {exc}",
|
||||
) from exc
|
||||
return locked
|
||||
except (OSError, RuntimeError, ValueError) as exc:
|
||||
locked.last_provision_error = str(exc)
|
||||
locked.updated_at = utcnow()
|
||||
self.session.add(locked)
|
||||
await self.session.commit()
|
||||
await self.session.refresh(locked)
|
||||
if raise_gateway_errors:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Unexpected error {action}ing gateway provisioning.",
|
||||
) from exc
|
||||
return locked
|
||||
|
||||
mark_provision_complete(
|
||||
locked,
|
||||
status="online",
|
||||
clear_confirm_token=clear_confirm_token,
|
||||
)
|
||||
locked.last_provision_error = None
|
||||
locked.checkin_deadline_at = utcnow() + CHECKIN_DEADLINE_AFTER_WAKE if wake else None
|
||||
self.session.add(locked)
|
||||
await self.session.commit()
|
||||
await self.session.refresh(locked)
|
||||
if wake and locked.checkin_deadline_at is not None:
|
||||
enqueue_lifecycle_reconcile(
|
||||
QueuedAgentLifecycleReconcile(
|
||||
agent_id=locked.id,
|
||||
gateway_id=locked.gateway_id,
|
||||
board_id=locked.board_id,
|
||||
generation=locked.lifecycle_generation,
|
||||
checkin_deadline_at=locked.checkin_deadline_at,
|
||||
)
|
||||
)
|
||||
return locked
|
||||
122
backend/app/services/openclaw/lifecycle_queue.py
Normal file
122
backend/app/services/openclaw/lifecycle_queue.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Queue payload helpers for stuck-agent lifecycle reconciliation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.logging import get_logger
|
||||
from app.core.time import utcnow
|
||||
from app.services.queue import QueuedTask, enqueue_task_with_delay
|
||||
from app.services.queue import requeue_if_failed as generic_requeue_if_failed
|
||||
|
||||
logger = get_logger(__name__)
|
||||
TASK_TYPE = "agent_lifecycle_reconcile"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class QueuedAgentLifecycleReconcile:
|
||||
"""Queued payload metadata for lifecycle reconciliation checks."""
|
||||
|
||||
agent_id: UUID
|
||||
gateway_id: UUID
|
||||
board_id: UUID | None
|
||||
generation: int
|
||||
checkin_deadline_at: datetime
|
||||
attempts: int = 0
|
||||
|
||||
|
||||
def _task_from_payload(payload: QueuedAgentLifecycleReconcile) -> QueuedTask:
|
||||
return QueuedTask(
|
||||
task_type=TASK_TYPE,
|
||||
payload={
|
||||
"agent_id": str(payload.agent_id),
|
||||
"gateway_id": str(payload.gateway_id),
|
||||
"board_id": str(payload.board_id) if payload.board_id is not None else None,
|
||||
"generation": payload.generation,
|
||||
"checkin_deadline_at": payload.checkin_deadline_at.isoformat(),
|
||||
},
|
||||
created_at=utcnow(),
|
||||
attempts=payload.attempts,
|
||||
)
|
||||
|
||||
|
||||
def decode_lifecycle_task(task: QueuedTask) -> QueuedAgentLifecycleReconcile:
|
||||
if task.task_type not in {TASK_TYPE, "legacy"}:
|
||||
raise ValueError(f"Unexpected task_type={task.task_type!r}; expected {TASK_TYPE!r}")
|
||||
payload: dict[str, Any] = task.payload
|
||||
raw_board_id = payload.get("board_id")
|
||||
board_id = UUID(raw_board_id) if isinstance(raw_board_id, str) and raw_board_id else None
|
||||
raw_deadline = payload.get("checkin_deadline_at")
|
||||
if not isinstance(raw_deadline, str):
|
||||
raise ValueError("checkin_deadline_at is required")
|
||||
return QueuedAgentLifecycleReconcile(
|
||||
agent_id=UUID(str(payload["agent_id"])),
|
||||
gateway_id=UUID(str(payload["gateway_id"])),
|
||||
board_id=board_id,
|
||||
generation=int(payload["generation"]),
|
||||
checkin_deadline_at=datetime.fromisoformat(raw_deadline),
|
||||
attempts=int(payload.get("attempts", task.attempts)),
|
||||
)
|
||||
|
||||
|
||||
def enqueue_lifecycle_reconcile(payload: QueuedAgentLifecycleReconcile) -> bool:
|
||||
"""Enqueue a delayed reconcile check keyed to the expected check-in deadline."""
|
||||
now = utcnow()
|
||||
delay_seconds = max(0.0, (payload.checkin_deadline_at - now).total_seconds())
|
||||
queued = _task_from_payload(payload)
|
||||
ok = enqueue_task_with_delay(
|
||||
queued,
|
||||
settings.rq_queue_name,
|
||||
delay_seconds=delay_seconds,
|
||||
redis_url=settings.rq_redis_url,
|
||||
)
|
||||
if ok:
|
||||
logger.info(
|
||||
"lifecycle.queue.enqueued",
|
||||
extra={
|
||||
"agent_id": str(payload.agent_id),
|
||||
"generation": payload.generation,
|
||||
"delay_seconds": delay_seconds,
|
||||
"attempt": payload.attempts,
|
||||
},
|
||||
)
|
||||
return ok
|
||||
|
||||
|
||||
def defer_lifecycle_reconcile(
|
||||
task: QueuedTask,
|
||||
*,
|
||||
delay_seconds: float,
|
||||
) -> bool:
|
||||
"""Defer a reconcile task without incrementing retry attempts."""
|
||||
payload = decode_lifecycle_task(task)
|
||||
deferred = QueuedAgentLifecycleReconcile(
|
||||
agent_id=payload.agent_id,
|
||||
gateway_id=payload.gateway_id,
|
||||
board_id=payload.board_id,
|
||||
generation=payload.generation,
|
||||
checkin_deadline_at=payload.checkin_deadline_at,
|
||||
attempts=task.attempts,
|
||||
)
|
||||
queued = _task_from_payload(deferred)
|
||||
return enqueue_task_with_delay(
|
||||
queued,
|
||||
settings.rq_queue_name,
|
||||
delay_seconds=max(0.0, delay_seconds),
|
||||
redis_url=settings.rq_redis_url,
|
||||
)
|
||||
|
||||
|
||||
def requeue_lifecycle_queue_task(task: QueuedTask, *, delay_seconds: float = 0) -> bool:
|
||||
"""Requeue a failed lifecycle task with capped retries."""
|
||||
return generic_requeue_if_failed(
|
||||
task,
|
||||
settings.rq_queue_name,
|
||||
max_retries=settings.rq_dispatch_max_retries,
|
||||
redis_url=settings.rq_redis_url,
|
||||
delay_seconds=max(0.0, delay_seconds),
|
||||
)
|
||||
140
backend/app/services/openclaw/lifecycle_reconcile.py
Normal file
140
backend/app/services/openclaw/lifecycle_reconcile.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""Worker handlers for lifecycle reconciliation tasks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from app.core.logging import get_logger
|
||||
from app.core.time import utcnow
|
||||
from app.db.session import async_session_maker
|
||||
from app.models.agents import Agent
|
||||
from app.models.boards import Board
|
||||
from app.models.gateways import Gateway
|
||||
from app.services.openclaw.constants import MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN
|
||||
from app.services.openclaw.lifecycle_orchestrator import AgentLifecycleOrchestrator
|
||||
from app.services.openclaw.lifecycle_queue import decode_lifecycle_task, defer_lifecycle_reconcile
|
||||
from app.services.queue import QueuedTask
|
||||
|
||||
logger = get_logger(__name__)
|
||||
_RECONCILE_TIMEOUT_SECONDS = 60.0
|
||||
|
||||
|
||||
def _has_checked_in_since_wake(agent: Agent) -> bool:
|
||||
if agent.last_seen_at is None:
|
||||
return False
|
||||
if agent.last_wake_sent_at is None:
|
||||
return True
|
||||
return agent.last_seen_at >= agent.last_wake_sent_at
|
||||
|
||||
|
||||
async def process_lifecycle_queue_task(task: QueuedTask) -> None:
|
||||
"""Re-run lifecycle provisioning when an agent misses post-provision check-in."""
|
||||
payload = decode_lifecycle_task(task)
|
||||
now = utcnow()
|
||||
|
||||
async with async_session_maker() as session:
|
||||
agent = await Agent.objects.by_id(payload.agent_id).first(session)
|
||||
if agent is None:
|
||||
logger.info(
|
||||
"lifecycle.reconcile.skip_missing_agent",
|
||||
extra={"agent_id": str(payload.agent_id)},
|
||||
)
|
||||
return
|
||||
|
||||
# Ignore stale queue messages after a newer lifecycle generation.
|
||||
if agent.lifecycle_generation != payload.generation:
|
||||
logger.info(
|
||||
"lifecycle.reconcile.skip_stale_generation",
|
||||
extra={
|
||||
"agent_id": str(agent.id),
|
||||
"queued_generation": payload.generation,
|
||||
"current_generation": agent.lifecycle_generation,
|
||||
},
|
||||
)
|
||||
return
|
||||
|
||||
if _has_checked_in_since_wake(agent):
|
||||
logger.info(
|
||||
"lifecycle.reconcile.skip_not_stuck",
|
||||
extra={"agent_id": str(agent.id), "status": agent.status},
|
||||
)
|
||||
return
|
||||
|
||||
deadline = agent.checkin_deadline_at or payload.checkin_deadline_at
|
||||
if agent.status == "deleting":
|
||||
logger.info(
|
||||
"lifecycle.reconcile.skip_deleting",
|
||||
extra={"agent_id": str(agent.id)},
|
||||
)
|
||||
return
|
||||
|
||||
if now < deadline:
|
||||
delay = max(0.0, (deadline - now).total_seconds())
|
||||
if not defer_lifecycle_reconcile(task, delay_seconds=delay):
|
||||
msg = "Failed to defer lifecycle reconcile task"
|
||||
raise RuntimeError(msg)
|
||||
logger.info(
|
||||
"lifecycle.reconcile.deferred",
|
||||
extra={"agent_id": str(agent.id), "delay_seconds": delay},
|
||||
)
|
||||
return
|
||||
|
||||
if agent.wake_attempts >= MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN:
|
||||
agent.status = "offline"
|
||||
agent.checkin_deadline_at = None
|
||||
agent.last_provision_error = (
|
||||
"Agent did not check in after wake; max wake attempts reached"
|
||||
)
|
||||
agent.updated_at = utcnow()
|
||||
session.add(agent)
|
||||
await session.commit()
|
||||
logger.warning(
|
||||
"lifecycle.reconcile.max_attempts_reached",
|
||||
extra={
|
||||
"agent_id": str(agent.id),
|
||||
"wake_attempts": agent.wake_attempts,
|
||||
"max_attempts": MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN,
|
||||
},
|
||||
)
|
||||
return
|
||||
|
||||
gateway = await Gateway.objects.by_id(agent.gateway_id).first(session)
|
||||
if gateway is None:
|
||||
logger.warning(
|
||||
"lifecycle.reconcile.skip_missing_gateway",
|
||||
extra={"agent_id": str(agent.id), "gateway_id": str(agent.gateway_id)},
|
||||
)
|
||||
return
|
||||
board: Board | None = None
|
||||
if agent.board_id is not None:
|
||||
board = await Board.objects.by_id(agent.board_id).first(session)
|
||||
if board is None:
|
||||
logger.warning(
|
||||
"lifecycle.reconcile.skip_missing_board",
|
||||
extra={"agent_id": str(agent.id), "board_id": str(agent.board_id)},
|
||||
)
|
||||
return
|
||||
|
||||
orchestrator = AgentLifecycleOrchestrator(session)
|
||||
await asyncio.wait_for(
|
||||
orchestrator.run_lifecycle(
|
||||
gateway=gateway,
|
||||
agent_id=agent.id,
|
||||
board=board,
|
||||
user=None,
|
||||
action="update",
|
||||
auth_token=None,
|
||||
force_bootstrap=False,
|
||||
reset_session=True,
|
||||
wake=True,
|
||||
deliver_wakeup=True,
|
||||
wakeup_verb="updated",
|
||||
clear_confirm_token=True,
|
||||
raise_gateway_errors=True,
|
||||
),
|
||||
timeout=_RECONCILE_TIMEOUT_SECONDS,
|
||||
)
|
||||
logger.info(
|
||||
"lifecycle.reconcile.retriggered",
|
||||
extra={"agent_id": str(agent.id), "generation": payload.generation},
|
||||
)
|
||||
@@ -52,8 +52,6 @@ from app.services.openclaw.constants import (
|
||||
OFFLINE_AFTER,
|
||||
)
|
||||
from app.services.openclaw.db_agent_state import (
|
||||
mark_provision_complete,
|
||||
mark_provision_requested,
|
||||
mint_agent_token,
|
||||
)
|
||||
from app.services.openclaw.db_service import OpenClawDBService
|
||||
@@ -74,6 +72,7 @@ from app.services.openclaw.internal.session_keys import (
|
||||
board_agent_session_key,
|
||||
board_lead_session_key,
|
||||
)
|
||||
from app.services.openclaw.lifecycle_orchestrator import AgentLifecycleOrchestrator
|
||||
from app.services.openclaw.policies import OpenClawAuthorizationPolicy
|
||||
from app.services.openclaw.provisioning import (
|
||||
OpenClawGatewayControlPlane,
|
||||
@@ -143,7 +142,6 @@ class OpenClawProvisioningService(OpenClawDBService):
|
||||
|
||||
def __init__(self, session: AsyncSession) -> None:
|
||||
super().__init__(session)
|
||||
self._gateway = OpenClawGatewayProvisioner()
|
||||
|
||||
@staticmethod
|
||||
def lead_session_key(board: Board) -> str:
|
||||
@@ -213,25 +211,25 @@ class OpenClawProvisioningService(OpenClawDBService):
|
||||
openclaw_session_id=self.lead_session_key(board),
|
||||
)
|
||||
raw_token = mint_agent_token(agent)
|
||||
mark_provision_requested(agent, action=config_options.action, status="provisioning")
|
||||
await self.add_commit_refresh(agent)
|
||||
|
||||
# Strict behavior: provisioning errors surface to the caller. The DB row exists
|
||||
# so a later retry can succeed with the same deterministic identity/session key.
|
||||
await self._gateway.apply_agent_lifecycle(
|
||||
agent=agent,
|
||||
agent = await AgentLifecycleOrchestrator(self.session).run_lifecycle(
|
||||
gateway=request.gateway,
|
||||
agent_id=agent.id,
|
||||
board=board,
|
||||
auth_token=raw_token,
|
||||
user=request.user,
|
||||
action=config_options.action,
|
||||
auth_token=raw_token,
|
||||
force_bootstrap=False,
|
||||
reset_session=False,
|
||||
wake=True,
|
||||
deliver_wakeup=True,
|
||||
wakeup_verb=None,
|
||||
clear_confirm_token=False,
|
||||
raise_gateway_errors=True,
|
||||
)
|
||||
|
||||
mark_provision_complete(agent, status="online")
|
||||
await self.add_commit_refresh(agent)
|
||||
|
||||
return agent, True
|
||||
|
||||
async def sync_gateway_templates(
|
||||
@@ -298,7 +296,6 @@ class OpenClawProvisioningService(OpenClawDBService):
|
||||
control_plane=control_plane,
|
||||
backoff=GatewayBackoff(timeout_s=10 * 60, timeout_context="template sync"),
|
||||
options=options,
|
||||
provisioner=self._gateway,
|
||||
)
|
||||
if not await _ping_gateway(ctx, result):
|
||||
return result
|
||||
@@ -352,7 +349,6 @@ class _SyncContext:
|
||||
control_plane: OpenClawGatewayControlPlane
|
||||
backoff: GatewayBackoff
|
||||
options: GatewayTemplateSyncOptions
|
||||
provisioner: OpenClawGatewayProvisioner
|
||||
|
||||
|
||||
def _parse_tools_md(content: str) -> dict[str, str]:
|
||||
@@ -584,18 +580,26 @@ async def _sync_one_agent(
|
||||
try:
|
||||
|
||||
async def _do_provision() -> bool:
|
||||
await ctx.provisioner.apply_agent_lifecycle(
|
||||
agent=agent,
|
||||
gateway=ctx.gateway,
|
||||
board=board,
|
||||
auth_token=auth_token,
|
||||
user=ctx.options.user,
|
||||
action="update",
|
||||
force_bootstrap=ctx.options.force_bootstrap,
|
||||
overwrite=ctx.options.overwrite,
|
||||
reset_session=ctx.options.reset_sessions,
|
||||
wake=False,
|
||||
)
|
||||
try:
|
||||
await AgentLifecycleOrchestrator(ctx.session).run_lifecycle(
|
||||
gateway=ctx.gateway,
|
||||
agent_id=agent.id,
|
||||
board=board,
|
||||
user=ctx.options.user,
|
||||
action="update",
|
||||
auth_token=auth_token,
|
||||
force_bootstrap=ctx.options.force_bootstrap,
|
||||
reset_session=ctx.options.reset_sessions,
|
||||
wake=False,
|
||||
deliver_wakeup=False,
|
||||
wakeup_verb="updated",
|
||||
clear_confirm_token=False,
|
||||
raise_gateway_errors=True,
|
||||
)
|
||||
except HTTPException as exc:
|
||||
if exc.status_code == status.HTTP_502_BAD_GATEWAY:
|
||||
raise OpenClawGatewayError(str(exc.detail)) from exc
|
||||
raise
|
||||
return True
|
||||
|
||||
await ctx.backoff.run(_do_provision)
|
||||
@@ -613,6 +617,15 @@ async def _sync_one_agent(
|
||||
message=f"Failed to sync templates: {exc}",
|
||||
)
|
||||
return False
|
||||
except HTTPException as exc:
|
||||
result.agents_skipped += 1
|
||||
_append_sync_error(
|
||||
result,
|
||||
agent=agent,
|
||||
board=board,
|
||||
message=f"Failed to sync templates: {exc.detail}",
|
||||
)
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
@@ -655,18 +668,26 @@ async def _sync_main_agent(
|
||||
try:
|
||||
|
||||
async def _do_provision_main() -> bool:
|
||||
await ctx.provisioner.apply_agent_lifecycle(
|
||||
agent=main_agent,
|
||||
gateway=ctx.gateway,
|
||||
board=None,
|
||||
auth_token=token,
|
||||
user=ctx.options.user,
|
||||
action="update",
|
||||
force_bootstrap=ctx.options.force_bootstrap,
|
||||
overwrite=ctx.options.overwrite,
|
||||
reset_session=ctx.options.reset_sessions,
|
||||
wake=False,
|
||||
)
|
||||
try:
|
||||
await AgentLifecycleOrchestrator(ctx.session).run_lifecycle(
|
||||
gateway=ctx.gateway,
|
||||
agent_id=main_agent.id,
|
||||
board=None,
|
||||
user=ctx.options.user,
|
||||
action="update",
|
||||
auth_token=token,
|
||||
force_bootstrap=ctx.options.force_bootstrap,
|
||||
reset_session=ctx.options.reset_sessions,
|
||||
wake=False,
|
||||
deliver_wakeup=False,
|
||||
wakeup_verb="updated",
|
||||
clear_confirm_token=False,
|
||||
raise_gateway_errors=True,
|
||||
)
|
||||
except HTTPException as exc:
|
||||
if exc.status_code == status.HTTP_502_BAD_GATEWAY:
|
||||
raise OpenClawGatewayError(str(exc.detail)) from exc
|
||||
raise
|
||||
return True
|
||||
|
||||
await ctx.backoff.run(_do_provision_main)
|
||||
@@ -679,6 +700,12 @@ async def _sync_main_agent(
|
||||
agent=main_agent,
|
||||
message=f"Failed to sync gateway agent templates: {exc}",
|
||||
)
|
||||
except HTTPException as exc:
|
||||
_append_sync_error(
|
||||
result,
|
||||
agent=main_agent,
|
||||
message=f"Failed to sync gateway agent templates: {exc.detail}",
|
||||
)
|
||||
else:
|
||||
result.main_updated = True
|
||||
return stop_sync
|
||||
@@ -1038,7 +1065,6 @@ class AgentLifecycleService(OpenClawDBService):
|
||||
) -> tuple[Agent, str]:
|
||||
agent = Agent.model_validate(data)
|
||||
raw_token = mint_agent_token(agent)
|
||||
mark_provision_requested(agent, action="provision", status="provisioning")
|
||||
agent.openclaw_session_id = self.resolve_session_key(agent)
|
||||
await self.add_commit_refresh(agent)
|
||||
return agent, raw_token
|
||||
@@ -1068,92 +1094,63 @@ class AgentLifecycleService(OpenClawDBService):
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="board is required for non-main agent provisioning",
|
||||
)
|
||||
template_user = user
|
||||
if target.is_main_agent and template_user is None:
|
||||
template_user = await get_org_owner_user(
|
||||
self.session,
|
||||
organization_id=target.gateway.organization_id,
|
||||
)
|
||||
if template_user is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
|
||||
detail=(
|
||||
"User context is required to provision the gateway main agent "
|
||||
"(org owner not found)."
|
||||
),
|
||||
)
|
||||
await OpenClawGatewayProvisioner().apply_agent_lifecycle(
|
||||
agent=agent,
|
||||
provisioned = await AgentLifecycleOrchestrator(self.session).run_lifecycle(
|
||||
gateway=target.gateway,
|
||||
agent_id=agent.id,
|
||||
board=target.board if not target.is_main_agent else None,
|
||||
auth_token=auth_token,
|
||||
user=template_user,
|
||||
user=user,
|
||||
action=action,
|
||||
auth_token=auth_token,
|
||||
force_bootstrap=force_bootstrap,
|
||||
reset_session=True,
|
||||
wake=True,
|
||||
deliver_wakeup=True,
|
||||
wakeup_verb=wakeup_verb,
|
||||
clear_confirm_token=True,
|
||||
raise_gateway_errors=raise_gateway_errors,
|
||||
)
|
||||
mark_provision_complete(agent, status="online", clear_confirm_token=True)
|
||||
self.session.add(agent)
|
||||
await self.session.commit()
|
||||
record_activity(
|
||||
self.session,
|
||||
event_type=f"agent.{action}.direct",
|
||||
message=f"{action.capitalize()}d directly for {agent.name}.",
|
||||
agent_id=agent.id,
|
||||
message=f"{action.capitalize()}d directly for {provisioned.name}.",
|
||||
agent_id=provisioned.id,
|
||||
)
|
||||
record_activity(
|
||||
self.session,
|
||||
event_type="agent.wakeup.sent",
|
||||
message=f"Wakeup message sent to {agent.name}.",
|
||||
agent_id=agent.id,
|
||||
message=f"Wakeup message sent to {provisioned.name}.",
|
||||
agent_id=provisioned.id,
|
||||
)
|
||||
await self.session.commit()
|
||||
self.logger.info(
|
||||
"agent.provision.success action=%s agent_id=%s",
|
||||
action,
|
||||
agent.id,
|
||||
provisioned.id,
|
||||
)
|
||||
except OpenClawGatewayError as exc:
|
||||
except HTTPException as exc:
|
||||
self.record_instruction_failure(
|
||||
self.session,
|
||||
agent,
|
||||
str(exc),
|
||||
str(exc.detail),
|
||||
action,
|
||||
)
|
||||
await self.session.commit()
|
||||
self.logger.error(
|
||||
"agent.provision.gateway_error action=%s agent_id=%s error=%s",
|
||||
action,
|
||||
agent.id,
|
||||
str(exc),
|
||||
)
|
||||
if exc.status_code == status.HTTP_502_BAD_GATEWAY:
|
||||
self.logger.error(
|
||||
"agent.provision.gateway_error action=%s agent_id=%s error=%s",
|
||||
action,
|
||||
agent.id,
|
||||
str(exc.detail),
|
||||
)
|
||||
else:
|
||||
self.logger.critical(
|
||||
"agent.provision.runtime_error action=%s agent_id=%s error=%s",
|
||||
action,
|
||||
agent.id,
|
||||
str(exc.detail),
|
||||
)
|
||||
if raise_gateway_errors:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Gateway {action} failed: {exc}",
|
||||
) from exc
|
||||
except (OSError, RuntimeError, ValueError) as exc: # pragma: no cover
|
||||
self.record_instruction_failure(
|
||||
self.session,
|
||||
agent,
|
||||
str(exc),
|
||||
action,
|
||||
)
|
||||
await self.session.commit()
|
||||
self.logger.critical(
|
||||
"agent.provision.runtime_error action=%s agent_id=%s error=%s",
|
||||
action,
|
||||
agent.id,
|
||||
str(exc),
|
||||
)
|
||||
if raise_gateway_errors:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Unexpected error {action}ing agent provisioning.",
|
||||
) from exc
|
||||
raise
|
||||
|
||||
async def provision_new_agent(
|
||||
self,
|
||||
@@ -1315,7 +1312,6 @@ class AgentLifecycleService(OpenClawDBService):
|
||||
@staticmethod
|
||||
def mark_agent_update_pending(agent: Agent) -> str:
|
||||
raw_token = mint_agent_token(agent)
|
||||
mark_provision_requested(agent, action="update", status="updating")
|
||||
return raw_token
|
||||
|
||||
async def provision_updated_agent(
|
||||
@@ -1390,7 +1386,6 @@ class AgentLifecycleService(OpenClawDBService):
|
||||
return
|
||||
|
||||
raw_token = mint_agent_token(agent)
|
||||
mark_provision_requested(agent, action="provision", status="provisioning")
|
||||
await self.add_commit_refresh(agent)
|
||||
board = await self.require_board(
|
||||
str(agent.board_id) if agent.board_id else None,
|
||||
@@ -1436,6 +1431,10 @@ class AgentLifecycleService(OpenClawDBService):
|
||||
elif agent.status == "provisioning":
|
||||
agent.status = "online"
|
||||
agent.last_seen_at = utcnow()
|
||||
# Successful check-in ends the current wake escalation cycle.
|
||||
agent.wake_attempts = 0
|
||||
agent.checkin_deadline_at = None
|
||||
agent.last_provision_error = None
|
||||
agent.updated_at = utcnow()
|
||||
self.record_heartbeat(self.session, agent)
|
||||
self.session.add(agent)
|
||||
|
||||
Reference in New Issue
Block a user