Files
openclaw-mission-control/backend/app/services/openclaw/provisioning_db.py

719 lines
22 KiB
Python

"""DB-backed OpenClaw provisioning orchestration.
Layering:
- `app.services.openclaw.provisioning` contains gateway-only lifecycle operations (no DB calls).
- This module builds on top of that layer using AsyncSession for token rotation, lead-agent records,
and bulk template synchronization.
"""
from __future__ import annotations
import asyncio
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, TypeVar
from uuid import UUID
from sqlalchemy import func
from sqlmodel import col, select
from app.core.agent_tokens import generate_agent_token, hash_agent_token, verify_agent_token
from app.core.time import utcnow
from app.integrations.openclaw_gateway import GatewayConfig as GatewayClientConfig
from app.integrations.openclaw_gateway import OpenClawGatewayError
from app.models.agents import Agent
from app.models.board_memory import BoardMemory
from app.models.boards import Board
from app.models.gateways import Gateway
from app.schemas.gateways import GatewayTemplatesSyncError, GatewayTemplatesSyncResult
from app.services.openclaw.constants import (
_NON_TRANSIENT_GATEWAY_ERROR_MARKERS,
_SECURE_RANDOM,
_TOOLS_KV_RE,
_TRANSIENT_GATEWAY_ERROR_MARKERS,
DEFAULT_HEARTBEAT_CONFIG,
)
from app.services.openclaw.internal import agent_key as _agent_key
from app.services.openclaw.provisioning import (
OpenClawGatewayControlPlane,
OpenClawGatewayProvisioner,
)
from app.services.openclaw.shared import GatewayAgentIdentity
from app.services.organizations import get_org_owner_user
if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
from sqlmodel.ext.asyncio.session import AsyncSession
from app.models.users import User
_T = TypeVar("_T")
@dataclass(frozen=True)
class GatewayTemplateSyncOptions:
"""Runtime options controlling gateway template synchronization."""
user: User | None
include_main: bool = True
reset_sessions: bool = False
rotate_tokens: bool = False
force_bootstrap: bool = False
board_id: UUID | None = None
@dataclass(frozen=True, slots=True)
class LeadAgentOptions:
"""Optional overrides for board-lead provisioning behavior."""
agent_name: str | None = None
identity_profile: dict[str, str] | None = None
action: str = "provision"
@dataclass(frozen=True, slots=True)
class LeadAgentRequest:
"""Inputs required to ensure or provision a board lead agent."""
board: Board
gateway: Gateway
config: GatewayClientConfig
user: User | None
options: LeadAgentOptions = field(default_factory=LeadAgentOptions)
class OpenClawProvisioningService:
"""DB-backed provisioning workflows (bulk template sync, lead-agent record)."""
def __init__(self, session: AsyncSession) -> None:
self._session = session
self._gateway = OpenClawGatewayProvisioner()
@property
def session(self) -> AsyncSession:
return self._session
@staticmethod
def lead_session_key(board: Board) -> str:
return f"agent:lead-{board.id}:main"
@staticmethod
def lead_agent_name(_: Board) -> str:
return "Lead Agent"
async def ensure_board_lead_agent(
self,
*,
request: LeadAgentRequest,
) -> tuple[Agent, bool]:
"""Ensure a board has a lead agent; return `(agent, created)`."""
board = request.board
config_options = request.options
existing = (
await self.session.exec(
select(Agent)
.where(Agent.board_id == board.id)
.where(col(Agent.is_board_lead).is_(True)),
)
).first()
if existing:
desired_name = config_options.agent_name or self.lead_agent_name(board)
changed = False
if existing.name != desired_name:
existing.name = desired_name
changed = True
if existing.gateway_id != request.gateway.id:
existing.gateway_id = request.gateway.id
changed = True
desired_session_key = self.lead_session_key(board)
if existing.openclaw_session_id != desired_session_key:
existing.openclaw_session_id = desired_session_key
changed = True
if changed:
existing.updated_at = utcnow()
self.session.add(existing)
await self.session.commit()
await self.session.refresh(existing)
return existing, False
merged_identity_profile: dict[str, Any] = {
"role": "Board Lead",
"communication_style": "direct, concise, practical",
"emoji": ":gear:",
}
if config_options.identity_profile:
merged_identity_profile.update(
{
key: value.strip()
for key, value in config_options.identity_profile.items()
if value.strip()
},
)
agent = Agent(
name=config_options.agent_name or self.lead_agent_name(board),
status="provisioning",
board_id=board.id,
gateway_id=request.gateway.id,
is_board_lead=True,
heartbeat_config=DEFAULT_HEARTBEAT_CONFIG.copy(),
identity_profile=merged_identity_profile,
openclaw_session_id=self.lead_session_key(board),
provision_requested_at=utcnow(),
provision_action=config_options.action,
)
raw_token = generate_agent_token()
agent.agent_token_hash = hash_agent_token(raw_token)
self.session.add(agent)
await self.session.commit()
await self.session.refresh(agent)
# Strict behavior: provisioning errors surface to the caller. The DB row exists
# so a later retry can succeed with the same deterministic identity/session key.
await self._gateway.apply_agent_lifecycle(
agent=agent,
gateway=request.gateway,
board=board,
auth_token=raw_token,
user=request.user,
action=config_options.action,
wake=True,
deliver_wakeup=True,
)
agent.status = "online"
agent.provision_requested_at = None
agent.provision_action = None
agent.updated_at = utcnow()
self.session.add(agent)
await self.session.commit()
await self.session.refresh(agent)
return agent, True
async def sync_gateway_templates(
self,
gateway: Gateway,
options: GatewayTemplateSyncOptions,
) -> GatewayTemplatesSyncResult:
"""Synchronize AGENTS/TOOLS/etc templates to gateway-connected agents."""
template_user = options.user
if template_user is None:
template_user = await get_org_owner_user(
self.session,
organization_id=gateway.organization_id,
)
options = GatewayTemplateSyncOptions(
user=template_user,
include_main=options.include_main,
reset_sessions=options.reset_sessions,
rotate_tokens=options.rotate_tokens,
force_bootstrap=options.force_bootstrap,
board_id=options.board_id,
)
result = _base_result(
gateway,
include_main=options.include_main,
reset_sessions=options.reset_sessions,
)
if not gateway.url:
_append_sync_error(
result,
message="Gateway URL is not configured for this gateway.",
)
return result
control_plane = OpenClawGatewayControlPlane(
GatewayClientConfig(url=gateway.url, token=gateway.token),
)
ctx = _SyncContext(
session=self.session,
gateway=gateway,
control_plane=control_plane,
backoff=_GatewayBackoff(timeout_s=10 * 60, timeout_context="template sync"),
options=options,
provisioner=self._gateway,
)
if not await _ping_gateway(ctx, result):
return result
boards = await Board.objects.filter_by(gateway_id=gateway.id).all(self.session)
boards_by_id = _boards_by_id(boards, board_id=options.board_id)
if boards_by_id is None:
_append_sync_error(
result,
message="Board does not belong to this gateway.",
)
return result
paused_board_ids = await _paused_board_ids(self.session, list(boards_by_id.keys()))
if boards_by_id:
agents = await (
Agent.objects.by_field_in("board_id", list(boards_by_id.keys()))
.order_by(col(Agent.created_at).asc())
.all(self.session)
)
else:
agents = []
stop_sync = False
for agent in agents:
board = boards_by_id.get(agent.board_id) if agent.board_id is not None else None
if board is None:
result.agents_skipped += 1
_append_sync_error(
result,
agent=agent,
message="Skipping agent: board not found for agent.",
)
continue
if board.id in paused_board_ids:
result.agents_skipped += 1
continue
stop_sync = await _sync_one_agent(ctx, result, agent, board)
if stop_sync:
break
if not stop_sync and options.include_main:
await _sync_main_agent(ctx, result)
return result
@dataclass(frozen=True)
class _SyncContext:
session: AsyncSession
gateway: Gateway
control_plane: OpenClawGatewayControlPlane
backoff: _GatewayBackoff
options: GatewayTemplateSyncOptions
provisioner: OpenClawGatewayProvisioner
def _is_transient_gateway_error(exc: Exception) -> bool:
if not isinstance(exc, OpenClawGatewayError):
return False
message = str(exc).lower()
if not message:
return False
if any(marker in message for marker in _NON_TRANSIENT_GATEWAY_ERROR_MARKERS):
return False
return ("503" in message and "websocket" in message) or any(
marker in message for marker in _TRANSIENT_GATEWAY_ERROR_MARKERS
)
def _gateway_timeout_message(
exc: OpenClawGatewayError,
*,
timeout_s: float,
context: str,
) -> str:
rounded_timeout = int(timeout_s)
timeout_text = f"{rounded_timeout} seconds"
if rounded_timeout >= 120:
timeout_text = f"{rounded_timeout // 60} minutes"
return f"Gateway unreachable after {timeout_text} ({context} timeout). Last error: {exc}"
class _GatewayBackoff:
def __init__(
self,
*,
timeout_s: float = 10 * 60,
base_delay_s: float = 0.75,
max_delay_s: float = 30.0,
jitter: float = 0.2,
timeout_context: str = "gateway operation",
) -> None:
self._timeout_s = timeout_s
self._base_delay_s = base_delay_s
self._max_delay_s = max_delay_s
self._jitter = jitter
self._timeout_context = timeout_context
self._delay_s = base_delay_s
def reset(self) -> None:
self._delay_s = self._base_delay_s
@staticmethod
async def _attempt(
fn: Callable[[], Awaitable[_T]],
) -> tuple[_T | None, OpenClawGatewayError | None]:
try:
return await fn(), None
except OpenClawGatewayError as exc:
return None, exc
async def run(self, fn: Callable[[], Awaitable[_T]]) -> _T:
deadline_s = asyncio.get_running_loop().time() + self._timeout_s
while True:
value, error = await self._attempt(fn)
if error is not None:
exc = error
if not _is_transient_gateway_error(exc):
raise exc
now = asyncio.get_running_loop().time()
remaining = deadline_s - now
if remaining <= 0:
raise TimeoutError(
_gateway_timeout_message(
exc,
timeout_s=self._timeout_s,
context=self._timeout_context,
),
) from exc
sleep_s = min(self._delay_s, remaining)
if self._jitter:
sleep_s *= 1.0 + _SECURE_RANDOM.uniform(
-self._jitter,
self._jitter,
)
sleep_s = max(0.0, min(sleep_s, remaining))
await asyncio.sleep(sleep_s)
self._delay_s = min(self._delay_s * 2.0, self._max_delay_s)
continue
self.reset()
if value is None:
msg = "Gateway retry produced no value without an error"
raise RuntimeError(msg)
return value
async def _with_gateway_retry(
fn: Callable[[], Awaitable[_T]],
*,
backoff: _GatewayBackoff,
) -> _T:
return await backoff.run(fn)
def _parse_tools_md(content: str) -> dict[str, str]:
values: dict[str, str] = {}
for raw in content.splitlines():
line = raw.strip()
if not line or line.startswith("#"):
continue
match = _TOOLS_KV_RE.match(line)
if not match:
continue
values[match.group("key")] = match.group("value").strip()
return values
async def _get_agent_file(
*,
agent_gateway_id: str,
name: str,
control_plane: OpenClawGatewayControlPlane,
backoff: _GatewayBackoff | None = None,
) -> str | None:
try:
async def _do_get() -> object:
return await control_plane.get_agent_file_payload(agent_id=agent_gateway_id, name=name)
payload = await (backoff.run(_do_get) if backoff else _do_get())
except OpenClawGatewayError:
return None
if isinstance(payload, str):
return payload
if isinstance(payload, dict):
content = payload.get("content")
if isinstance(content, str):
return content
file_obj = payload.get("file")
if isinstance(file_obj, dict):
nested = file_obj.get("content")
if isinstance(nested, str):
return nested
return None
async def _get_existing_auth_token(
*,
agent_gateway_id: str,
control_plane: OpenClawGatewayControlPlane,
backoff: _GatewayBackoff | None = None,
) -> str | None:
tools = await _get_agent_file(
agent_gateway_id=agent_gateway_id,
name="TOOLS.md",
control_plane=control_plane,
backoff=backoff,
)
if not tools:
return None
values = _parse_tools_md(tools)
token = values.get("AUTH_TOKEN")
if not token:
return None
token = token.strip()
return token or None
async def _paused_board_ids(session: AsyncSession, board_ids: list[UUID]) -> set[UUID]:
if not board_ids:
return set()
commands = {"/pause", "/resume"}
statement = (
select(BoardMemory.board_id, BoardMemory.content)
.where(col(BoardMemory.board_id).in_(board_ids))
.where(col(BoardMemory.is_chat).is_(True))
.where(func.lower(func.trim(col(BoardMemory.content))).in_(commands))
.order_by(col(BoardMemory.board_id), col(BoardMemory.created_at).desc())
# Postgres: DISTINCT ON (board_id) to get latest command per board.
.distinct(col(BoardMemory.board_id))
)
paused: set[UUID] = set()
for board_id, content in await session.exec(statement):
cmd = (content or "").strip().lower()
if cmd == "/pause":
paused.add(board_id)
return paused
def _append_sync_error(
result: GatewayTemplatesSyncResult,
*,
message: str,
agent: Agent | None = None,
board: Board | None = None,
) -> None:
result.errors.append(
GatewayTemplatesSyncError(
agent_id=agent.id if agent else None,
agent_name=agent.name if agent else None,
board_id=board.id if board else None,
message=message,
),
)
async def _rotate_agent_token(session: AsyncSession, agent: Agent) -> str:
token = generate_agent_token()
agent.agent_token_hash = hash_agent_token(token)
agent.updated_at = utcnow()
session.add(agent)
await session.commit()
await session.refresh(agent)
return token
async def _ping_gateway(ctx: _SyncContext, result: GatewayTemplatesSyncResult) -> bool:
try:
async def _do_ping() -> object:
return await ctx.control_plane.health()
await ctx.backoff.run(_do_ping)
except (TimeoutError, OpenClawGatewayError) as exc:
_append_sync_error(result, message=str(exc))
return False
else:
return True
def _base_result(
gateway: Gateway,
*,
include_main: bool,
reset_sessions: bool,
) -> GatewayTemplatesSyncResult:
return GatewayTemplatesSyncResult(
gateway_id=gateway.id,
include_main=include_main,
reset_sessions=reset_sessions,
agents_updated=0,
agents_skipped=0,
main_updated=False,
)
def _boards_by_id(
boards: list[Board],
*,
board_id: UUID | None,
) -> dict[UUID, Board] | None:
boards_by_id = {board.id: board for board in boards}
if board_id is None:
return boards_by_id
board = boards_by_id.get(board_id)
if board is None:
return None
return {board_id: board}
async def _resolve_agent_auth_token(
ctx: _SyncContext,
result: GatewayTemplatesSyncResult,
agent: Agent,
board: Board | None,
*,
agent_gateway_id: str,
) -> tuple[str | None, bool]:
try:
auth_token = await _get_existing_auth_token(
agent_gateway_id=agent_gateway_id,
control_plane=ctx.control_plane,
backoff=ctx.backoff,
)
except TimeoutError as exc:
_append_sync_error(result, agent=agent, board=board, message=str(exc))
return None, True
if not auth_token:
if not ctx.options.rotate_tokens:
result.agents_skipped += 1
_append_sync_error(
result,
agent=agent,
board=board,
message=(
"Skipping agent: unable to read AUTH_TOKEN from TOOLS.md "
"(run with rotate_tokens=true to re-key)."
),
)
return None, False
auth_token = await _rotate_agent_token(ctx.session, agent)
if agent.agent_token_hash and not verify_agent_token(
auth_token,
agent.agent_token_hash,
):
if ctx.options.rotate_tokens:
auth_token = await _rotate_agent_token(ctx.session, agent)
else:
_append_sync_error(
result,
agent=agent,
board=board,
message=(
"Warning: AUTH_TOKEN in TOOLS.md does not match backend "
"token hash (agent auth may be broken)."
),
)
return auth_token, False
async def _sync_one_agent(
ctx: _SyncContext,
result: GatewayTemplatesSyncResult,
agent: Agent,
board: Board,
) -> bool:
auth_token, fatal = await _resolve_agent_auth_token(
ctx,
result,
agent,
board,
agent_gateway_id=_agent_key(agent),
)
if fatal:
return True
if not auth_token:
return False
try:
async def _do_provision() -> bool:
await ctx.provisioner.apply_agent_lifecycle(
agent=agent,
gateway=ctx.gateway,
board=board,
auth_token=auth_token,
user=ctx.options.user,
action="update",
force_bootstrap=ctx.options.force_bootstrap,
reset_session=ctx.options.reset_sessions,
wake=False,
)
return True
await _with_gateway_retry(_do_provision, backoff=ctx.backoff)
result.agents_updated += 1
except TimeoutError as exc: # pragma: no cover - gateway/network dependent
result.agents_skipped += 1
_append_sync_error(result, agent=agent, board=board, message=str(exc))
return True
except (OSError, RuntimeError, ValueError) as exc: # pragma: no cover
result.agents_skipped += 1
_append_sync_error(
result,
agent=agent,
board=board,
message=f"Failed to sync templates: {exc}",
)
return False
else:
return False
async def _sync_main_agent(
ctx: _SyncContext,
result: GatewayTemplatesSyncResult,
) -> bool:
main_agent = (
await Agent.objects.all()
.filter(col(Agent.gateway_id) == ctx.gateway.id)
.filter(col(Agent.board_id).is_(None))
.first(ctx.session)
)
if main_agent is None:
_append_sync_error(
result,
message="Gateway agent record not found; skipping gateway agent template sync.",
)
return True
main_gateway_agent_id = GatewayAgentIdentity.openclaw_agent_id(ctx.gateway)
token, fatal = await _resolve_agent_auth_token(
ctx,
result,
main_agent,
board=None,
agent_gateway_id=main_gateway_agent_id,
)
if fatal:
return True
if not token:
_append_sync_error(
result,
agent=main_agent,
message="Skipping gateway agent: unable to read AUTH_TOKEN from TOOLS.md.",
)
return True
stop_sync = False
try:
async def _do_provision_main() -> bool:
await ctx.provisioner.apply_agent_lifecycle(
agent=main_agent,
gateway=ctx.gateway,
board=None,
auth_token=token,
user=ctx.options.user,
action="update",
force_bootstrap=ctx.options.force_bootstrap,
reset_session=ctx.options.reset_sessions,
wake=False,
)
return True
await _with_gateway_retry(_do_provision_main, backoff=ctx.backoff)
except TimeoutError as exc: # pragma: no cover - gateway/network dependent
_append_sync_error(result, agent=main_agent, message=str(exc))
stop_sync = True
except (OSError, RuntimeError, ValueError) as exc: # pragma: no cover
_append_sync_error(
result,
agent=main_agent,
message=f"Failed to sync gateway agent templates: {exc}",
)
else:
result.main_updated = True
return stop_sync