Files
openclaw-mission-control/backend/app/services/openclaw/lifecycle_reconcile.py

141 lines
5.1 KiB
Python

"""Worker handlers for lifecycle reconciliation tasks."""
from __future__ import annotations
import asyncio
from app.core.logging import get_logger
from app.core.time import utcnow
from app.db.session import async_session_maker
from app.models.agents import Agent
from app.models.boards import Board
from app.models.gateways import Gateway
from app.services.openclaw.constants import MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN
from app.services.openclaw.lifecycle_orchestrator import AgentLifecycleOrchestrator
from app.services.openclaw.lifecycle_queue import decode_lifecycle_task, defer_lifecycle_reconcile
from app.services.queue import QueuedTask
# Logger obtained for this module; the handler below emits its
# "lifecycle.reconcile.*" events through it.
logger = get_logger(__name__)
# Timeout, in seconds, passed to asyncio.wait_for around a single
# orchestrator lifecycle re-run in process_lifecycle_queue_task.
_RECONCILE_TIMEOUT_SECONDS = 60.0
def _has_checked_in_since_wake(agent: Agent) -> bool:
    """Return whether the agent has been seen at or after its latest wake.

    Args:
        agent: Object exposing ``last_seen_at`` and ``last_wake_sent_at``
            (each either ``None`` or a mutually comparable timestamp).

    Returns:
        ``False`` when the agent has never been seen; ``True`` when it has
        been seen and no wake timestamp is recorded; otherwise whether
        ``last_seen_at`` is not earlier than ``last_wake_sent_at``.
    """
    # Never seen at all: it cannot have checked in.
    if agent.last_seen_at is None:
        return False
    # Seen, and no wake was ever recorded: nothing is outstanding.
    if agent.last_wake_sent_at is None:
        return True
    # A check-in at exactly the wake timestamp counts as checked in.
    return agent.last_seen_at >= agent.last_wake_sent_at
async def process_lifecycle_queue_task(task: QueuedTask) -> None:
    """Re-run lifecycle provisioning when an agent misses post-provision check-in.

    The task is decoded into a payload, the agent is loaded in a fresh
    session, and the handler then takes the first matching outcome:

    * skip (log only) when the agent no longer exists, the queued generation
      differs from the agent's current ``lifecycle_generation``, the agent has
      been seen since its last wake, or the agent's status is ``"deleting"``;
    * defer the task until the check-in deadline when that deadline is still
      in the future;
    * mark the agent ``"offline"`` and commit when ``wake_attempts`` has
      reached ``MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN``;
    * skip (log a warning) when the agent's gateway, or its board if it has
      one, cannot be loaded;
    * otherwise re-run the lifecycle through ``AgentLifecycleOrchestrator``
      with a wake-up, bounded by ``_RECONCILE_TIMEOUT_SECONDS``.

    Args:
        task: Queued task to decode with ``decode_lifecycle_task``.

    Raises:
        RuntimeError: If the task could not be deferred to its deadline.
        asyncio.TimeoutError: If the lifecycle re-run exceeds the timeout
            (raised by ``asyncio.wait_for``).
    """
    payload = decode_lifecycle_task(task)
    now = utcnow()

    async with async_session_maker() as session:
        agent = await Agent.objects.by_id(payload.agent_id).first(session)
        if agent is None:
            logger.info(
                "lifecycle.reconcile.skip_missing_agent",
                extra={"agent_id": str(payload.agent_id)},
            )
            return

        # Ignore stale queue messages after a newer lifecycle generation.
        if agent.lifecycle_generation != payload.generation:
            logger.info(
                "lifecycle.reconcile.skip_stale_generation",
                extra={
                    "agent_id": str(agent.id),
                    "queued_generation": payload.generation,
                    "current_generation": agent.lifecycle_generation,
                },
            )
            return

        # The agent has been seen since the last wake: nothing to reconcile.
        if _has_checked_in_since_wake(agent):
            logger.info(
                "lifecycle.reconcile.skip_not_stuck",
                extra={"agent_id": str(agent.id), "status": agent.status},
            )
            return

        # Prefer the deadline stored on the agent; fall back to the one that
        # was captured in the queued payload.
        # NOTE(review): if both are None the comparison below raises
        # TypeError — confirm the payload always carries a deadline.
        deadline = agent.checkin_deadline_at or payload.checkin_deadline_at

        if agent.status == "deleting":
            logger.info(
                "lifecycle.reconcile.skip_deleting",
                extra={"agent_id": str(agent.id)},
            )
            return

        # Deadline not reached yet: push the task back until it is due.
        if now < deadline:
            delay = max(0.0, (deadline - now).total_seconds())
            if not defer_lifecycle_reconcile(task, delay_seconds=delay):
                msg = "Failed to defer lifecycle reconcile task"
                raise RuntimeError(msg)
            logger.info(
                "lifecycle.reconcile.deferred",
                extra={"agent_id": str(agent.id), "delay_seconds": delay},
            )
            return

        # Out of wake attempts: record the failure and stop retrying.
        if agent.wake_attempts >= MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN:
            agent.status = "offline"
            agent.checkin_deadline_at = None
            agent.last_provision_error = (
                "Agent did not check in after wake; max wake attempts reached"
            )
            agent.updated_at = utcnow()
            session.add(agent)
            await session.commit()
            logger.warning(
                "lifecycle.reconcile.max_attempts_reached",
                extra={
                    "agent_id": str(agent.id),
                    "wake_attempts": agent.wake_attempts,
                    "max_attempts": MAX_WAKE_ATTEMPTS_WITHOUT_CHECKIN,
                },
            )
            return

        gateway = await Gateway.objects.by_id(agent.gateway_id).first(session)
        if gateway is None:
            logger.warning(
                "lifecycle.reconcile.skip_missing_gateway",
                extra={"agent_id": str(agent.id), "gateway_id": str(agent.gateway_id)},
            )
            return

        # A board is optional; only a *referenced but missing* board aborts.
        board: Board | None = None
        if agent.board_id is not None:
            board = await Board.objects.by_id(agent.board_id).first(session)
            if board is None:
                logger.warning(
                    "lifecycle.reconcile.skip_missing_board",
                    extra={"agent_id": str(agent.id), "board_id": str(agent.board_id)},
                )
                return

        orchestrator = AgentLifecycleOrchestrator(session)
        # Bound the re-run so one stuck call cannot hold the worker forever;
        # wait_for cancels the call and raises on timeout.
        await asyncio.wait_for(
            orchestrator.run_lifecycle(
                gateway=gateway,
                agent_id=agent.id,
                board=board,
                user=None,
                action="update",
                auth_token=None,
                force_bootstrap=False,
                reset_session=True,
                wake=True,
                deliver_wakeup=True,
                wakeup_verb="updated",
                clear_confirm_token=True,
                raise_gateway_errors=True,
            ),
            timeout=_RECONCILE_TIMEOUT_SECONDS,
        )
        logger.info(
            "lifecycle.reconcile.retriggered",
            extra={"agent_id": str(agent.id), "generation": payload.generation},
        )