2026-02-12 08:17:59 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""Lightweight markdown link checker for repo docs.
|
|
|
|
|
|
|
|
|
|
Checks *relative* links inside markdown files and fails CI if any targets are missing.
|
|
|
|
|
|
|
|
|
|
Design goals:
|
|
|
|
|
- No external deps.
|
|
|
|
|
- Ignore http(s)/mailto links.
|
|
|
|
|
- Ignore pure anchors (#foo).
|
|
|
|
|
- Support links with anchors (./path.md#section) by checking only the path part.
|
|
|
|
|
|
|
|
|
|
Limitations:
|
|
|
|
|
- Does not validate that anchors exist inside target files.
|
|
|
|
|
- Does not validate links generated dynamically or via HTML.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iter_md_files(root: Path) -> list[Path]:
|
2026-02-12 09:05:48 +00:00
|
|
|
"""Return markdown files to check.
|
|
|
|
|
|
2026-02-12 13:19:37 +00:00
|
|
|
Policy:
|
|
|
|
|
- Always check root contributor-facing markdown (`README.md`, `CONTRIBUTING.md`).
|
|
|
|
|
- If `docs/` exists, also check `docs/**/*.md`.
|
2026-02-12 09:05:48 +00:00
|
|
|
|
|
|
|
|
Rationale:
|
2026-02-12 13:19:37 +00:00
|
|
|
- We want fast feedback on broken *relative* links in the most important entrypoints.
|
|
|
|
|
- We intentionally do **not** crawl external URLs.
|
2026-02-12 09:05:48 +00:00
|
|
|
"""
|
|
|
|
|
|
2026-02-12 13:19:37 +00:00
|
|
|
files: list[Path] = []
|
|
|
|
|
|
|
|
|
|
for p in (root / "README.md", root / "CONTRIBUTING.md"):
|
|
|
|
|
if p.exists():
|
|
|
|
|
files.append(p)
|
|
|
|
|
|
2026-02-12 08:17:59 +00:00
|
|
|
docs = root / "docs"
|
2026-02-12 13:19:37 +00:00
|
|
|
if docs.exists():
|
|
|
|
|
files.extend(sorted(docs.rglob("*.md")))
|
|
|
|
|
|
|
|
|
|
# de-dupe + stable order
|
|
|
|
|
return sorted({p.resolve() for p in files})
|
2026-02-12 08:17:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_target(raw: str) -> str | None:
|
|
|
|
|
raw = raw.strip()
|
|
|
|
|
if not raw:
|
|
|
|
|
return None
|
|
|
|
|
if raw.startswith("http://") or raw.startswith("https://") or raw.startswith("mailto:"):
|
|
|
|
|
return None
|
|
|
|
|
if raw.startswith("#"):
|
|
|
|
|
return None
|
|
|
|
|
# strip query/fragment
|
|
|
|
|
raw = raw.split("#", 1)[0].split("?", 1)[0]
|
|
|
|
|
if not raw:
|
|
|
|
|
return None
|
|
|
|
|
return raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
|
|
|
|
|
root = Path(__file__).resolve().parents[1]
|
|
|
|
|
md_files = iter_md_files(root)
|
|
|
|
|
|
|
|
|
|
missing: list[tuple[Path, str]] = []
|
|
|
|
|
|
|
|
|
|
for md in md_files:
|
|
|
|
|
text = md.read_text(encoding="utf-8")
|
|
|
|
|
for m in LINK_RE.finditer(text):
|
|
|
|
|
target_raw = m.group(1)
|
|
|
|
|
target = normalize_target(target_raw)
|
|
|
|
|
if target is None:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Skip common markdown reference-style quirks.
|
|
|
|
|
if target.startswith("<") and target.endswith(">"):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Resolve relative to current file.
|
|
|
|
|
resolved = (md.parent / target).resolve()
|
|
|
|
|
if not resolved.exists():
|
|
|
|
|
missing.append((md, target_raw))
|
|
|
|
|
|
|
|
|
|
if missing:
|
|
|
|
|
print("Broken relative links detected:\n")
|
|
|
|
|
for md, target in missing:
|
|
|
|
|
print(f"- {md.relative_to(root)} -> {target}")
|
|
|
|
|
print(f"\nTotal: {len(missing)}")
|
|
|
|
|
return 1
|
|
|
|
|
|
|
|
|
|
print(f"OK: checked {len(md_files)} markdown files")
|
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
raise SystemExit(main())
|