Files
claw/scripts/validate_skill_lib.py
2026-03-30 08:29:44 +08:00

533 lines
16 KiB
Python

import argparse
import re
import sys
import tomllib
from pathlib import Path
from typing import NamedTuple
# Largest markdown/TOML file we will scan statically; bigger files are flagged.
MAX_TEXT_FILE_BYTES = 512 * 1024

# Suffixes treated as shell/batch scripts (blocked unless --allow-scripts).
SCRIPT_SUFFIXES = (
    ".sh",
    ".bash",
    ".zsh",
    ".ksh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
)

# (compiled regex, finding label) pairs for obviously dangerous command snippets.
# All patterns are case-insensitive and multi-line.
HIGH_RISK_PATTERNS = (
    (re.compile(r"(?im)\bcurl\b[^\n|]{0,200}\|\s*(?:sh|bash|zsh)\b"), "curl-pipe-shell"),
    (re.compile(r"(?im)\bwget\b[^\n|]{0,200}\|\s*(?:sh|bash|zsh)\b"), "wget-pipe-shell"),
    (re.compile(r"(?im)\b(?:invoke-expression|iex)\b"), "powershell-iex"),
    (re.compile(r"(?im)\brm\s+-rf\s+/"), "destructive-rm-rf-root"),
    (re.compile(r"(?im)\bnc(?:at)?\b[^\n]{0,120}\s-e\b"), "netcat-remote-exec"),
    (re.compile(r"(?im)\bdd\s+if="), "disk-overwrite-dd"),
    (re.compile(r"(?im)\bmkfs(?:\.[a-z0-9]+)?\b"), "filesystem-format"),
    (re.compile(r"(?im):\(\)\s*\{\s*:\|\:&\s*\};:"), "fork-bomb"),
)

# Inline markdown links: captures the `target` of `[text](target)`.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)]+)\)")

# Layout: this script lives under <repo>/scripts/, so parents[1] is the repo
# root; skill_lib is expected as a SIBLING of the repo root (note .parent).
REPO_ROOT = Path(__file__).resolve().parents[1]
SKILL_LIB_ROOT = REPO_ROOT.parent / "skill_lib"
SKILLS_DIR = SKILL_LIB_ROOT / "skills"
class SkillRecord(NamedTuple):
    """Parsed metadata plus prompt text for one skill on disk."""

    name: str  # skill name; load_skill falls back to the directory name
    description: str  # summary text; may be derived from the prompt body
    version: str  # version string; defaults to "0.1.0" when unset
    author: str | None  # optional author; None when unspecified
    tags: list[str]  # free-form tag labels
    prompt_body: str  # markdown body, or joined manifest prompts
    location: Path  # file the record was loaded from (SKILL.toml or SKILL.md)
class AuditReport(NamedTuple):
    """Outcome of a static audit over one skill directory."""

    files_scanned: int  # number of paths visited (directories are counted too)
    findings: list[str]  # human-readable policy violations; empty means clean
class ValidationResult(NamedTuple):
    """Pairs a loaded skill record with its audit outcome."""

    record: SkillRecord  # parsed metadata for the skill
    report: AuditReport  # audit result for the skill directory
    ok: bool  # True when the audit produced no findings
def discover_skill_dirs(skills_dir: Path | None = None) -> list[Path]:
root = skills_dir or SKILLS_DIR
if not root.exists():
return []
return sorted(path for path in root.iterdir() if path.is_dir())
def load_skill(skill_dir: Path) -> SkillRecord:
    """Load a skill's metadata from SKILL.toml (preferred) or SKILL.md.

    A TOML manifest takes its prompt body from SKILL.md when present,
    otherwise from the manifest's `prompts` list. Markdown-only skills are
    parsed from frontmatter. Missing fields fall back to defaults: the
    directory name, version "0.1.0", and the first body line as description.

    Robustness fix: a directory containing neither SKILL.toml nor SKILL.md
    previously crashed with FileNotFoundError on the markdown read; it now
    yields a default record so the audit stage can report the missing
    manifest as a finding instead of the whole run aborting.
    """
    manifest_path = skill_dir / "SKILL.toml"
    markdown_path = skill_dir / "SKILL.md"
    if manifest_path.is_file():
        manifest = tomllib.loads(manifest_path.read_text(encoding="utf-8"))
        skill_meta = manifest.get("skill", {})
        prompts = manifest.get("prompts", [])
        body = ""
        if markdown_path.is_file():
            # The markdown body wins over manifest prompts when both exist.
            _, body = parse_skill_markdown(markdown_path.read_text(encoding="utf-8"))
        elif prompts:
            body = "\n\n".join(str(prompt) for prompt in prompts)
        description = skill_meta.get("description")
        if not description or not str(description).strip():
            description = extract_description(body)
        return SkillRecord(
            name=skill_meta.get("name") or skill_dir.name,
            description=str(description),
            version=str(skill_meta.get("version") or "0.1.0"),
            author=skill_meta.get("author") or None,
            tags=list(skill_meta.get("tags", [])),
            prompt_body=body,
            location=manifest_path,
        )
    # Markdown-only (or empty) skill directory.
    content = markdown_path.read_text(encoding="utf-8") if markdown_path.is_file() else ""
    meta, body = parse_skill_markdown(content)
    description = meta["description"]
    if not description or not description.strip():
        description = extract_description(body)
    return SkillRecord(
        name=meta["name"] or skill_dir.name,
        description=description,
        version=meta["version"] or "0.1.0",
        author=meta["author"] or None,
        tags=list(meta["tags"]),
        prompt_body=body,
        location=markdown_path,
    )
def validate_all_skills(allow_scripts: bool = False) -> list[ValidationResult]:
    """Load and audit every discovered skill.

    A skill is `ok` exactly when its audit produced no findings.
    """
    outcomes: list[ValidationResult] = []
    for directory in discover_skill_dirs():
        loaded = load_skill(directory)
        audit = audit_skill_directory(directory, allow_scripts=allow_scripts)
        passed = len(audit.findings) == 0
        outcomes.append(ValidationResult(record=loaded, report=audit, ok=passed))
    return outcomes
def parse_skill_markdown(content: str) -> tuple[dict[str, object], str]:
    """Split a SKILL.md document into (metadata dict, body text)."""
    split = split_skill_frontmatter(content)
    if split is None:
        # No frontmatter fence: the whole document is the body.
        return empty_meta(), content
    raw_meta, body = split
    return parse_simple_frontmatter(raw_meta), body
def split_skill_frontmatter(content: str) -> tuple[str, str] | None:
normalized = content.replace("\r\n", "\n")
if not normalized.startswith("---\n"):
return None
rest = normalized[len("---\n") :]
marker = "\n---\n"
idx = rest.find(marker)
if idx != -1:
return rest[:idx], rest[idx + len(marker) :]
if rest.endswith("\n---"):
return rest[:-4], ""
return None
def parse_simple_frontmatter(frontmatter: str) -> dict[str, object]:
    """Parse a minimal YAML-like frontmatter block into a metadata dict.

    Supports scalar keys (name/description/version/author) with optional
    single/double quoting, plus tags either inline (`tags: [a, b]`) or as a
    dash list on the lines following a bare `tags:`. Unknown keys and
    non-key lines are ignored.
    """
    meta = empty_meta()
    collecting_tags = False  # True while consuming `- item` lines after a bare `tags:`
    for raw_line in frontmatter.splitlines():
        if collecting_tags:
            trimmed = raw_line.strip()
            if trimmed.startswith("- "):
                tag = trimmed[2:].strip().strip('"').strip("'")
                if tag:
                    meta["tags"].append(tag)
                continue
            # A non-dash line ends the tag list; fall through to parse it normally.
            collecting_tags = False
        if ":" not in raw_line:
            continue
        key, value = raw_line.split(":", 1)
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if key == "name":
            meta["name"] = value
        elif key == "description":
            meta["description"] = value
        elif key == "version":
            meta["version"] = value
        elif key == "author":
            meta["author"] = value
        elif key == "tags":
            if not value:
                # Bare `tags:` — items follow as `- item` lines.
                collecting_tags = True
            else:
                # Inline list form: strip brackets, split on commas, drop empties.
                cleaned = value.lstrip("[").rstrip("]")
                meta["tags"] = [
                    item.strip().strip('"').strip("'")
                    for item in cleaned.split(",")
                    if item.strip().strip('"').strip("'")
                ]
    return meta
def empty_meta() -> dict[str, object]:
    """Fresh metadata mapping with every scalar unset and an empty tag list."""
    meta: dict[str, object] = dict.fromkeys(
        ("name", "description", "version", "author")
    )
    meta["tags"] = []
    return meta
def extract_description(body: str) -> str:
    """First non-blank, non-heading line of *body*, or a placeholder."""
    candidates = (
        line.strip()
        for line in body.splitlines()
        if not line.startswith("#") and line.strip()
    )
    return next(candidates, "No description")
def audit_skill_directory(skill_dir: Path, allow_scripts: bool = False) -> AuditReport:
    """Walk a skill directory and collect security findings for every path.

    Raises FileNotFoundError when the source is missing and
    NotADirectoryError when it is not a directory.
    """
    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill source does not exist: {skill_dir}")
    if not skill_dir.is_dir():
        raise NotADirectoryError(f"Skill source must be a directory: {skill_dir}")
    root = skill_dir.resolve()
    issues: list[str] = []
    if not ((root / "SKILL.md").is_file() or (root / "SKILL.toml").is_file()):
        issues.append(
            "Skill root must include SKILL.md or SKILL.toml for deterministic auditing."
        )
    scanned = 0
    for entry in collect_paths_depth_first(root):
        scanned += 1
        issues.extend(audit_path(root, entry, allow_scripts=allow_scripts))
    return AuditReport(files_scanned=scanned, findings=issues)
def collect_paths_depth_first(root: Path) -> list[Path]:
    """List *root* and every descendant in depth-first pre-order.

    Siblings are visited in sorted order, so the traversal is deterministic.
    """
    ordered: list[Path] = []

    def visit(node: Path) -> None:
        # Pre-order: record the node before descending into it.
        ordered.append(node)
        if node.is_dir():
            for child in sorted(node.iterdir()):
                visit(child)

    visit(root)
    return ordered
def audit_path(root: Path, path: Path, allow_scripts: bool) -> list[str]:
    """Audit one filesystem entry and return zero or more finding messages.

    Symlinks are rejected outright; directories produce no findings; only
    markdown and TOML files are content-scanned, and only when small enough.
    """
    stat_info = path.lstat()
    label = relative_display(root, path)
    issues: list[str] = []
    if path.is_symlink():
        return [f"{label}: symlinks are not allowed in installed skills."]
    if path.is_dir():
        return issues
    if not allow_scripts and is_unsupported_script_file(path):
        issues.append(f"{label}: script-like files are blocked by skill security policy.")
    text_like = is_markdown_file(path) or is_toml_file(path)
    if text_like and stat_info.st_size > MAX_TEXT_FILE_BYTES:
        issues.append(f"{label}: file is too large for static audit (>{MAX_TEXT_FILE_BYTES} bytes).")
        return issues
    if is_markdown_file(path):
        issues.extend(audit_markdown_file(root, path))
    elif is_toml_file(path):
        issues.extend(audit_manifest_file(root, path))
    return issues
def audit_markdown_file(root: Path, path: Path) -> list[str]:
    """Scan a markdown file for risky command snippets and audit its links."""
    text = path.read_text(encoding="utf-8")
    label = relative_display(root, path)
    issues: list[str] = []
    risk = detect_high_risk_snippet(text)
    if risk is not None:
        issues.append(f"{label}: detected high-risk command pattern ({risk}).")
    for link in extract_markdown_links(text):
        issues += audit_markdown_link_target(root, path, link)
    return issues
def audit_manifest_file(root: Path, path: Path) -> list[str]:
    """Scan a TOML manifest for risky snippets and shell chaining operators."""
    text = path.read_text(encoding="utf-8")
    label = relative_display(root, path)
    issues: list[str] = []
    risk = detect_high_risk_snippet(text)
    if risk is not None:
        issues.append(f"{label}: detected high-risk command pattern ({risk}).")
    chaining = ("&&", "||", ";", "`", "$(")
    if any(operator in text for operator in chaining):
        issues.append(f"{label}: manifest content uses shell chaining operators, which are blocked.")
    return issues
def extract_markdown_links(content: str) -> list[str]:
    """Collect the stripped target of every inline `[text](target)` link."""
    targets: list[str] = []
    for match in MARKDOWN_LINK_RE.finditer(content):
        targets.append(match.group(1).strip())
    return targets
def audit_markdown_link_target(root: Path, source: Path, raw_target: str) -> list[str]:
    """Audit one markdown link target and return any policy findings.

    Policy, in evaluation order:
    - empty targets and pure fragments (``#...``) are ignored;
    - http/https/mailto URLs are allowed unless they point at a markdown
      file; any other URL scheme is blocked;
    - absolute paths are blocked; links to script files are blocked;
    - non-markdown relative links are not resolved further;
    - markdown links must resolve to an existing file inside this skill's
      root, or inside the shared skills root (cross-skill references).
    """
    findings: list[str] = []
    normalized = normalize_markdown_target(raw_target)
    if not normalized or normalized.startswith("#"):
        return findings
    rel = relative_display(root, source)
    scheme = url_scheme(normalized)
    if scheme:
        if scheme in {"http", "https", "mailto"}:
            if has_markdown_suffix(normalized):
                # Remote markdown could smuggle further instructions; block it.
                findings.append(
                    f"{rel}: remote markdown links are blocked by skill security audit ({normalized})."
                )
            return findings
        findings.append(f"{rel}: unsupported URL scheme in markdown link ({normalized}).")
        return findings
    stripped = strip_query_and_fragment(normalized)
    if not stripped:
        return findings
    if looks_like_absolute_path(stripped):
        findings.append(f"{rel}: absolute markdown link paths are not allowed ({normalized}).")
        return findings
    if has_script_suffix(stripped):
        findings.append(f"{rel}: markdown links to script files are blocked ({normalized}).")
    if not has_markdown_suffix(stripped):
        # Only markdown targets are resolved against the filesystem below.
        return findings
    base_dir = source.parent
    linked_path = base_dir / stripped
    try:
        canonical_target = linked_path.resolve(strict=True)
    except FileNotFoundError:
        if is_cross_skill_reference(stripped):
            # Reference into a sibling skill that may not be installed yet.
            return findings
        findings.append(f"{rel}: markdown link points to a missing file ({normalized}).")
        return findings
    if not is_subpath(canonical_target, root):
        skills_root = skills_root_for(root)
        if skills_root and is_subpath(canonical_target, skills_root):
            # Cross-skill link within the shared skills tree: allowed if a file.
            if not canonical_target.is_file():
                findings.append(f"{rel}: markdown link must point to a file ({normalized}).")
            return findings
        findings.append(f"{rel}: markdown link escapes skill root ({normalized}).")
        return findings
    if not canonical_target.is_file():
        findings.append(f"{rel}: markdown link must point to a file ({normalized}).")
    return findings
def detect_high_risk_snippet(content: str) -> str | None:
    """Label of the first high-risk command pattern matched, else None."""
    return next(
        (label for pattern, label in HIGH_RISK_PATTERNS if pattern.search(content)),
        None,
    )
def normalize_markdown_target(raw_target: str) -> str:
    """Normalize a raw link target: trim, drop angle brackets and title text.

    Returns an empty string when nothing remains after trimming.
    """
    candidate = raw_target.strip()
    candidate = candidate.removeprefix("<").removesuffix(">")
    pieces = candidate.split()
    if not pieces:
        return ""
    return pieces[0]
def strip_query_and_fragment(target: str) -> str:
    """Cut *target* at the first '#' or '?', whichever comes earlier."""
    cut = len(target)
    for separator in ("#", "?"):
        found = target.find(separator)
        if found != -1 and found < cut:
            cut = found
    return target[:cut]
def url_scheme(target: str) -> str | None:
if ":" not in target:
return None
scheme, rest = target.split(":", 1)
if not scheme or not rest:
return None
if not all(ch.isalnum() or ch in "+-." for ch in scheme):
return None
return scheme
def looks_like_absolute_path(target: str) -> bool:
    """True for POSIX-absolute, Windows-drive, or home-relative (~/) paths."""
    if Path(target).is_absolute():
        return True
    # Windows drive form like `C:\...` or `C:/...` even on POSIX hosts.
    drive_like = (
        len(target) >= 3
        and target[0].isalpha()
        and target[1] == ":"
        and target[2] in "\\/"
    )
    return drive_like or target.startswith("~/")
def is_cross_skill_reference(target: str) -> bool:
    """True when a link plausibly targets another skill.

    That is either a path traversing upward (contains `..`) or a bare
    markdown filename with no directory separators (a sibling-skill doc).
    """
    if ".." in Path(target).parts:
        return True
    bare = target.removeprefix("./")
    has_separator = "/" in bare or "\\" in bare
    is_markdown = bare.lower().endswith((".md", ".markdown"))
    return not has_separator and is_markdown
def skills_root_for(root: Path) -> Path | None:
current = root
while True:
if current.name == "skills":
return current
if current.parent == current:
return None
current = current.parent
def relative_display(root: Path, path: Path) -> str:
    """Render *path* relative to *root*; '.' for root itself.

    A path outside *root* is shown as-is rather than raising.
    """
    try:
        relative = path.relative_to(root)
    except ValueError:
        return str(path)
    text = str(relative)
    return "." if text == "." else text
def is_markdown_file(path: Path) -> bool:
    """True for .md/.markdown files (suffix compared case-insensitively)."""
    return path.suffix.lower() in (".md", ".markdown")
def is_toml_file(path: Path) -> bool:
    """True when the path's suffix is .toml, ignoring case."""
    return ".toml" == path.suffix.lower()
def is_unsupported_script_file(path: Path) -> bool:
    """A file is script-like if its name has a script suffix or a shell shebang."""
    if has_script_suffix(str(path).lower()):
        return True
    return has_shell_shebang(path)
def has_script_suffix(raw: str) -> bool:
    """True when *raw* (a filename or path string) ends with a script suffix.

    Idiom: str.endswith accepts a tuple of suffixes, so a single C-level
    call replaces the generator-expression scan.
    """
    return raw.lower().endswith(SCRIPT_SUFFIXES)
def has_shell_shebang(path: Path) -> bool:
    """True when the file's first line is a shebang for a known shell.

    Reads at most 128 bytes; unreadable files are treated as non-scripts.

    Bug fix: decoding with errors="ignore" can produce an empty string from
    a non-empty binary prefix (e.g. all invalid UTF-8 bytes), in which case
    splitlines() is empty and the old unconditional [0] raised IndexError.
    """
    try:
        prefix = path.read_bytes()[:128]
    except OSError:
        return False
    lines = prefix.decode("utf-8", errors="ignore").splitlines()
    first_line = lines[0].strip().lower() if lines else ""
    interpreter = shebang_interpreter(first_line)
    return interpreter in {"sh", "bash", "zsh", "ksh", "fish", "pwsh", "powershell"}
def shebang_interpreter(line: str) -> str | None:
if not line.startswith("#!"):
return None
shebang = line[2:].strip()
if not shebang:
return None
parts = shebang.split()
first = Path(parts[0]).name
if first == "env":
for part in parts[1:]:
if part.startswith("-"):
continue
return Path(part).name
return None
return first
def has_markdown_suffix(target: str) -> bool:
    """True when *target* names a markdown document (.md or .markdown)."""
    return target.lower().endswith((".md", ".markdown"))
def is_subpath(path: Path, root: Path) -> bool:
    """True when *path* equals or lies beneath *root* (purely lexical test)."""
    try:
        path.relative_to(root)
    except ValueError:
        return False
    return True
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: validate all skills and print a PASS/FAIL summary.

    Returns 0 when every skill passes, 1 otherwise (including when no
    skills are discovered at all).
    """
    parser = argparse.ArgumentParser(description="Validate the sibling skill_lib against ZeroClaw-like rules.")
    parser.add_argument(
        "--allow-scripts",
        action="store_true",
        help="Allow shell-script files during auditing.",
    )
    options = parser.parse_args(argv)
    outcomes = validate_all_skills(allow_scripts=options.allow_scripts)
    if not outcomes:
        print(f"FAIL no skills discovered under {SKILLS_DIR}")
        return 1
    failures = 0
    for outcome in outcomes:
        print(f"{'PASS' if outcome.ok else 'FAIL'} {outcome.record.name}")
        for finding in outcome.report.findings:
            print(f" - {finding}")
        if not outcome.ok:
            failures += 1
    print(f"Checked {len(outcomes)} skills in {SKILL_LIB_ROOT}")
    return 1 if failures else 0
if __name__ == "__main__":
    # Run as a script: the process exit status mirrors the validation result.
    sys.exit(main())