#!/usr/bin/env python3
"""
Batch orchestrator for SWE-bench evaluation of Claude Code + claude-mem.

Iterates a list of SWE-bench Verified instances, launches a per-instance Docker
container (`claude-mem/swebench-agent:latest`) that runs the two-turn
ingest/fix protocol, and collects all resulting diffs into a single
`predictions.jsonl` compatible with the upstream SWE-bench harness.

Usage:
    python evals/swebench/run-batch.py \
        --run-id claude-mem-baseline-001 \
        --limit 3 \
        --max-concurrent 2

Rate-limit note: Anthropic API rate limits can bite quickly. The default
`--max-concurrent` is 4, but it is safer to START WITH 2 and raise the cap
only after observing no 429s in the logs.
"""

from __future__ import annotations

import argparse
import atexit
import json
import os
import platform
import shutil
import stat
import subprocess
import sys
import tempfile
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Iterable

from datasets import load_dataset

# Dataset row keys that main() pops from every selected instance before the
# row is handed to run_one_instance().
# NOTE(review): presumably these carry the reference solution and evaluation
# metadata that the agent must not see — confirm against the dataset schema.
HIDDEN_AGENT_FIELDS = (
    "patch",
    "test_patch",
    "FAIL_TO_PASS",
    "PASS_TO_PASS",
    "environment_setup_commit",
    "version",
)

def extract_oauth_credentials() -> Path | None:
    """
    Extract Claude Code OAuth credentials (from a Max/Pro subscription) to a
    temp file the container can bind-mount. Returns the temp file path, or
    None if extraction failed / no creds present.

    macOS: creds live in the Keychain under service "Claude Code-credentials".
    Linux: creds live at ~/.claude/.credentials.json.

    CAVEAT: Anthropic Max/Pro subscriptions have usage limits (per ~5h window)
    and their ToS is framed around individual developer use. Running batch
    evaluation across parallel containers may exhaust the quota quickly or
    raise compliance concerns. This helper exists because the user explicitly
    requested it; the caller is responsible for the policy call.

    The token may age out mid-run; we mount read-only so refresh writes fail
    silently inside the container (the underlying token in the host
    Keychain/file is untouched).
    """
    temp = tempfile.NamedTemporaryFile(
        prefix="claude-mem-creds-",
        suffix=".json",
        delete=False,
    )
    temp_path = Path(temp.name)
    temp.close()
    atexit.register(lambda: temp_path.unlink(missing_ok=True))

    if platform.system() == "Darwin":
        try:
            completed = subprocess.run(
                [
                    "security",
                    "find-generic-password",
                    "-s",
                    "Claude Code-credentials",
                    "-w",
                ],
                capture_output=True,
                text=True,
                check=False,
            )
            if completed.returncode == 0 and completed.stdout.strip():
                temp_path.write_text(completed.stdout.strip(), encoding="utf-8")
                temp_path.chmod(stat.S_IRUSR | stat.S_IWUSR)
                return temp_path
        except FileNotFoundError:
            print(
                "WARN: `security` command not available; trying on-disk creds.",
                file=sys.stderr,
            )

    creds_file = Path.home() / ".claude" / ".credentials.json"
    if creds_file.exists():
        temp_path.write_text(creds_file.read_text(encoding="utf-8"), encoding="utf-8")
        temp_path.chmod(stat.S_IRUSR | stat.S_IWUSR)
        return temp_path

    if platform.system() == "Darwin":
        print(
            "WARN: Claude Code-credentials not found in macOS Keychain and "
            "~/.claude/.credentials.json missing. Run `claude login` on the "
            "host first, or fall back to ANTHROPIC_API_KEY.",
            file=sys.stderr,
        )
    return None

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run the claude-mem SWE-bench agent on a batch of instances.",
    )
    parser.add_argument(
        "--instance-ids",
        nargs="+",
        default=None,
        help="Optional explicit list of instance_ids to run.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="If set, process only the first N instances after filtering.",
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=4,
        help="Max concurrent agent containers (default 4; start with 2 and raise after observing no 429s).",
    )
    parser.add_argument(
        "--run-id",
        type=str,
        required=True,
        help="Run identifier; used for output paths.",
    )
    parser.add_argument(
        "--out",
        type=str,
        default=None,
        help="Path to predictions.jsonl (default: evals/swebench/runs/<run_id>/predictions.jsonl).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=1800,
        help="Per-instance timeout in seconds (default 1800, matches upstream harness).",
    )
    parser.add_argument(
        "--image",
        type=str,
        default="claude-mem/swebench-agent:latest",
        help="Agent Docker image tag.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="princeton-nlp/SWE-bench_Verified",
        help="HuggingFace dataset name (e.g. princeton-nlp/SWE-bench_Lite, default Verified).",
    )
    parser.add_argument(
        "--auth",
        choices=["oauth", "api-key", "auto"],
        default="auto",
        help=(
            "Auth mode. 'oauth' extracts Claude Max/Pro creds from host "
            "Keychain (macOS) or ~/.claude/.credentials.json (Linux). "
            "'api-key' uses ANTHROPIC_API_KEY env. 'auto' prefers oauth, "
            "falls back to api-key."
        ),
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help=(
            "Truncate existing predictions.jsonl for this --run-id. "
            "Without this flag, the run aborts if predictions already exist "
            "(protects partial results from accidental re-runs)."
        ),
    )
    return parser.parse_args()

def select_instances(
    dataset: Iterable[dict[str, Any]],
    instance_ids: list[str] | None,
    limit: int | None,
) -> list[dict[str, Any]]:
    """Filter dataset rows by instance_ids (if given) and apply limit."""
    rows: list[dict[str, Any]] = list(dataset)
    if instance_ids:
        wanted = set(instance_ids)
        rows = [r for r in rows if r["instance_id"] in wanted]
        missing = wanted - {r["instance_id"] for r in rows}
        if missing:
            print(
                f"WARN: {len(missing)} requested instance_ids not found in dataset: "
                f"{sorted(missing)[:5]}{'...' if len(missing) > 5 else ''}",
                file=sys.stderr,
            )
    if limit is not None:
        rows = rows[:limit]
    return rows

def append_prediction_row(
    predictions_path: Path,
    instance_id: str,
    model_patch: str,
    model_name_or_path: str,
    lock: threading.Lock,
) -> None:
    """
    Serialise one prediction as a JSON line and append it to predictions_path.

    The file is opened, written and closed while holding ``lock`` so rows
    written from different worker threads cannot interleave.
    """
    serialized = json.dumps(
        {
            "instance_id": instance_id,
            "model_patch": model_patch,
            "model_name_or_path": model_name_or_path,
        },
        ensure_ascii=False,
    )
    with lock, predictions_path.open("a", encoding="utf-8") as sink:
        sink.write(serialized + "\n")

def copy_log_if_exists(src: Path, dst: Path) -> None:
    """
    Copy ``src`` to ``dst`` (creating dst's parent directories) when ``src``
    is an existing regular file; otherwise do nothing.
    """
    if not src.exists() or not src.is_file():
        return
    destination_dir = dst.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)

def _captured_text(captured: Any) -> str:
    """
    Return captured subprocess output as text.

    subprocess.TimeoutExpired exposes whatever was captured as raw bytes even
    when the child was started with text=True, and as None when nothing was
    captured, so both cases are normalised here before being logged.
    """
    if captured is None:
        return ""
    if isinstance(captured, bytes):
        return captured.decode("utf-8", errors="replace")
    return str(captured)


def run_one_instance(
    instance: dict[str, Any],
    image: str,
    predictions_path: Path,
    predictions_dir: Path,
    run_dir: Path,
    timeout: int,
    predictions_lock: threading.Lock,
    model_name_or_path: str,
    oauth_creds_path: Path | None,
) -> tuple[str, str]:
    """
    Run the agent container for a single instance.

    Args:
        instance: Dataset row; "instance_id", "repo", "base_commit" and
            "problem_statement" are read from it.
        image: Docker image to run.
        predictions_path: JSONL file the prediction row is appended to.
        predictions_dir: Not used by this function; kept so existing callers
            keep working.
        run_dir: Directory under which a per-instance log directory
            (run_dir / instance_id) is created.
        timeout: Seconds to wait for `docker run` before giving up.
        predictions_lock: Lock guarding appends to predictions_path.
        model_name_or_path: Value written into the prediction row.
        oauth_creds_path: If not None, this file is bind-mounted read-only at
            /auth/.credentials.json; otherwise ANTHROPIC_API_KEY is forwarded
            from the orchestrator's environment.

    Returns:
        A (status, instance_id) tuple where status is one of:
        "succeeded", "failed", "timed_out".

    On ANY non-success (timeout, non-zero exit, missing or empty diff,
    orchestrator-side exception), a prediction row with model_patch="" is
    still appended — the plan requires we never silently drop an instance.
    The row is appended exactly once, as the last step, so an exception that
    escapes this function means no row was written by it.
    """
    instance_id: str = instance["instance_id"]
    repo: str = instance["repo"]
    base_commit: str = instance["base_commit"]
    problem_statement: str = instance["problem_statement"]

    instance_log_dir = run_dir / instance_id
    instance_log_dir.mkdir(parents=True, exist_ok=True)
    stderr_log_path = instance_log_dir / "stderr.log"

    scratch_dir = Path(tempfile.mkdtemp(prefix=f"swebench-{instance_id}-"))
    container_name = f"swebench-agent-{instance_id}-{os.getpid()}-{threading.get_ident()}"

    status: str = "failed"
    model_patch: str = ""

    try:
        # Written inside the try so the scratch dir is removed even if this fails.
        problem_file = scratch_dir / "problem.txt"
        problem_file.write_text(problem_statement, encoding="utf-8")

        cmd: list[str] = [
            "docker",
            "run",
            "--rm",
            "--name",
            container_name,
            "-e",
            "CLAUDE_MEM_OUTPUT_DIR=/scratch",
            "-v",
            f"{scratch_dir}:/scratch",
        ]
        if oauth_creds_path is not None:
            cmd += [
                "-e",
                "CLAUDE_MEM_CREDENTIALS_FILE=/auth/.credentials.json",
                "-v",
                f"{oauth_creds_path}:/auth/.credentials.json:ro",
            ]
        else:
            # Name-only form: docker copies the value from our environment.
            cmd += ["-e", "ANTHROPIC_API_KEY"]
        cmd += [
            image,
            instance_id,
            repo,
            base_commit,
            "/scratch/problem.txt",
            "/scratch/ignored-predictions.jsonl",
        ]

        try:
            completed = subprocess.run(
                cmd,
                timeout=timeout,
                capture_output=True,
                text=True,
                check=False,
            )
        except subprocess.TimeoutExpired as exc:
            status = "timed_out"
            # The timeout only terminates the `docker run` client process that
            # subprocess launched; remove the named container explicitly in
            # case it is still running. A failed/hung cleanup must not
            # reclassify the timeout as a generic failure.
            try:
                subprocess.run(
                    ["docker", "rm", "-f", container_name],
                    capture_output=True,
                    check=False,
                    timeout=30,
                )
                cleanup_note = f"forced docker rm -f {container_name}"
            except (OSError, subprocess.TimeoutExpired) as cleanup_exc:
                cleanup_note = f"docker rm -f {container_name} failed: {cleanup_exc!r}"
            stderr_log_path.write_text(
                f"TIMEOUT after {timeout}s ({cleanup_note})\n"
                f"=== STDOUT (partial) ===\n{_captured_text(exc.stdout)}\n"
                f"=== STDERR (partial) ===\n{_captured_text(exc.stderr)}\n",
                encoding="utf-8",
            )
        else:
            stderr_log_path.write_text(
                f"=== STDOUT ===\n{completed.stdout}\n=== STDERR ===\n{completed.stderr}\n",
                encoding="utf-8",
            )
            # Success requires exit code 0 AND a non-blank diff file.
            if completed.returncode == 0:
                diff_file = scratch_dir / "model_patch.diff"
                if diff_file.exists():
                    diff_text = diff_file.read_text(encoding="utf-8")
                    if diff_text.strip():
                        model_patch = diff_text
                        status = "succeeded"

        # The agent logs are auxiliary: failing to copy them must not discard
        # a patch that was already produced.
        for log_name in ("ingest.jsonl", "fix.jsonl"):
            try:
                copy_log_if_exists(scratch_dir / log_name, instance_log_dir / log_name)
            except OSError as copy_exc:
                print(
                    f"[{instance_id}] WARN: could not copy {log_name}: {copy_exc!r}",
                    file=sys.stderr,
                )

    except Exception as exc:
        status = "failed"
        model_patch = ""
        try:
            stderr_log_path.write_text(
                f"ORCHESTRATOR EXCEPTION: {exc!r}\n",
                encoding="utf-8",
            )
        except OSError:
            # Best effort only: the log is diagnostic, the row below is what matters.
            pass
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)

    append_prediction_row(
        predictions_path=predictions_path,
        instance_id=instance_id,
        model_patch=model_patch,
        model_name_or_path=model_name_or_path,
        lock=predictions_lock,
    )
    return status, instance_id

def main() -> int:
    """
    Entry point: resolve output paths, pick an auth mode, load the dataset,
    run every selected instance through a thread pool and print a summary.

    Returns 1 when existing predictions would be clobbered without
    --overwrite or when no usable auth is available; returns 0 otherwise
    (including when no instances were selected).
    """
    args = parse_args()

    repo_root = Path(__file__).resolve().parents[2]
    if args.out:
        predictions_path = Path(args.out).resolve()
    else:
        default_run_dir = repo_root / "evals" / "swebench" / "runs" / args.run_id
        predictions_path = default_run_dir / "predictions.jsonl"

    # Per-instance logs live next to predictions.jsonl.
    predictions_dir = predictions_path.parent
    run_dir = predictions_dir
    predictions_dir.mkdir(parents=True, exist_ok=True)

    has_prior_results = predictions_path.exists() and predictions_path.stat().st_size > 0
    if has_prior_results and not args.overwrite:
        print(
            f"ERROR: {predictions_path} already exists and is non-empty. "
            "Pass --overwrite to truncate, or pick a different --run-id.",
            file=sys.stderr,
        )
        return 1
    if has_prior_results:
        print(
            f"WARN: --overwrite set; truncating existing {predictions_path}",
            file=sys.stderr,
        )
    # Start from an empty file; rows are appended as instances finish.
    predictions_path.write_text("", encoding="utf-8")

    oauth_creds_path: Path | None = None
    if args.auth in ("oauth", "auto"):
        oauth_creds_path = extract_oauth_credentials()
        if oauth_creds_path is None:
            if args.auth == "oauth":
                print(
                    "ERROR: --auth=oauth requested but credentials extraction failed.",
                    file=sys.stderr,
                )
                return 1
        else:
            print(
                f"Auth: OAuth credentials extracted to {oauth_creds_path} "
                "(mounted read-only into each container). "
                "NOTE: Max/Pro has per-window usage limits; batch runs may exhaust them.",
                file=sys.stderr,
            )

    if oauth_creds_path is None:
        # API-key mode, either requested explicitly or as the "auto" fallback.
        if not os.environ.get("ANTHROPIC_API_KEY"):
            print(
                "ERROR: no auth available. Either run `claude login` on host "
                "(for OAuth) or set ANTHROPIC_API_KEY.",
                file=sys.stderr,
            )
            return 1
        print("Auth: ANTHROPIC_API_KEY (pay-per-call).", file=sys.stderr)

    print(f"Loading dataset {args.dataset} (split=test)...", file=sys.stderr)
    dataset = load_dataset(args.dataset, split="test")

    instances = select_instances(dataset, args.instance_ids, args.limit)
    total = len(instances)
    if not total:
        print("No instances selected; nothing to do.", file=sys.stderr)
        return 0

    # Drop the HIDDEN_AGENT_FIELDS keys from every row before any worker sees it.
    for hidden_key in HIDDEN_AGENT_FIELDS:
        for row in instances:
            row.pop(hidden_key, None)

    model_name_or_path = "claude-opus-4-7+claude-mem"

    print(
        f"Launching {total} instance(s) with max_concurrent={args.max_concurrent}, "
        f"timeout={args.timeout}s, image={args.image}",
        file=sys.stderr,
    )

    predictions_lock = threading.Lock()
    tally = {"succeeded": 0, "failed": 0, "timed_out": 0}

    with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor:
        future_to_id = {}
        for instance in instances:
            pending = executor.submit(
                run_one_instance,
                instance=instance,
                image=args.image,
                predictions_path=predictions_path,
                predictions_dir=predictions_dir,
                run_dir=run_dir,
                timeout=args.timeout,
                predictions_lock=predictions_lock,
                model_name_or_path=model_name_or_path,
                oauth_creds_path=oauth_creds_path,
            )
            future_to_id[pending] = instance["instance_id"]

        for future in as_completed(future_to_id):
            instance_id = future_to_id[future]
            try:
                status, _ = future.result()
            except Exception as exc:
                status = "failed"
                print(
                    f"[{instance_id}] orchestrator future raised: {exc!r}",
                    file=sys.stderr,
                )
                # The worker raised instead of returning a status, so write an
                # empty-patch row here: every selected instance must appear in
                # predictions.jsonl for downstream evaluation.
                append_prediction_row(
                    predictions_path=predictions_path,
                    instance_id=instance_id,
                    model_patch="",
                    model_name_or_path=model_name_or_path,
                    lock=predictions_lock,
                )

            # Anything that is not an explicit success/timeout counts as failed.
            bucket = status if status in ("succeeded", "timed_out") else "failed"
            tally[bucket] += 1

            print(
                f"[{instance_id}] {status} "
                f"({sum(tally.values())}/{total} done)",
                file=sys.stderr,
            )

    succeeded = tally["succeeded"]
    failed = tally["failed"]
    timed_out = tally["timed_out"]
    print(
        f"{total} total, {succeeded} succeeded, {failed} failed, {timed_out} timed out",
    )
    return 0

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())