#!/usr/bin/env python3
"""Index all files on cdn.frogg.ie via directory crawling and brute-force probing."""

import argparse
import asyncio
import json
import re
import sys
from datetime import datetime, timezone
from urllib.parse import quote, urljoin

import httpx

# Default root of the CDN being indexed; overridable via the --url flag.
BASE_URL = "https://cdn.frogg.ie"

# Well-known files commonly served from a site root; each is probed verbatim
# as "/<name>" during brute-force probing.
COMMON_FILENAMES = [
    "index.html",
    "index.htm",
    "index.txt",
    "README.md",
    "README.txt",
    "README",
    "LICENSE",
    "LICENSE.txt",
    "CHANGELOG.md",
    "favicon.ico",
    "robots.txt",
    "sitemap.xml",
    "humans.txt",
    ".well-known/security.txt",
    "security.txt",
    "manifest.json",
    "crossdomain.xml",
    "browserconfig.xml",
    "keybase.txt",
    ".htaccess",
    "wp-login.php",
    "config.json",
    "config.yaml",
    "config.yml",
    "data.json",
    "feed.xml",
    "feed.json",
    "rss.xml",
    "atom.xml",
]

# Basenames probed three ways: bare ("/name"), as a directory ("/name/"),
# and combined with every extension in PROBE_EXTENSIONS ("/name.ext").
PROBE_BASENAMES = [
    "notes",
    "files",
    "data",
    "list",
    "links",
    "urls",
    "flights",
    "trips",
    "books",
    "music",
    "movies",
    "todo",
    "log",
    "dump",
    "backup",
    "archive",
    "export",
    "index",
    "about",
    "info",
    "contact",
    "resume",
    "cv",
    "keys",
    "pgp",
    "gpg",
    "pub",
    "id",
    "test",
    "temp",
    "tmp",
    "old",
    "new",
    "misc",
    "stuff",
    "media",
    "images",
    "img",
    "docs",
    "documents",
    "downloads",
    "assets",
    "static",
    "public",
    "private",
    "secret",
    "hidden",
    "api",
    "health",
    "status",
    "version",
    "changelog",
    "history",
    "passwords",
    "credentials",
    "tokens",
    "env",
    ".env",
    "config",
    "settings",
    "database",
    "db",
    "sql",
    "schema",
]

# Extensions appended to every PROBE_BASENAMES entry.
PROBE_EXTENSIONS = [
    ".txt",
    ".json",
    ".xml",
    ".csv",
    ".tsv",
    ".md",
    ".html",
    ".htm",
    ".pdf",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".log",
    ".sql",
    ".bak",
    ".gz",
    ".zip",
    ".tar",
    ".7z",
]

# Matches one entry line of a plain-text `tree`-style listing: skips the
# box-drawing/indent prefix (│ ├ └ ─ and whitespace) and captures the name.
# NOTE(review): lines with no such prefix (e.g. a bare root entry) will not
# match and are silently dropped — assumed intentional for this format.
TREE_ENTRY = re.compile(r"^[│├└─\s]+(.+)$")


def parse_tree_listing(text: str) -> tuple[list[str], list[str]]:
    """Parse a plain-text tree-style directory listing.

    Entry lines are expected to start with box-drawing / indentation
    characters (│ ├ └ ─ or whitespace) followed by the entry name; lines
    without such a prefix are ignored.  Names ending in "/" are classified
    as directories, everything else as files.

    Args:
        text: Raw listing body.

    Returns:
        A ``(files, dirs)`` tuple of entry names in order of appearance.
    """
    files: list[str] = []
    dirs: list[str] = []
    for line in text.splitlines():
        # re.match caches the compiled pattern, so this is cheap per line.
        m = re.match(r"^[│├└─\s]+(.+)$", line)
        if not m:
            continue
        name = m.group(1).strip()
        if not name:
            continue
        if name.endswith("/"):
            dirs.append(name)
        else:
            # Fixed: the original re-tested endswith("/") here, which is
            # always False inside this branch — dead code removed.
            files.append(name)
    return files, dirs


async def fetch_directory_listing(client: httpx.AsyncClient, url: str) -> tuple[list[str], list[str]]:
    """GET *url* and parse the body as a tree listing.

    Any transport error or non-200 status yields an empty ``([], [])``
    result so callers can treat missing listings as "nothing found".
    """
    try:
        response = await client.get(url, follow_redirects=True)
    except httpx.HTTPError:
        return [], []
    # Status comparison cannot raise httpx.HTTPError, so it lives
    # outside the try block.
    if response.status_code != 200:
        return [], []
    return parse_tree_listing(response.text)


async def crawl(client: httpx.AsyncClient, base: str, path: str = "/", found: set[str] | None = None) -> set[str]:
    """Recursively walk directory listings starting at *path*.

    Files are recorded as "<path><name>"; directories get a trailing "/"
    and are descended into at most once — *found* doubles as the visited
    set, which also guards against listing cycles.
    """
    found = set() if found is None else found

    if not path.endswith("/"):
        path = f"{path}/"

    listing_url = base.rstrip("/") + quote(path)
    files, subdirs = await fetch_directory_listing(client, listing_url)

    found.update(path + name for name in files)

    for entry in subdirs:
        child = path + entry
        if not child.endswith("/"):
            child += "/"
        if child in found:
            continue
        found.add(child)
        await crawl(client, base, child, found)

    return found


async def probe_path(client: httpx.AsyncClient, base: str, path: str, semaphore: asyncio.Semaphore) -> str | None:
    """GET base+path while holding *semaphore*.

    Returns *path* on HTTP 200; None on any other status or transport
    error (probing is best-effort by design).
    """
    target = base.rstrip("/") + quote(path)
    async with semaphore:
        try:
            response = await client.get(target, follow_redirects=True)
        except httpx.HTTPError:
            return None
        return path if response.status_code == 200 else None


async def brute_force_probe(client: httpx.AsyncClient, base: str, known: set[str], concurrency: int = 20) -> set[str]:
    """Probe common filenames and basename/extension combos concurrently.

    Candidates already present in *known* are skipped.  Returns the set
    of probed paths that answered HTTP 200.
    """
    candidates: set[str] = set()
    candidates.update(f"/{name}" for name in COMMON_FILENAMES)
    for stem in PROBE_BASENAMES:
        candidates.add(f"/{stem}")
        candidates.add(f"/{stem}/")
        candidates.update(f"/{stem}{ext}" for ext in PROBE_EXTENSIONS)
    # Dropping known paths after building the set is equivalent to
    # filtering each candidate as it is added.
    candidates -= known

    print(f"  Probing {len(candidates)} candidate paths...", file=sys.stderr)
    limiter = asyncio.Semaphore(concurrency)
    results = await asyncio.gather(
        *(probe_path(client, base, candidate, limiter) for candidate in candidates)
    )
    return {hit for hit in results if hit is not None}


async def crawl_discovered_dirs(client: httpx.AsyncClient, base: str, discovered: set[str]) -> set[str]:
    """Crawl the listing of every directory path ("…/") in *discovered*.

    Returns the union of all paths found beneath those directories.
    """
    extra: set[str] = set()
    for path in discovered:
        if path.endswith("/"):
            extra.update(await crawl(client, base, path, set()))
    return extra


async def run(base_url: str, output_file: str | None, concurrency: int = 20) -> None:
    """Index *base_url* and emit a JSON report.

    Three phases: crawl the root directory listings, brute-force probe
    common paths not already known, then crawl any directories the
    probing discovered.  The merged result is written as JSON to
    *output_file* (or stdout when None); progress goes to stderr.

    Args:
        base_url: Site root to index.
        output_file: Destination path, or None for stdout.
        concurrency: Max in-flight probe requests.
    """
    async with httpx.AsyncClient(timeout=15.0) as client:
        print(f"Crawling directory listings on {base_url}...", file=sys.stderr)
        crawled = await crawl(client, base_url)
        print(f"  Found {len(crawled)} paths from directory listings", file=sys.stderr)

        print("Brute-force probing common paths...", file=sys.stderr)
        probed = await brute_force_probe(client, base_url, crawled, concurrency)
        print(f"  Discovered {len(probed)} additional paths", file=sys.stderr)

        print("Crawling newly discovered directories...", file=sys.stderr)
        extra = await crawl_discovered_dirs(client, base_url, probed)
        print(f"  Found {len(extra)} more paths in new directories", file=sys.stderr)

        all_paths = crawled | probed | extra
        # Trailing "/" distinguishes directories from files throughout.
        files = sorted(p for p in all_paths if not p.endswith("/"))
        dirs = sorted(p for p in all_paths if p.endswith("/"))

        result = {
            "base_url": base_url,
            "indexed_at": datetime.now(timezone.utc).isoformat(),
            "total_files": len(files),
            "total_dirs": len(dirs),
            "directories": dirs,
            "files": files,
        }

        output = json.dumps(result, indent=2)
        if output_file:
            # Fix: explicit encoding — the platform default (e.g. cp1252 on
            # Windows) is not guaranteed to be UTF-8.
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(output)
                f.write("\n")
            print(f"Wrote index to {output_file}", file=sys.stderr)
        else:
            print(output)

        print(f"Total: {len(files)} files in {len(dirs)} directories", file=sys.stderr)


def main():
    """Parse CLI arguments and launch the async indexing run."""
    arg_parser = argparse.ArgumentParser(description="Index all files on cdn.frogg.ie")
    arg_parser.add_argument(
        "--url",
        default=BASE_URL,
        help="Base URL to index (default: %(default)s)",
    )
    arg_parser.add_argument("-o", "--output", help="Output file path (default: stdout)")
    arg_parser.add_argument(
        "--concurrency",
        type=int,
        default=20,
        help="Max concurrent requests (default: %(default)s)",
    )
    opts = arg_parser.parse_args()

    asyncio.run(run(opts.url, opts.output, opts.concurrency))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
