#!/usr/bin/env python3
"""Index all files on cdn.frogg.ie via directory crawling and brute-force probing."""

import argparse
import asyncio
import json
import re
import sys
from datetime import datetime, timezone
from urllib.parse import quote, urljoin

import httpx

# Default site to index; can be overridden on the command line via --url.
BASE_URL = "https://cdn.frogg.ie"

# Well-known root-level files probed unconditionally (no extension combos).
COMMON_FILENAMES = [
    "index.html",
    "index.htm",
    "index.txt",
    "README.md",
    "README.txt",
    "README",
    "LICENSE",
    "LICENSE.txt",
    "CHANGELOG.md",
    "favicon.ico",
    "robots.txt",
    "sitemap.xml",
    "humans.txt",
    ".well-known/security.txt",
    "security.txt",
    "manifest.json",
    "crossdomain.xml",
    "browserconfig.xml",
    "keybase.txt",
    ".htaccess",
    "wp-login.php",
    "config.json",
    "config.yaml",
    "config.yml",
    "data.json",
    "feed.xml",
    "feed.json",
    "rss.xml",
    "atom.xml",
]

# Basenames probed bare, as a directory ("/name/"), and with every extension
# in PROBE_EXTENSIONS. Duplicates across groups (e.g. "wishlist", "passwords")
# are harmless: the probe set is deduplicated before requests are issued.
PROBE_BASENAMES = [
    "notes", "files", "data", "list", "links", "urls", "flights", "trips",
    "books", "music", "movies", "todo", "log", "dump", "backup", "archive",
    "export", "index", "about", "info", "contact", "resume", "cv", "keys",
    "pgp", "gpg", "pub", "id", "test", "temp", "tmp", "old", "new", "misc",
    "stuff", "media", "images", "img", "docs", "documents", "downloads",
    "assets", "static", "public", "private", "secret", "hidden", "api",
    "health", "status", "version", "changelog", "history", "passwords",
    "credentials", "tokens", "env", ".env", "config", "settings", "database",
    "db", "sql", "schema",
    # personal / lifestyle
    "timeline", "diary", "journal", "blog", "posts", "thoughts", "ideas",
    "plans", "goals", "wishlist", "bucket", "recipes", "food", "diet",
    "weight", "fitness", "exercise", "workout", "sleep", "mood", "habits",
    "routine", "schedule", "calendar", "events", "birthdays", "dates",
    "addresses", "contacts", "friends", "family", "people", "names",
    "places", "travel", "countries", "cities", "hotels", "packing",
    "expenses", "budget", "finance", "money", "savings", "investments",
    "taxes", "insurance", "rent", "bills", "subscriptions", "purchases",
    "shopping", "inventory", "collections", "vinyl", "records", "albums",
    "playlists", "podcasts", "shows", "anime", "manga", "games", "gaming",
    "steam", "wishlist", "reviews", "ratings", "favorites", "bookmarks",
    # medical / health
    "meds", "medications", "prescriptions", "doses", "dosage", "pharmacy",
    "doctors", "appointments", "symptoms", "diagnosis", "labs", "bloodwork",
    "hormones", "hrt", "therapy", "mental", "anxiety", "depression",
    "adhd", "autism", "allergies", "vaccines", "surgery", "recovery",
    # tech / dev
    "servers", "hosts", "domains", "dns", "ips", "ssh", "certs",
    "passwords", "secrets", "keys", "api-keys", "dotfiles", "packages",
    "deps", "dependencies", "requirements", "setup", "install", "deploy",
    "docker", "compose", "ansible", "terraform", "nginx", "caddy",
    "postgres", "redis", "mongo", "backup", "cron", "jobs", "tasks",
    "projects", "repos", "git", "branches", "releases", "versions",
    "bugs", "issues", "features", "roadmap", "spec", "design", "arch",
    "stack", "tools", "software", "hardware", "specs", "benchmarks",
    # writing / reference
    "quotes", "poems", "stories", "drafts", "essays", "articles",
    "references", "bibliography", "sources", "research", "papers",
    "abstracts", "summaries", "notes", "snippets", "templates", "samples",
    "examples", "cheatsheet", "guide", "manual", "howto", "faq", "help",
    "readme", "license", "credits", "acknowledgments", "thanks",
    # substances / harm reduction (contextual to the PDFs on the CDN)
    "substances", "drugs", "dosages", "experiences", "reports", "trips",
    "combos", "interactions", "reagents", "testing", "safety", "harm",
    "reduction", "vendors", "sources", "legal", "legality", "scheduling",
    # identity / social
    "bio", "intro", "pronouns", "links", "social", "twitter", "mastodon",
    "fediverse", "discord", "matrix", "irc", "email", "pgp", "gpg",
    "keybase", "signal", "telegram", "phone", "vouch", "verify",
    # misc common file names
    "scratch", "temp", "draft", "wip", "random", "junk", "trash",
    "clipboard", "paste", "buffer", "queue", "inbox", "outbox", "sent",
    "received", "saved", "starred", "pinned", "archived", "deleted",
    "manifest", "sitemap", "changelog", "news", "updates", "announcements",
    "rules", "policy", "terms", "privacy", "cookies", "security",
    "metrics", "analytics", "stats", "counts", "logs", "audit", "access",
]

# Extensions appended to each PROBE_BASENAMES entry when building candidates.
PROBE_EXTENSIONS = [
    ".txt",
    ".json",
    ".xml",
    ".csv",
    ".tsv",
    ".md",
    ".html",
    ".htm",
    ".pdf",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".log",
    ".sql",
    ".bak",
    ".gz",
    ".zip",
    ".tar",
    ".7z",
]

# Matches one entry line of a plain-text `tree`-style listing: one or more
# box-drawing/whitespace prefix characters followed by the entry name.
TREE_ENTRY = re.compile(r"^[│├└─\s]+(.+)$")


def parse_tree_listing(text: str) -> tuple[list[str], list[str]]:
    """Parse a plain-text tree directory listing.

    Lines that do not carry the tree-drawing prefix (e.g. the root ``.``
    line or a summary footer) are ignored. Entries ending in ``/`` are
    classified as directories; everything else is a file.

    Returns:
        (files, dirs) — entry names in the order they appear.
    """
    files: list[str] = []
    dirs: list[str] = []
    for line in text.splitlines():
        m = TREE_ENTRY.match(line)
        if not m:
            continue
        name = m.group(1).strip()
        if not name:
            continue
        # Trailing slash is the directory marker in tree output.
        # (The original re-tested endswith("/") in the else branch; that
        # check was always False there, so the branch is a plain append.)
        if name.endswith("/"):
            dirs.append(name)
        else:
            files.append(name)
    return files, dirs


async def fetch_directory_listing(client: httpx.AsyncClient, url: str) -> tuple[list[str], list[str]]:
    """GET *url* and parse the body as a tree listing.

    Returns (files, dirs); any transport error or non-200 response
    yields ``([], [])`` so callers can treat failures as empty dirs.
    """
    try:
        resp = await client.get(url, follow_redirects=True)
    except httpx.HTTPError:
        return [], []
    if resp.status_code != 200:
        return [], []
    return parse_tree_listing(resp.text)


async def crawl(client: httpx.AsyncClient, base: str, path: str = "/", found: set[str] | None = None) -> set[str]:
    """Recursively walk directory listings starting at *path*.

    Accumulates every file path and directory path (trailing ``/``) into
    *found*; already-seen directories are skipped, which also guards
    against listing cycles. Returns the accumulated set.
    """
    found = set() if found is None else found

    if not path.endswith("/"):
        path = path + "/"

    listing_url = base.rstrip("/") + quote(path)
    files, subdirs = await fetch_directory_listing(client, listing_url)

    # Files are recorded with their full path relative to the base.
    found.update(path + name for name in files)

    for entry in subdirs:
        child = path + entry
        if not child.endswith("/"):
            child += "/"
        if child in found:
            continue
        found.add(child)
        await crawl(client, base, child, found)

    return found


async def probe_path(client: httpx.AsyncClient, base: str, path: str, semaphore: asyncio.Semaphore) -> str | None:
    """GET base+path under *semaphore*; return *path* on HTTP 200, else None.

    Transport errors are swallowed — a failed probe is simply "not found".
    """
    target = base.rstrip("/") + quote(path)
    async with semaphore:
        try:
            response = await client.get(target, follow_redirects=True)
        except httpx.HTTPError:
            return None
        return path if response.status_code == 200 else None


async def brute_force_probe(client: httpx.AsyncClient, base: str, known: set[str], concurrency: int = 20, extra_words: list[str] | None = None) -> set[str]:
    """Probe for common filenames and basename+extension combos.

    Builds the candidate set from COMMON_FILENAMES plus every basename
    (PROBE_BASENAMES merged with *extra_words*) bare, as a directory, and
    with each PROBE_EXTENSIONS suffix; paths already in *known* are not
    re-probed. Returns the set of paths that answered HTTP 200.
    """
    candidates: set[str] = {f"/{name}" for name in COMMON_FILENAMES}

    # Set union dedupes both the built-in list and the user-supplied words.
    basenames = set(PROBE_BASENAMES) | set(extra_words or [])
    for word in basenames:
        candidates.add(f"/{word}")
        candidates.add(f"/{word}/")
        candidates.update(f"/{word}{ext}" for ext in PROBE_EXTENSIONS)

    # Subtracting after the fact is equivalent to the per-item membership
    # checks: anything we already know about is never requested again.
    candidates -= known

    print(f"  Probing {len(candidates)} candidate paths...", file=sys.stderr)
    gate = asyncio.Semaphore(concurrency)
    results = await asyncio.gather(
        *(probe_path(client, base, candidate, gate) for candidate in candidates)
    )

    return {hit for hit in results if hit is not None}


async def crawl_discovered_dirs(client: httpx.AsyncClient, base: str, discovered: set[str]) -> set[str]:
    """Crawl the listing of every directory path in *discovered*.

    Returns the union of all paths found inside those directories.
    """
    extra: set[str] = set()
    for directory in (p for p in discovered if p.endswith("/")):
        extra |= await crawl(client, base, directory, set())
    return extra


def load_wordlist(path: str) -> list[str]:
    """Load extra probe words from *path*, one per line.

    Blank lines and lines starting with ``#`` (comments) are skipped;
    surrounding whitespace is stripped from each word.
    """
    with open(path) as fh:
        return [
            stripped
            for stripped in (raw.strip() for raw in fh)
            if stripped and not stripped.startswith("#")
        ]


async def run(base_url: str, output_file: str | None, concurrency: int = 20, wordlist_path: str | None = None) -> None:
    """Index *base_url* in three passes and emit a JSON report.

    Passes: (1) recursive directory-listing crawl, (2) brute-force probe of
    common paths not already known, (3) crawl of any directories the probe
    discovered. The JSON index goes to *output_file* if given, otherwise to
    stdout; progress messages always go to stderr.
    """
    extra_words = load_wordlist(wordlist_path) if wordlist_path else None
    async with httpx.AsyncClient(timeout=15.0) as client:
        print(f"Crawling directory listings on {base_url}...", file=sys.stderr)
        crawled = await crawl(client, base_url)
        print(f"  Found {len(crawled)} paths from directory listings", file=sys.stderr)

        print("Brute-force probing common paths...", file=sys.stderr)
        probed = await brute_force_probe(client, base_url, crawled, concurrency, extra_words)
        print(f"  Discovered {len(probed)} additional paths", file=sys.stderr)

        print("Crawling newly discovered directories...", file=sys.stderr)
        extra = await crawl_discovered_dirs(client, base_url, probed)
        print(f"  Found {len(extra)} more paths in new directories", file=sys.stderr)

        # Merge all passes; a trailing "/" distinguishes directories from files.
        all_paths = crawled | probed | extra
        files = sorted(p for p in all_paths if not p.endswith("/"))
        dirs = sorted(p for p in all_paths if p.endswith("/"))

        result = {
            "base_url": base_url,
            "indexed_at": datetime.now(timezone.utc).isoformat(),
            "total_files": len(files),
            "total_dirs": len(dirs),
            "directories": dirs,
            "files": files,
        }

        output = json.dumps(result, indent=2)
        if output_file:
            with open(output_file, "w") as f:
                f.write(output)
                f.write("\n")
            print(f"Wrote index to {output_file}", file=sys.stderr)
        else:
            print(output)

        print(f"Total: {len(files)} files in {len(dirs)} directories", file=sys.stderr)


def main():
    """CLI entry point: parse arguments and launch the async indexer."""
    ap = argparse.ArgumentParser(description="Index all files on cdn.frogg.ie")
    ap.add_argument("--url", default=BASE_URL, help="Base URL to index (default: %(default)s)")
    ap.add_argument("-o", "--output", help="Output file path (default: stdout)")
    ap.add_argument("--concurrency", type=int, default=20, help="Max concurrent requests (default: %(default)s)")
    ap.add_argument("-w", "--wordlist", help="Path to extra wordlist file (one word per line)")
    opts = ap.parse_args()

    asyncio.run(run(opts.url, opts.output, opts.concurrency, opts.wordlist))


if __name__ == "__main__":
    main()
