#!/usr/bin/env python3
"""Scrape all tweets from @ARCYCLOHEXADOLL via nitter.net and save as JSON."""

import json
import re
import sys
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

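# nitter.net is a public instance; point NITTER elsewhere if it is down or
# rate-limiting.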
NITTER = "https://nitter.net"
USERNAME = "ARCYCLOHEXADOLL"
OUTPUT = "/home/josie/notes/arcyclohexadoll_tweets.json"
MAX_RETRIES = 5
RETRY_BACKOFF = 3


def parse_stat(text):
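    """Parse a stat count like "1,234" into an int, returning 0 for blanks."""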
    if not text:
        return 0
    text = text.strip().replace(",", "")
    if not text:
        return 0
    try:
        return int(text)
    except ValueError:
        return 0


def parse_tweets(soup):
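    """Extract one dict per tweet from a parsed nitter timeline page."""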
    tweets = []
    for item in soup.select(".timeline-item:not(.show-more)"):
        link_el = item.select_one(".tweet-link")
        if not link_el:
            continue

        href = link_el.get("href", "")
        match = re.search(r"/status/(\d+)", href)
        if not match:
            continue
        tweet_id = match.group(1)

        text_el = item.select_one(".tweet-content")
        # Use a space separator so adjacent nodes (text, links, hashtags)
        # don't get jammed together by strip=True alone.
        text = text_el.get_text(" ", strip=True) if text_el else ""

        date_el = item.select_one(".tweet-date a")
        date_title = date_el.get("title", "") if date_el else ""

        # Nitter renders stats as comment/retweet/quote/heart icons. Match on
        # the icon class rather than position so a quote count, when present,
        # is not misread as likes.
        replies = retweets = likes = 0
        for stat in item.select(".tweet-stat"):
            val = parse_stat(stat.get_text(strip=True))
            if stat.select_one(".icon-comment"):
                replies = val
            elif stat.select_one(".icon-retweet"):
                retweets = val
            elif stat.select_one(".icon-heart"):
                likes = val

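        # Collect media attachments; nitter hrefs are relative to the instance.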
        media = []
        for img in item.select(".attachment.image a.still-image"):
            img_href = img.get("href", "")
            if img_href:
                media.append({"type": "image", "url": NITTER + img_href})
        for vid in item.select(".attachment.video-container"):
            vid_src = vid.select_one("source")
            src = vid_src.get("src", "") if vid_src else ""
            if src:
                media.append({"type": "video", "url": NITTER + src})

        tweets.append({
            "id": tweet_id,
            "url": f"https://x.com/{USERNAME}/status/{tweet_id}",
            "text": text,
            "date": date_title,
            "likes": likes,
            "retweets": retweets,
            "replies": replies,
            "media": media,
        })

    return tweets


def get_next_cursor(soup):
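    """Return the relative href of the "Load more" link, or None at the end."""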
    show_more = soup.select(".show-more a")
    for link in show_more:
        href = link.get("href", "")
        if "cursor=" in href:
            return href
    return None


def main():
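    """Walk the timeline page by page and write every tweet to OUTPUT."""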
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    })

    all_tweets = []
    seen_ids = set()
    url = f"{NITTER}/{USERNAME}"
    page_num = 0

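    # Follow nitter's "Load more" cursor links until a page yields nothing
    # new or no cursor remains.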
    while url:
        page_num += 1
        print(f"Page {page_num}: {url}")

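        # Fetch with retries: public instances intermittently return empty
        # bodies or transient HTTP errors.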
        resp = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = session.get(url, timeout=30)
                resp.raise_for_status()
                if resp.text.strip():
                    break
                print(f"  Attempt {attempt}/{MAX_RETRIES}: empty response, retrying...")
            except requests.RequestException as e:
                print(f"  Attempt {attempt}/{MAX_RETRIES}: {e}")
            resp = None  # this attempt failed; clear any partial response
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_BACKOFF * attempt)  # linear backoff: 3s, 6s, 9s, ...

        if not resp or not resp.text.strip():
            print("  All retries exhausted, stopping.")
            break

        soup = BeautifulSoup(resp.text, "lxml")
        tweets = parse_tweets(soup)

        new_count = 0
        for t in tweets:
            if t["id"] not in seen_ids:
                seen_ids.add(t["id"])
                all_tweets.append(t)
                new_count += 1

        print(f"  Found {len(tweets)} tweets, {new_count} new (total: {len(all_tweets)})")

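        # If every tweet on the page was already seen, the cursor is no longer
        # advancing; stop instead of looping forever.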
        if new_count == 0:
            print("  No new tweets, stopping.")
            break

        next_path = get_next_cursor(soup)
        if next_path:
            # The "Load more" href is relative (e.g. "?cursor=..."), so resolve
            # it against the URL we just fetched rather than the bare host.
            url = urljoin(resp.url, next_path)
        else:
            print("  No more pages.")
            break

        time.sleep(1)  # brief pause between pages to go easy on the instance

    if not all_tweets:
        print("No tweets scraped.", file=sys.stderr)
        sys.exit(1)

    # Tweet IDs are numeric snowflakes; compare as ints so a lexicographic
    # string sort doesn't misorder IDs of different lengths. Newest first.
    all_tweets.sort(key=lambda t: int(t["id"]), reverse=True)

    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_tweets, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(all_tweets)} tweets to {OUTPUT}")


if __name__ == "__main__":
    main()
