#!/usr/bin/env python3
"""
Test script to access 20min.ch with browser-like headers and behavior.
"""

import json
import logging
import random
import re
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Configure logging
# INFO-level logging to stderr with timestamps; module-level logger per
# stdlib convention.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Example URL
# Sample 20min.ch article fetched when the script is run directly.
ARTICLE_URL = "https://www.20min.ch/story/20-laender-trumps-prioritaetenliste-fuer-zollverhandlungen-enthuellt-103339848"

# Expanded browser-like headers
# Static request headers imitating desktop Chrome 116 on Windows, including
# Client-Hints (Sec-CH-UA*) and fetch-metadata (Sec-Fetch-*) fields, to
# reduce the chance of the site's bot detection rejecting the request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "en-US,en;q=0.9,de;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Sec-CH-UA": "\"Chromium\";v=\"116\", \"Not)A;Brand\";v=\"24\", \"Google Chrome\";v=\"116\"",
    "Sec-CH-UA-Mobile": "?0",
    "Sec-CH-UA-Platform": "\"Windows\"",
}

def get_article_info(url):
    """Fetch *url* with a browser-like session and return basic article info.

    Visits the site's landing page first so cookies carry over, then fetches
    the article, saves the raw HTML to ``raw_article.html``, and probes the
    markup for comment counts and comment containers.

    Returns:
        dict with keys ``url``, ``title``, ``comment_count_text`` and
        ``comments_found``, or ``None`` if the article request fails or an
        unexpected error occurs (the error is logged with its traceback).
    """
    logger.info("Fetching article: %s", url)

    try:
        # One shared session so cookies set by the landing page are sent
        # with the article request, mimicking a real browser visit.
        session = requests.Session()

        logger.info("Visiting main page to establish session")
        main_response = session.get("https://www.20min.ch/", headers=HEADERS, timeout=30)
        if main_response.status_code != 200:
            # Non-fatal: the article fetch may still succeed without cookies.
            logger.error("Failed to access main page: %s", main_response.status_code)

        # Small human-like pause between the two requests.
        time.sleep(random.uniform(2, 4))

        logger.info("Now visiting the article page")
        response = session.get(url, headers=HEADERS, timeout=30)
        if response.status_code != 200:
            logger.error("Failed to fetch article: %s", response.status_code)
            return None

        # Keep the raw HTML on disk so the markup can be inspected manually.
        Path("raw_article.html").write_text(response.text, encoding="utf-8")
        logger.info("Saved raw HTML to raw_article.html")

        soup = BeautifulSoup(response.text, "html.parser")

        title_element = soup.select_one("h1")
        title = title_element.text.strip() if title_element else "Unknown Title"

        logger.info("Looking for comments")
        comment_count_text = _find_comment_count(soup)
        comments_found = _detect_comments(soup)

        return {
            "url": url,
            "title": title,
            "comment_count_text": comment_count_text,
            "comments_found": comments_found,
        }

    except Exception:
        # logger.exception preserves the traceback, which logging only
        # str(e) (as the previous version did) would discard.
        logger.exception("Error while fetching %s", url)
        return None


def _find_comment_count(soup):
    """Return the first comment-count text found in *soup*, or ``None``.

    Tries structured selectors first, then falls back to scanning for
    German "NN Kommentare" text anywhere in the document. Stops at the
    first hit in both phases (the old element loop kept the LAST match).
    """
    element = soup.select_one("span[data-testid='comment-count'], .comment-count")
    if element:
        text = element.text.strip()
        logger.info("Found comment count element: %s", text)
        return text

    for node in soup.find_all(string=re.compile(r"\d+\s*Kommentare")):
        text = node.strip()
        logger.info("Found comment text via regex: %s", text)
        return text
    return None


def _detect_comments(soup):
    """Return True if any known comment-container selector matches *soup*.

    Logs the hit count per selector, plus structural details of the first
    matching element to help identify the site's comment markup.
    """
    comment_selectors = [
        ".comment",
        "[data-testid='comment']",
        ".comment-item",
        "#comments .comment",
        ".comments-section .comment",
    ]
    for selector in comment_selectors:
        elements = soup.select(selector)
        logger.info("Selector '%s' found %d elements", selector, len(elements))
        if elements:
            first = elements[0]
            logger.info(
                "Sample comment HTML structure: %s with classes %s",
                first.name,
                first.get("class", []),
            )
            logger.info("Sample comment text content: %s...", first.text[:100])
            return True
    return False

def main():
    """Entry point: fetch the example article and log each field of the result."""
    info = get_article_info(ARTICLE_URL)
    if not info:
        # Nothing to report — the fetch failed and already logged why.
        return
    logger.info("Article info:")
    for key in info:
        logger.info(f"  {key}: {info[key]}")


if __name__ == "__main__":
    main()