#!/usr/bin/env python3
"""
20min.ch Comment Pipeline

An automated pipeline that:
1. Finds recent articles on 20min.ch
2. Extracts article IDs
3. Fetches comments for each article using the API
4. Saves the data as JSON files
5. Performs basic analysis on the comments
"""

import os
import sys
import json
import time
import logging
import argparse
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Import functionality from our comment_fetcher
from comment_fetcher import fetch_comments, save_comments_to_file

# Configure logging: everything goes both to pipeline.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("pipeline.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://www.20min.ch"
# Candidate listing pages for the default 'politik' category; tried in order
# by find_articles() until one yields story links (last entry is the site root).
POLITICS_URLS = [
    f"{BASE_URL}/schweiz/politik/",
    f"{BASE_URL}/politik/",
    f"{BASE_URL}/news/politik/",
    BASE_URL,  # Fallback to main page
]
# Output directories are created eagerly at import time so later writes
# cannot fail on a missing directory.
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(exist_ok=True)
ANALYSIS_DIR = Path("analysis")
ANALYSIS_DIR.mkdir(exist_ok=True)

# Browser-like headers sent with every scraping request — presumably to avoid
# being served a degraded/blocked response for the default requests UA
# (NOTE(review): assumption; confirm against the site's behavior).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": BASE_URL,
    "DNT": "1",
}

def find_articles(max_articles=10, category='politik'):
    """
    Find recent articles from 20min.ch in the specified category.

    Tries a list of candidate listing URLs in order, scrapes links whose
    href contains '/story/', and stops at the first URL that yields any
    articles. Results are de-duplicated by article ID.

    Args:
        max_articles: Maximum number of articles to find
        category: Article category to search (politik, schweiz, etc.)

    Returns:
        List of dictionaries with article info (title, url, id); may be
        empty if every candidate URL fails or yields no parsable links.
    """
    logger.info(f"Finding up to {max_articles} articles in category: {category}")
    articles = []

    # Build a FRESH list of candidate URLs. The previous version assigned
    # POLITICS_URLS itself and then appended to it, mutating the module-level
    # constant on every call (it kept growing with duplicate fallbacks).
    if category == 'politik':
        urls_to_try = list(POLITICS_URLS)
    else:
        urls_to_try = [f"{BASE_URL}/{category}/"]
    # Always fall back to the main page, but don't list it twice.
    if BASE_URL not in urls_to_try:
        urls_to_try.append(BASE_URL)

    for url in urls_to_try:
        logger.info(f"Trying URL: {url}")

        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            if response.status_code != 200:
                logger.warning(f"Failed to fetch {url}: Status code {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # Story links usually embed the numeric article ID in the href.
            article_links = soup.select("a[href*='/story/']")
            logger.info(f"Found {len(article_links)} article links")

            for link in article_links:
                if len(articles) >= max_articles:
                    break

                href = link.get('href')
                if not href:
                    continue

                # Resolve relative hrefs against the site root.
                article_url = urljoin(BASE_URL, href)

                article_id = extract_article_id(article_url)
                if not article_id:
                    logger.warning(f"Could not extract ID from {article_url}")
                    continue

                # Title: prefer a heading element inside the link, then the
                # link text, then a humanized version of the URL slug.
                title_element = link.select_one("h2, h3, .title")
                title = title_element.text.strip() if title_element else None

                if not title:
                    title = link.text.strip()

                if not title:
                    title = href.split('/')[-1].replace('-', ' ').title()

                article_info = {
                    "id": article_id,
                    "url": article_url,
                    "title": title
                }

                # The same story can be linked several times on one page.
                if not any(a["id"] == article_id for a in articles):
                    articles.append(article_info)
                    logger.debug(f"Added article: {title} (ID: {article_id})")

            if articles:
                # Found some articles, no need to try other URLs
                break

        except Exception as e:
            # Broad catch: a single bad URL must not abort the whole search.
            logger.error(f"Error finding articles from {url}: {e}")

    logger.info(f"Found {len(articles)} articles")
    return articles

def extract_article_id(url):
    """
    Extract the numeric article ID from a 20min.ch URL.

    Args:
        url: The URL of the article

    Returns:
        Article ID as string or None if not found
    """
    # Pattern 1: canonical story URLs end in "-<id>",
    # e.g. /story/some-title-12345678 (optionally with a trailing slash).
    match = re.search(r'/story/.*?-(\d+)/?$', url)
    if match:
        return match.group(1)

    # Pattern 2: fall back to the LAST numeric path component. findall is
    # required here — the old re.search returned the FIRST numeric match,
    # so date-style paths like /2024/05/... yielded the year, not the ID.
    matches = re.findall(r'(\d+)(?:/|$)', url)
    if matches:
        return matches[-1]

    return None

def fetch_all_comments(articles, output_dir=OUTPUT_DIR, tenant_id=6):
    """
    Fetch comments for all articles and save to JSON files.

    Articles whose comment file already exists on disk are loaded from
    that cache instead of hitting the API again.

    Args:
        articles: List of article dictionaries (id, url, title)
        output_dir: Directory to save comment files
        tenant_id: Tenant ID for API (6 for German, 7 for French)

    Returns:
        List of articles with comment data added
    """
    logger.info(f"Fetching comments for {len(articles)} articles")
    enriched = []

    for article in tqdm(articles, desc="Fetching comments"):
        try:
            article_id = article["id"]

            # Derive a filesystem-safe filename: alphanumerics kept,
            # everything else replaced by underscores, capped at 50 chars.
            sanitized = "".join(ch if ch.isalnum() else "_" for ch in article["title"])[:50]
            output_path = output_dir / f"{sanitized}_{article_id}.json"

            if output_path.exists():
                # Cached from a previous run — reuse without delay.
                logger.info(f"Comments for article {article_id} already exist at {output_path}")
                with open(output_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            else:
                logger.info(f"Fetching comments for article {article_id}: {article['title']}")
                data = fetch_comments(article_id, tenant_id)

                if not data:
                    logger.warning(f"No comments found for article {article_id}")
                    # Be nice to the API - add a small delay
                    time.sleep(2)
                    continue

                save_comments_to_file(data, output_path)
                # Be nice to the API - add a small delay
                time.sleep(2)

            article["comments_data"] = data
            article["comment_count"] = len(data.get("comments", []))
            enriched.append(article)

        except Exception as e:
            # One failing article must not abort the whole batch.
            logger.error(f"Error processing article {article['id']}: {e}")

    logger.info(f"Successfully fetched comments for {len(enriched)} articles")
    return enriched

def analyze_comments(articles_with_comments):
    """
    Perform analysis on the fetched comments.

    Flattens all top-level comments and replies into tabular rows,
    computes per-article and aggregate statistics, and writes timestamped
    CSV/PNG/JSON artifacts into ANALYSIS_DIR.

    Args:
        articles_with_comments: List of articles with their comment data
            (each needs "id", "title", and optionally "comments_data")

    Returns:
        Dictionary of analysis results (empty if nothing to analyze)
    """
    logger.info("Analyzing comments")

    if not articles_with_comments:
        logger.warning("No articles with comments to analyze")
        return {}

    # Flattened rows for the DataFrames plus one stats row per article.
    all_comments = []
    all_replies = []
    article_stats = []

    for article in articles_with_comments:
        article_id = article["id"]
        title = article["title"]

        comments_data = article.get("comments_data", {})
        comments = comments_data.get("comments", [])

        # Skip if no comments
        if not comments:
            continue

        comment_count = len(comments)
        reply_count = sum(len(comment.get("replies", [])) for comment in comments)
        total_reactions = {}  # reaction type -> count across this article

        for comment in comments:
            # Top-level comment row, tagged with its article metadata.
            comment_data = {
                "article_id": article_id,
                "article_title": title,
                "comment_id": comment.get("id"),
                "author": comment.get("authorNickname", "Anonymous"),
                "text": comment.get("body", ""),
                "timestamp": comment.get("createdAt"),
                "is_reply": False,
                "reply_count": len(comment.get("replies", []))
            }

            # One dynamic column per reaction type (reaction_<type>).
            reactions = comment.get("reactions", {})
            for reaction_type, count in reactions.items():
                comment_data[f"reaction_{reaction_type}"] = count
                total_reactions[reaction_type] = total_reactions.get(reaction_type, 0) + count

            all_comments.append(comment_data)

            # Replies get their own rows, linked to the parent comment.
            for reply in comment.get("replies", []):
                reply_data = {
                    "article_id": article_id,
                    "article_title": title,
                    "comment_id": reply.get("id"),
                    "parent_id": comment.get("id"),
                    "author": reply.get("authorNickname", "Anonymous"),
                    "text": reply.get("body", ""),
                    "timestamp": reply.get("createdAt"),
                    "is_reply": True
                }

                # Add reaction counts for reply
                reply_reactions = reply.get("reactions", {})
                for reaction_type, count in reply_reactions.items():
                    reply_data[f"reaction_{reaction_type}"] = count
                    total_reactions[reaction_type] = total_reactions.get(reaction_type, 0) + count

                all_replies.append(reply_data)

        # Per-article summary row.
        article_stat = {
            "article_id": article_id,
            "title": title,
            "comment_count": comment_count,
            "reply_count": reply_count,
            "total_interaction_count": comment_count + reply_count
        }

        for reaction_type, count in total_reactions.items():
            article_stat[f"total_{reaction_type}"] = count

        article_stats.append(article_stat)

    # Create DataFrames for analysis
    comments_df = pd.DataFrame(all_comments)
    replies_df = pd.DataFrame(all_replies)
    articles_df = pd.DataFrame(article_stats)

    analysis = {}

    if not comments_df.empty:
        # 1. Total statistics
        analysis["total_comments"] = len(comments_df)
        analysis["total_replies"] = len(replies_df)
        analysis["total_interactions"] = len(comments_df) + len(replies_df)

        # 2. Top articles by total interaction count
        top_articles = articles_df.sort_values(by="total_interaction_count", ascending=False)
        analysis["top_articles"] = top_articles.to_dict(orient="records")

        # 3. Top commenters (by number of top-level comments)
        if "author" in comments_df.columns:
            top_commenters = comments_df["author"].value_counts().head(10).to_dict()
            analysis["top_commenters"] = [{"author": author, "count": count}
                                         for author, count in top_commenters.items()]

        # 4. Reaction statistics. Cast to int: Series.sum() returns a numpy
        # scalar, which the stdlib json encoder rejects with a TypeError.
        reaction_cols = [col for col in comments_df.columns if col.startswith("reaction_")]
        if reaction_cols:
            reaction_stats = {col.replace("reaction_", ""): int(comments_df[col].sum())
                             for col in reaction_cols if not comments_df[col].isna().all()}
            analysis["reaction_stats"] = reaction_stats

        # 5. Save analysis files, timestamped so runs don't clobber each other.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save DataFrames
        comments_df.to_csv(ANALYSIS_DIR / f"comments_{timestamp}.csv", index=False)
        if not replies_df.empty:
            replies_df.to_csv(ANALYSIS_DIR / f"replies_{timestamp}.csv", index=False)
        articles_df.to_csv(ANALYSIS_DIR / f"articles_{timestamp}.csv", index=False)

        # Generate visualizations. Figures are explicitly closed after saving;
        # the old code leaked open matplotlib figures on every run.
        if not articles_df.empty:
            # Plot comment counts by article
            plt.figure(figsize=(12, 6))
            articles_plot = articles_df.sort_values("comment_count", ascending=False).head(10)
            articles_plot.plot(kind="bar", x="title", y="comment_count")
            plt.title("Comments by Article")
            plt.xlabel("Article")
            plt.ylabel("Number of Comments")
            plt.tight_layout()
            plt.savefig(ANALYSIS_DIR / f"comments_by_article_{timestamp}.png")
            plt.close("all")

            # Plot reaction types
            if "reaction_stats" in analysis:
                plt.figure(figsize=(10, 6))
                reaction_data = analysis["reaction_stats"]
                plt.bar(reaction_data.keys(), reaction_data.values())
                plt.title("Reaction Types")
                plt.xlabel("Reaction Type")
                plt.ylabel("Count")
                plt.tight_layout()
                plt.savefig(ANALYSIS_DIR / f"reaction_types_{timestamp}.png")
                plt.close("all")

        # Save analysis JSON. default=int converts any remaining numpy
        # integer scalars (e.g. from to_dict/value_counts) that the stdlib
        # encoder cannot serialize directly.
        with open(ANALYSIS_DIR / f"analysis_{timestamp}.json", 'w', encoding='utf-8') as f:
            json.dump(analysis, f, ensure_ascii=False, indent=2, default=int)

    logger.info("Analysis completed")
    return analysis

def main():
    """Run the full pipeline; return a process exit code (0 ok, 1 failure)."""
    parser = argparse.ArgumentParser(
        description="""
        20min.ch Comment Pipeline
        -----------------------
        An automated pipeline to find articles, fetch comments via API, and analyze the results.
        
        Examples:
        - Run the complete pipeline with default settings:
          python comment_pipeline.py
          
        - Specify the number of articles to process:
          python comment_pipeline.py --articles 20
          
        - Specify a category:
          python comment_pipeline.py --category wirtschaft
        """,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--articles", type=int, default=10, 
                       help="Maximum number of articles to process (default: 10)")
    parser.add_argument("--category", default="politik", 
                       help="Category to search for articles (default: politik)")
    parser.add_argument("--tenant", type=int, default=6, 
                       help="Tenant ID for API (default: 6 for German, 7 for French)")
    parser.add_argument("--skip-analysis", action="store_true", 
                       help="Skip the analysis step")
    parser.add_argument("--output-dir", type=str, default=str(OUTPUT_DIR), 
                       help=f"Directory to save comment files (default: {OUTPUT_DIR})")

    args = parser.parse_args()

    # Create the output directory. parents=True so a nested --output-dir
    # (e.g. runs/today/data) doesn't crash with FileNotFoundError.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Step 1: Find articles
        logger.info("Starting 20min.ch comment pipeline")
        articles = find_articles(max_articles=args.articles, category=args.category)

        if not articles:
            logger.error("No articles found, exiting")
            return 1

        # Step 2: Fetch comments for all articles
        articles_with_comments = fetch_all_comments(articles, output_dir, args.tenant)

        if not articles_with_comments:
            # Not an error: the articles exist, they just have no comments.
            logger.warning("No comments found for any articles")
            return 0

        # Step 3: Analyze comments (unless skipped)
        if not args.skip_analysis:
            analysis = analyze_comments(articles_with_comments)

            if analysis:
                logger.info("Analysis summary:")
                logger.info(f"Total comments: {analysis.get('total_comments', 0)}")
                logger.info(f"Total replies: {analysis.get('total_replies', 0)}")

                # Print top articles by comment count
                if "top_articles" in analysis and analysis["top_articles"]:
                    top_article = analysis["top_articles"][0]
                    logger.info(f"Most commented article: {top_article['title']} "
                              f"({top_article.get('comment_count', 0)} comments)")

        logger.info("Pipeline completed successfully")
        return 0

    except Exception as e:
        # Top-level boundary: log with traceback, signal failure via exit code.
        logger.error(f"Pipeline error: {e}", exc_info=True)
        return 1
# Script entry point: run the pipeline and propagate its exit code.
if __name__ == "__main__":
    sys.exit(main())