"""
Data processor utility functions for the 20min.ch scraper project.

This module provides functions for loading, processing, and analyzing the
scraped article and comment data.
"""

import os
import json
import glob
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional, Union

import pandas as pd
import networkx as nx

class DataProcessor:
    """Process and analyze scraped data from 20min.ch."""

    # Fixed column layouts so that an empty data directory still yields
    # well-formed DataFrames (an empty pd.DataFrame([]) has no columns,
    # which would break downstream column access).
    _ARTICLE_COLUMNS = ["url", "title", "published_date", "scraped_date", "comment_count"]
    _COMMENT_COLUMNS = [
        "article_url",
        "article_title",
        "comment_id",
        "author",
        "content",
        "timestamp",
        "parent_id",
    ]

    def __init__(self, data_dir: str = "data"):
        """
        Initialize the data processor.

        Args:
            data_dir: Directory containing JSON data files

        Raises:
            ValueError: If ``data_dir`` does not exist.
        """
        self.data_dir = Path(data_dir)
        if not self.data_dir.exists():
            raise ValueError(f"Data directory {data_dir} does not exist")

    def list_data_files(self) -> List[Path]:
        """
        List all JSON data files in the data directory.

        Returns:
            Sorted list of paths to ``*.json`` files (non-recursive).
        """
        return sorted(self.data_dir.glob("*.json"))

    def load_article(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Load a single article from a JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            Article data dictionary

        Raises:
            OSError: If the file cannot be read.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_all_articles(self) -> List[Dict[str, Any]]:
        """
        Load all articles from the data directory.

        Files that fail to load are reported and skipped (best-effort),
        so one corrupt file does not abort the whole run.

        Returns:
            List of article data dictionaries
        """
        articles = []
        for file_path in self.list_data_files():
            try:
                articles.append(self.load_article(file_path))
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

        return articles

    def articles_to_dataframe(self) -> pd.DataFrame:
        """
        Convert article data to a pandas DataFrame.

        Returns:
            DataFrame with one row per article (comments excluded); date
            columns are parsed to datetime with unparseable values as NaT.
        """
        articles = self.load_all_articles()

        # Extract article metadata (excluding comments)
        article_data = [
            {
                "url": article.get("url"),
                "title": article.get("title"),
                "published_date": article.get("published_date"),
                "scraped_date": article.get("scraped_date"),
                "comment_count": len(article.get("comments", [])),
            }
            for article in articles
        ]

        # Explicit columns keep the frame well-formed even with zero articles.
        df = pd.DataFrame(article_data, columns=self._ARTICLE_COLUMNS)

        # Convert date columns to datetime; bad values become NaT.
        for col in ["published_date", "scraped_date"]:
            df[col] = pd.to_datetime(df[col], errors="coerce")

        return df

    def comments_to_dataframe(self) -> pd.DataFrame:
        """
        Convert all comments to a pandas DataFrame.

        Returns:
            DataFrame with one row per comment, annotated with the URL and
            title of the article it belongs to.
        """
        articles = self.load_all_articles()

        # Flatten the nested article -> comments structure.
        comment_data = []
        for article in articles:
            article_url = article.get("url")
            article_title = article.get("title")

            for comment in article.get("comments", []):
                comment_data.append({
                    "article_url": article_url,
                    "article_title": article_title,
                    "comment_id": comment.get("id"),
                    "author": comment.get("author"),
                    "content": comment.get("content"),
                    "timestamp": comment.get("timestamp"),
                    "parent_id": comment.get("parent_id"),
                })

        # Explicit columns keep the frame well-formed even with zero comments.
        df = pd.DataFrame(comment_data, columns=self._COMMENT_COLUMNS)

        # Convert timestamp to datetime; unparseable values become NaT.
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

        return df

    def build_comment_network(self) -> "nx.DiGraph":
        """
        Build a directed graph of comment relationships.

        Returns:
            NetworkX DiGraph where nodes are commenters (NaN/missing authors
            are skipped) and an edge replier -> parent_author carries a
            ``weight`` counting how many times that reply happened.
        """
        comments_df = self.comments_to_dataframe()

        G = nx.DiGraph()
        if comments_df.empty:
            return G

        # One node per known author, annotated with their total comment count.
        for author, count in comments_df["author"].dropna().value_counts().items():
            G.add_node(author, comment_count=int(count))

        # Map comment_id -> author once (first occurrence wins), instead of
        # scanning the whole frame for every reply (O(n) instead of O(n^2)).
        id_to_author = (
            comments_df.dropna(subset=["comment_id"])
            .drop_duplicates(subset="comment_id")
            .set_index("comment_id")["author"]
            .to_dict()
        )

        # Add/increment a weighted edge for each reply with a resolvable parent.
        reply_rows = comments_df[comments_df["parent_id"].notna()]
        for _, row in reply_rows.iterrows():
            replier = row["author"]
            parent_author = id_to_author.get(row["parent_id"])
            if parent_author is None or pd.isna(replier) or pd.isna(parent_author):
                continue
            if G.has_edge(replier, parent_author):
                G[replier][parent_author]["weight"] += 1
            else:
                G.add_edge(replier, parent_author, weight=1)

        return G

    def get_most_active_commenters(self, limit: int = 10) -> pd.DataFrame:
        """
        Get the most active commenters across all articles.

        Args:
            limit: Maximum number of commenters to return

        Returns:
            DataFrame with columns ``author``, ``comment_count``,
            ``article_count`` and ``replies_received``, sorted by
            ``comment_count`` descending.
        """
        comments_df = self.comments_to_dataframe()

        # Count comments by author.
        commenter_counts = comments_df["author"].value_counts().reset_index()
        commenter_counts.columns = ["author", "comment_count"]

        # Count unique articles each author commented on.
        articles_by_author = comments_df.groupby("author")["article_url"].nunique()
        commenter_counts["article_count"] = commenter_counts["author"].map(articles_by_author)

        # Count replies received: join replies back onto their parent comment
        # and group by the parent's author (author_y after the merge).
        reply_counts = (
            comments_df[comments_df["parent_id"].notna()]
            .merge(
                comments_df[["comment_id", "author"]],
                left_on="parent_id",
                right_on="comment_id",
                how="inner",
            )
            .groupby("author_y")
            .size()
        )

        # fillna(0) would leave the column as float; keep counts integral.
        commenter_counts["replies_received"] = (
            commenter_counts["author"].map(reply_counts).fillna(0).astype(int)
        )

        return commenter_counts.nlargest(limit, "comment_count")

def load_json_directory(directory: str = "data") -> pd.DataFrame:
    """
    Load every JSON file from *directory* into a single pandas DataFrame.

    Args:
        directory: Folder holding the scraped JSON data files.

    Returns:
        DataFrame with one row per article; the comments stay nested
        inside their article records.
    """
    records = DataProcessor(directory).load_all_articles()
    return pd.DataFrame(records)