"""
Document Analyzer Module

This module uses Claude LLM to analyze extracted document text and identify
document types, extract key information, and provide descriptions.
"""

import json
import logging
import os
from datetime import datetime
from typing import Dict, Any

import anthropic
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# Anthropic (Claude) configuration
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-3-5-haiku-latest")

# OpenAI-compatible endpoint configuration (defaults target DeepInfra)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "Qwen/Qwen3-Next-80B-A3B-Instruct")
OPENAI_URL = os.getenv("OPENAI_URL", "https://api.deepinfra.com/v1/openai")

# Which LLM backend to use: "claude" (default) or "open_llm"
llm_model = os.getenv("GENAI_ENGINE", "claude")

# Configure logging
logger = logging.getLogger(__name__)

class DocumentAnalyzer:
    """
    Document analyzer that uses a configurable LLM backend (Claude or an
    OpenAI-compatible model) to analyze document content and extract
    structured information.
    """
    
    def __init__(self, claude_api_key: str):
        """
        Initialize the document analyzer.
        
        Args:
            claude_api_key (str): API key for Claude (used only when
                GENAI_ENGINE is "claude")
        """
        if llm_model == 'claude':
            self.claude_client = anthropic.Anthropic(api_key=claude_api_key)
        elif llm_model == 'open_llm':
            self.openai = OpenAI(
                api_key=OPENAI_API_KEY,
                base_url=OPENAI_URL,
            )
        else:
            raise ValueError(
                f"Unsupported GENAI_ENGINE value: {llm_model!r}; "
                "expected 'claude' or 'open_llm'"
            )
        
        # Document type categories and their subtypes
        self.document_categories = {
            "Legal": [
                "Incorporation_Certificate", "MOA", "AOA", "PAN_Card", "PAN_Certificate",
                "GST_Certificate", "MSME_Certificate", "MSME_Registration", "CIN_Certificate",
                "Director_PAN", "Director_Aadhaar", "Address_Proof", "Lease_Agreement",
                "Partnership_Deed", "License", "Trade_License", "Registration_Certificate"
            ],
            "Financial": [
                "ITR", "Income_Tax_Return", "Balance_Sheet", "Audit_Report", "CA_Certificate",
                "Turnover_Certificate", "Revenue_Certificate", "Financial_Statement",
                "Bank_Statement", "Tax_Compliance_Certificate", "Copy_of_Cheque"
            ],
            "Certificates": [
                "ISO_Certificate", "CMMI_Certificate", "Patent_Certificate",
                "Membership_Certificate", "Quality_Certificate", "Compliance_Certificate",
                "Award_Certificate", "Training_Certificate"
            ],
            "Experience": [
                "Work_Order", "Purchase_Order", "Contract", "Agreement", "Completion_Certificate",
                "Performance_Certificate", "Client_Certificate", "Project_Certificate",
                "Reference_Letter", "Testimonial"
            ],
            "Technical": [
                "Product_Brochure", "Technical_Specification", "Price_List", "Quotation",
                "OEM_Letter", "Authorization_Letter", "Product_Certificate",
                "Technical_Document", "Manual", "Datasheet"
            ],
            "HR": [
                "Employee_Resume", "Employee_Certificate", "Salary_Certificate",
                "Experience_Letter", "Appointment_Letter", "Employee_List", "Employee_Details"
            ],
            "Other": [
                "Letter", "Email", "Correspondence", "Invoice", "Receipt",
                "General_Document", "Miscellaneous"
            ]
        }
    
    def get_analysis_prompt(self) -> str:
        """
        Get the prompt for document analysis.
        
        Returns:
            str: Analysis prompt
        """
        categories_text = "\n".join([
            f"- {category}: {', '.join(subtypes)}"
            for category, subtypes in self.document_categories.items()
        ])
        
        return f"""
You are an expert document analyzer. Analyze the provided document text and extract structured information.

DOCUMENT CATEGORIES AND TYPES:
{categories_text}

Your task is to:
1. Identify the document type using the format "Category:Subtype" (e.g., "Financial:ITR", "Experience:Work_Order")
2. Extract key information relevant to the document type
3. Provide a concise description
4. Note the original filename if mentioned

RESPONSE FORMAT - Return ONLY a valid JSON object with this exact structure:
{{
    "doc_type": "Category:Subtype",
    "key_info": {{
        "key1": "value1",
        "key2": "value2"
    }},
    "description": "Brief description of the document and its contents",
    "confidence": 0.95
}}

KEY INFORMATION EXTRACTION GUIDELINES:
- For Financial documents (ITR, Balance Sheet, etc.): Extract assessment year, amounts, company name, total revenue
- For Experience documents (Work Orders, Contracts, etc.): Extract customer, location, project details, value, dates
- For Legal documents: Extract company name, registration numbers, dates, document number/ID
- For Certificates: Extract issuing authority, validity period, certificate number, scope, purpose of certificate
- For Technical documents: Extract product details, specifications, prices, suppliers, OEM name

IMPORTANT RULES:
- Always respond with valid JSON only
- Use "Other:General_Document" if the document type cannot be clearly determined
- Extract dates in a consistent format when possible
- Include currency symbols and units where applicable
- Set confidence between 0.1 and 1.0 based on how certain you are about the classification
- If key information is not available, use empty strings or null values
- Keep descriptions concise but informative (1-3 sentences)
- If the document is signed, verified, or attested, ignore information about the signing authority or person

DO NOT include any text outside of the JSON structure.
"""
    
    def analyze_document(self, document_text: str, file_path: str) -> Dict[str, Any]:
        """
        Analyze a single document and extract structured information.
        
        Args:
            document_text (str): Extracted text from the document
            file_path (str): Path to the original file
            
        Returns:
            Dict[str, Any]: Analysis results
        """
        try:
            # Prepare the analysis prompt
            prompt = self.get_analysis_prompt()
            
            # Add file information to the text
            filename = os.path.basename(file_path)
            full_text = f"FILENAME: {filename}\n\nDOCUMENT CONTENT:\n{document_text}"
            
            # Truncate very long documents; ~100,000 characters is used as a
            # rough character-based proxy for the model's context window
            if len(full_text) > 100000:
                full_text = full_text[:100000] + "\n[Document truncated due to length]"
            
            if llm_model == 'claude':
                response = self.claude_client.messages.create(
                    model=ANTHROPIC_MODEL,
                    max_tokens=2000,
                    temperature=0.1,
                    system="You are an expert document analyzer. Analyze documents and return structured JSON data about their type, key information, and description.",
                    messages=[
                        {
                            "role": "user",
                            "content": f"{prompt}\n\nANALYZE THIS DOCUMENT:\n{full_text}"
                        }
                    ]
                )
                
                # Parse the JSON response
                response_text = response.content[0].text.strip()
            elif llm_model == 'open_llm':
                response = self.openai.chat.completions.create(
                    model=OPENAI_MODEL,
                    temperature=0.1,  # match the low temperature used on the Claude path
                    messages=[
                        {"role": "system", "content": "You are an expert document analyzer. Analyze documents and return structured JSON data about their type, key information, and description."},
                        {"role": "user", "content": f"{prompt}\n\nANALYZE THIS DOCUMENT:\n{full_text}"},
                    ],
                )
                # message.content can be None in edge cases; guard before stripping
                response_text = (response.choices[0].message.content or "").strip()
            
            # Strip markdown code fences if the model wrapped its JSON output
            if response_text.startswith("```"):
                response_text = response_text.replace("```json", "").replace("```", "").strip()
            
            try:
                analysis_result = json.loads(response_text)
                
                # Add metadata
                analysis_result["file_path"] = file_path
                analysis_result["file_name"] = filename
                analysis_result["analysis_date"] = datetime.now().isoformat()
                
                # Validate required fields
                required_fields = ["doc_type", "key_info", "description"]
                for field in required_fields:
                    if field not in analysis_result:
                        analysis_result[field] = ""
                
                if "confidence" not in analysis_result:
                    analysis_result["confidence"] = 0.5
                
                return analysis_result
                
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON response for {filename}: {e}")
                logger.error(f"Response was: {response_text}")
                
                # Return a fallback result
                return {
                    "doc_type": "Other:General_Document",
                    "key_info": {},
                    "description": f"Document analysis failed - could not parse response for {filename}",
                    "confidence": 0.1,
                    "file_path": file_path,
                    "file_name": filename,
                    "analysis_date": datetime.now().isoformat(),
                    "error": "JSON parsing failed"
                }
                
        except Exception as e:
            logger.error(f"Error analyzing document {file_path}: {e}")
            
            # Return a fallback result
            return {
                "doc_type": "Other:General_Document",
                "key_info": {},
                "description": f"Document analysis failed due to error: {str(e)}",
                "confidence": 0.1,
                "file_path": file_path,
                "file_name": os.path.basename(file_path),
                "analysis_date": datetime.now().isoformat(),
                "error": str(e)
            }
    
    def analyze_documents_batch(self, documents_text: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze multiple documents in batch.
        
        Args:
            documents_text (Dict[str, str]): Dictionary mapping file paths to extracted text
            
        Returns:
            Dict[str, Dict[str, Any]]: Dictionary mapping file paths to analysis results
        """
        analysis_results = {}
        total_docs = len(documents_text)
        
        logger.info(f"Starting analysis of {total_docs} documents")
        
        for i, (file_path, text) in enumerate(documents_text.items(), 1):
            logger.info(f"Analyzing document {i}/{total_docs}: {os.path.basename(file_path)}")
            
            analysis_result = self.analyze_document(text, file_path)
            analysis_results[file_path] = analysis_result
            
            # Log the result
            doc_type = analysis_result.get("doc_type", "Unknown")
            confidence = analysis_result.get("confidence", 0)
            logger.info(f"  -> Classified as: {doc_type} (confidence: {confidence:.2f})")
        
        logger.info(f"Completed analysis of {total_docs} documents")
        return analysis_results
    
    def get_analysis_summary(self, analysis_results: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate a summary of the analysis results.
        
        Args:
            analysis_results (Dict[str, Dict[str, Any]]): Analysis results
            
        Returns:
            Dict[str, Any]: Summary statistics
        """
        if not analysis_results:
            return {"total_documents": 0, "categories": {}, "avg_confidence": 0}
        
        categories = {}
        confidences = []
        
        for result in analysis_results.values():
            doc_type = result.get("doc_type", "Other:Unknown")
            confidence = result.get("confidence", 0)
            
            categories[doc_type] = categories.get(doc_type, 0) + 1
            
            confidences.append(confidence)
        
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0
        
        return {
            "total_documents": len(analysis_results),
            "categories": categories,
            "avg_confidence": avg_confidence,
            "low_confidence_docs": [
                result["file_name"] for result in analysis_results.values()
                if result.get("confidence", 1) < 0.6
            ]
        }


# Factory function
def create_document_analyzer(claude_api_key: str) -> DocumentAnalyzer:
    """
    Factory function to create a DocumentAnalyzer instance.
    
    Args:
        claude_api_key (str): API key for Claude (ignored when GENAI_ENGINE
            selects the open_llm backend)
        
    Returns:
        DocumentAnalyzer: Initialized analyzer instance
    """
    return DocumentAnalyzer(claude_api_key)
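

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's API: it assumes the
    # relevant API key is set in the environment (ANTHROPIC_API_KEY for the
    # default "claude" backend, or OPENAI_API_KEY with GENAI_ENGINE=open_llm)
    # and that document text has already been extracted upstream. The file
    # path and sample text below are hypothetical placeholders.
    logging.basicConfig(level=logging.INFO)

    analyzer = create_document_analyzer(ANTHROPIC_API_KEY)

    sample_docs = {
        "/tmp/sample_itr.txt": "Income Tax Return, Assessment Year 2023-24, "
                               "ABC Pvt Ltd, Total Income: INR 1,20,00,000 ...",
    }

    results = analyzer.analyze_documents_batch(sample_docs)
    summary = analyzer.get_analysis_summary(results)
    print(json.dumps(summary, indent=2))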