"""
Document Processing Service

This service processes documents (PDFs, TXTs) stored in S3:
- Extracts text from documents
- Merges text into a single file
- Chunks text using LLMs for better semantic organization
- Generates summaries using LLMs
- Saves results back to S3
- Provides an API endpoint for processing requests
"""

# Standard library imports
import base64
import io
import json
import os
import pathlib
import sys
import time
from datetime import datetime

# Third-party imports
import anthropic
# import fitz  # PyMuPDF for extracting text from PDF files
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import pdfkit
from docx2pdf import convert
from google import genai
from google.genai import types

# Local imports
from document_extractor import extract_RFP_text_compatible

# ===== Configuration =====

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
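# (punkt / punkt_tab back the sent_tokenize and word_tokenize calls used below)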

# API keys are read from the environment so secrets never live in source control
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Initialize Gemini client
gemini_client = genai.Client(api_key=GEMINI_API_KEY)
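
# Expected environment (illustrative; the variable names match the os.environ lookups above):
#   export GEMINI_API_KEY="..."
#   export ANTHROPIC_API_KEY="..."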
# ===== Utility Functions =====

def get_llm_client(llm_type, api_key):
    """
    Returns an initialized LLM client based on the specified type.

    Args:
        llm_type (str): Type of LLM to use (currently only "claude" is supported)
        api_key (str): API key for authentication

    Returns:
        object: Initialized LLM client

    Raises:
        ValueError: If the API key is missing or the LLM type is unsupported
    """
    if not api_key:
        raise ValueError("API key is required for LLM client initialization")

    if llm_type == "claude":
        return anthropic.Anthropic(api_key=api_key)
    # elif llm_type == "openai":
    #     return OpenAI(api_key=api_key)
    # elif llm_type == "deepseek":
    #     return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
    raise ValueError(f"Unsupported LLM type: {llm_type}")

# ===== Text Processing Functions =====

def count_words(text):
    """
    Count words in text.
    
    Args:
        text (str): Text to count words in
        
    Returns:
        int: Number of words in text
    """
    return len(word_tokenize(text))

def create_primary_chunks(text, chunk_size=2000, overlap=100):
    """
    Split text into overlapping chunks for LLM processing.
    
    Args:
        text (str): Text to split into chunks
        chunk_size (int): Target size of each chunk in words
        overlap (int): Nominal word overlap between chunks (approximated by carrying over the last two sentences of the previous chunk)
        
    Returns:
        list: List of text chunks
    """
    chunks = []
    sentences = sent_tokenize(text)
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence_word_count = count_words(sentence)

        # If adding this sentence exceeds chunk size
        if current_word_count + sentence_word_count > chunk_size:
            # Save current chunk
            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Start new chunk with some overlap
            overlap_sentences = current_chunk[-2:] if len(current_chunk) >= 2 else current_chunk  
            current_chunk = overlap_sentences + [sentence]
            current_word_count = count_words(' '.join(current_chunk))
        else:
            current_chunk.append(sentence)
            current_word_count += sentence_word_count

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
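
# Usage sketch (illustrative; "long_text" is any string and the chunk count depends on its length):
#   primary = create_primary_chunks(long_text, chunk_size=2000)
#   print(f"{len(primary)} primary chunks, ~{count_words(primary[0])} words in the first")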

# ===== LLM Processing Functions =====

def process_chunk_with_llm(llm_model, llm_type, api_key, chunk):
    """
    Send chunk to LLM for processing into smaller, semantically coherent chunks.
    
    Args:
        llm_model (str): Model identifier for the LLM
        llm_type (str): Type of LLM to use (currently only "claude" is supported)
        api_key (str): API key for authentication
        chunk (str): Text chunk to process

    Returns:
        list: List of processed sub-chunks (empty if processing fails)
    """
    try:
        client = get_llm_client(llm_type, api_key)
        
        # Construct the prompt for chunking text
        prompt = [
            {"role": "user", "content": "You have to work as a Data Engineer who converts unstructured data into structured format for making it usable for ML Training."},
            {"role": "assistant", "content": "Sure, I will help you in text data retructuring. Please let me know the exact requirements."},
            {"role": "user", "content": "Great! So you have to analyze a text segment and break it into smaller chunks following these rules: \
             1. Each chunk must be more than 400 words; \
             2. Each chunk must maintain complete semantic meaning; \
             3. Never break in the middle of a sentence or logical thought; \
             4. If the input contains any headers, titles or section names, headings or subheadings:\
               - Identify such contextual content\
               - Prepend these relevant headers or titles or section name to each chunk to maintain hierarchical context; \
             5. Ensure proper handling of:\
               - Lists (keep items together when contextually related)\
               - Tables (keep with their captions and context)\
               - Quotes (preserve complete quotes in single chunks when possible)\
               - Citations (keep with their referenced text); \
             6. Please delimit different chunks with this delimiter: '============break============'.\
             Only create the chunks of the text and use the original text. DO NOT make up any text or content on your own. \
             \
             And please do not add any explanations in the output.\
             \nHere is the text to process:\n" + chunk},
        ]

        # Process with the appropriate LLM
        if llm_type == "claude":
            message = client.messages.create(
                model=llm_model,
                max_tokens=4000,
                messages=prompt
            )
            sub_chunks = message.content[0].text.split('============break============') 
        else:  # OpenAI or Deepseek
            response = client.chat.completions.create(
                model=llm_model,
                max_tokens=4096,
                messages=prompt,
                stream=False
            )
            sub_chunks = response.choices[0].message.content.split('============break============')
            
        # Clean up the sub-chunks and drop empty entries
        return [sub.strip() for sub in sub_chunks if sub.strip()]
            
    except Exception as error:
        print(f"Error processing chunk with LLM: {error}")
        sys.stdout.flush()
        return []
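
# Usage sketch (assumes ANTHROPIC_API_KEY is set and the chunk fits the model's context window):
#   sub_chunks = process_chunk_with_llm("claude-3-7-sonnet-latest", "claude", ANTHROPIC_API_KEY, chunk)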

# ===== Document Processing Functions =====

def extract_RFP_text(input_dir, output_dir):
    """
    Process all PDF, Excel, and Doc files from a directory and create a merged text file.

    Args:
        input_dir (str): Directory path for input files
        output_dir (str): Directory path for output files

    Returns:
        tuple: (success (bool), message (str)) from the underlying extractor
    """
    # Use the unified document extractor
    success, message = extract_RFP_text_compatible(input_dir, output_dir, ANTHROPIC_API_KEY)
    
    if success:
        print(message)
    else:
        print(f"Error: {message}")
    
    return success, message

def process_text_file(output_dir, api_key, llm_type, llm_model):
    """
    Process the merged text file into chunks using an LLM.

    Args:
        output_dir (str): Directory containing merged.txt; the chunked output is written here
        api_key (str): API key for LLM
        llm_type (str): Type of LLM to use
        llm_model (str): Model identifier for the LLM
        
    Returns:
        bool: True if successful, False otherwise
    """
    # Read input file
    file_path = os.path.join(output_dir, "merged.txt")
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return False

    if not text:
        print("Merged text file is empty; nothing to chunk")
        sys.stdout.flush()
        return False
    # elif (count_words(text) < 4500):
    #     all_tenders_map[bid_id_of_this_tender][chunks_required] = False
    #     print(f"Chunks not required for smaller RFP documents")
    #     return True

    print("Creating primary chunks...")
    sys.stdout.flush()
    primary_chunks = create_primary_chunks(text)

    # Process each chunk
    all_processed_chunks = []
    for i, chunk in enumerate(primary_chunks):
        print(f"\nProcessing chunk {i+1} of {len(primary_chunks)}...")
        sys.stdout.flush()
        
        # Process with LLM
        processed_chunks = process_chunk_with_llm(llm_model, llm_type, api_key, chunk)
        
        # Add processed chunks to collection
        all_processed_chunks.extend(processed_chunks)

        # Add small delay between API calls
        time.sleep(1)
    
    # Filter out any empty chunks
    all_processed_chunks = [chunk for chunk in all_processed_chunks if chunk]


    # # Create DataFrame with required structure
    # df = pd.DataFrame({
    #     'Tag': [f'tag{i}' for i in range(len(all_processed_chunks))],
    #     'question': all_processed_chunks,
    #     'answer': all_processed_chunks  # Duplicate the text for both columns
    # })

    # output_excel_file = os.path.join(output_dir, "chunks.xlsx")

    # with pd.ExcelWriter(output_excel_file) as writer:
    #     df.to_excel(writer, sheet_name='input', index=False)

    # Create list of dictionaries with required structure
    json_data = [
        {
            "tagName": f"tag{i+1}",
            "question": [chunk_for_json],  # Note: wrapped in list as per sample
            "answer": chunk_for_json,
            "question_neg": []
        }
        for i, chunk_for_json in enumerate(all_processed_chunks)
    ]

    # Define output path
    output_json_file = os.path.join(output_dir, "chunks.json")

    # Write to JSON file with proper formatting
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)

    print(f"Successfully saved BID Chunks Json to: {output_json_file}")
    sys.stdout.flush()
    return True
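
# Usage sketch (assumes output_dir already contains merged.txt produced by extract_RFP_text):
#   process_text_file("/path/to/bid/tender_analysis", ANTHROPIC_API_KEY, "claude", "claude-3-7-sonnet-latest")
#
# The resulting chunks.json is a list of objects shaped like:
#   {"tagName": "tag1", "question": ["<chunk text>"], "answer": "<chunk text>", "question_neg": []}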


def process_task(bid_dir):
    """
    Process a document processing task for a single bid directory.

    Args:
        bid_dir (str): Directory containing the bid's source documents

    Returns:
        bool: True if extraction and chunking both succeed, False otherwise
    """
    # Create the output directory if it doesn't exist
    output_dir = os.path.join(bid_dir, "tender_analysis")
    os.makedirs(output_dir, exist_ok=True)

    # Process documents
    success, err_msg = extract_RFP_text(bid_dir, output_dir)

    genai_engine = "claude"
    genai_version = "claude-3-7-sonnet-latest"

    if success:
        return process_text_file(output_dir, ANTHROPIC_API_KEY, genai_engine, genai_version)
    else:
        print(err_msg)
        return False
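

# Minimal command-line entry point (a sketch; the module docstring describes an API/queue-driven
# service, so the real invocation likely differs. This simply runs process_task on one bid
# directory for local testing.)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Process one bid directory into chunks.json")
    parser.add_argument("bid_dir", help="Directory containing the bid's source documents")
    args = parser.parse_args()

    ok = process_task(args.bid_dir)
    sys.exit(0 if ok else 1)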