import os
import logging
from typing import Dict, Any, List, Optional, Tuple
import cv2
import pytesseract
from PIL import Image

class OCRProcessor:
    """
    OCR processor class for healthcare documents using Tesseract.

    Typical use is ``OCRProcessor().process_image(path)``, which reads the
    image with OpenCV, binarizes it, runs Tesseract through ``pytesseract``
    and applies simple keyword heuristics to pull out patient fields.
    """

    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Initialize the OCR processor.

        Args:
            tesseract_cmd: Optional path to Tesseract executable. When given,
                it is assigned to ``pytesseract.pytesseract.tesseract_cmd``,
                which is a module-level setting shared by all instances.

        Raises:
            RuntimeError: If the Tesseract version query fails.
        """
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # Set up logging
        self.logger = logging.getLogger(__name__)

        # Verify Tesseract installation by asking it for its version
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            self.logger.error("Failed to initialize Tesseract OCR: %s", e)
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(
                f"Tesseract OCR not properly installed: {str(e)}"
            ) from e
        self.logger.info("Tesseract OCR initialized successfully")

    def preprocess_image(self, image):
        """
        Preprocess image to improve OCR accuracy.

        The image is converted to grayscale (unless it already is a 2-D
        single-channel array) and binarized with Otsu's method.

        Args:
            image: OpenCV image object (BGR colour or 2-D grayscale array)

        Returns:
            Preprocessed black-and-white image
        """
        # Convert to grayscale; a 2-D array is already single-channel and
        # would make cvtColor(BGR2GRAY) fail, so it is passed through as is.
        if getattr(image, "ndim", 3) == 2:
            gray = image
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Apply threshold to get black and white image. Because THRESH_OTSU
        # is set, OpenCV computes the threshold itself and the literal 150
        # is not used.
        _, binary = cv2.threshold(
            gray, 150, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )

        # Invert back
        binary = 255 - binary

        return binary

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image file and extract text using OCR.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with the keys ``raw_text``, ``structured_data``,
            ``confidence`` and ``metadata`` (source file, engine name and
            the value returned by ``pytesseract.get_tesseract_version()``).

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            RuntimeError: If reading, preprocessing or OCR fails; the
                original exception is attached as ``__cause__``.
        """
        if not os.path.exists(image_path):
            self.logger.error("Image file not found: %s", image_path)
            raise FileNotFoundError(f"Image file not found: {image_path}")

        try:
            # Read image using OpenCV; imread signals failure by returning
            # None rather than raising.
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Failed to read image: {image_path}")

            # Preprocess image
            preprocessed = self.preprocess_image(img)

            # Apply OCR
            raw_text = pytesseract.image_to_string(preprocessed)

            # Get detailed OCR data including confidence levels
            ocr_data = pytesseract.image_to_data(
                preprocessed, output_type=pytesseract.Output.DICT
            )

            # Extract structured data
            structured_data = self.extract_healthcare_data(raw_text)

            result = {
                "raw_text": raw_text,
                "structured_data": structured_data,
                "confidence": self._calculate_avg_confidence(ocr_data),
                "metadata": {
                    "source_file": image_path,
                    "ocr_engine": "Tesseract",
                    "ocr_version": pytesseract.get_tesseract_version(),
                },
            }

            self.logger.info("Successfully processed image: %s", image_path)
            return result

        except Exception as e:
            self.logger.error("OCR processing error: %s", e)
            # Callers see a single RuntimeError type; keep the cause chained.
            raise RuntimeError(
                f"Failed to process image with OCR: {str(e)}"
            ) from e

    def extract_healthcare_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured healthcare data from OCR text.

        Each non-empty line is matched case-insensitively against a small
        set of keywords; the first matching category (name, dob, id, gender,
        in that order) receives the text after the line's first colon. When
        several lines match the same category, the last one wins.

        Args:
            text: Raw OCR text

        Returns:
            Dictionary containing extracted healthcare data fields:
            ``patient`` (name, dob, id, gender; ``None`` when not found),
            ``document_type`` and an (currently always empty)
            ``extracted_fields`` dict.
        """
        # This is a simplified implementation that would need to be enhanced
        # with more sophisticated extraction logic for a real-world application

        lines = [line.strip() for line in text.split('\n') if line.strip()]
        data = {
            "patient": {
                "name": None,
                "dob": None,
                "id": None,
                "gender": None,
            },
            "document_type": self._detect_document_type(text),
            "extracted_fields": {},
        }
        patient = data["patient"]

        # Simple extraction based on keywords (would need enhancement for
        # production). These are plain substring tests, so e.g. "paid:" also
        # contains "id:".
        for line in lines:
            lowered = line.lower()
            if "name:" in lowered or "patient:" in lowered:
                patient["name"] = self._extract_after_colon(line)
            elif "dob:" in lowered or "birth" in lowered or "born:" in lowered:
                patient["dob"] = self._extract_after_colon(line)
            elif "id:" in lowered or "mrn:" in lowered or "record" in lowered:
                patient["id"] = self._extract_after_colon(line)
            elif "gender:" in lowered or "sex:" in lowered:
                patient["gender"] = self._extract_after_colon(line)

        return data

    def _detect_document_type(self, text: str) -> str:
        """
        Attempt to detect the type of healthcare document.

        Detection is a case-insensitive substring check; the first matching
        rule wins.

        Args:
            text: Raw OCR text

        Returns:
            Document type string: ``insurance_card``, ``prescription``,
            ``lab_result``, ``discharge_summary`` or ``unknown``
        """
        text_lower = text.lower()

        if "insurance" in text_lower and ("card" in text_lower or "policy" in text_lower):
            return "insurance_card"
        if "prescription" in text_lower:
            return "prescription"
        if "lab" in text_lower and ("result" in text_lower or "report" in text_lower):
            return "lab_result"
        if "discharge" in text_lower and "summary" in text_lower:
            return "discharge_summary"
        return "unknown"

    def _extract_after_colon(self, text: str) -> str:
        """
        Extract the content after the first colon in a string.

        Returns the whole stripped string when it contains no colon.
        """
        if ":" in text:
            return text.split(":", 1)[1].strip()
        return text.strip()

    def _calculate_avg_confidence(self, ocr_data: Dict) -> float:
        """
        Calculate average confidence score from OCR data.

        Entries of ``ocr_data['conf']`` may be ints, floats or numeric
        strings; each is coerced to ``float``. Negative values (the ``-1``
        placeholder) and entries that cannot be parsed as numbers are
        ignored.

        Args:
            ocr_data: Dictionary containing OCR data from pytesseract

        Returns:
            Average confidence score as a percentage, or 0.0 when there is
            no usable confidence value
        """
        confidences = []
        for raw_conf in ocr_data.get('conf', []):
            try:
                value = float(raw_conf)
            except (TypeError, ValueError):
                # Non-numeric cell: nothing meaningful to average.
                continue
            if value >= 0:
                confidences.append(value)

        if not confidences:
            return 0.0
        return sum(confidences) / len(confidences)