# NORM/FHIR_OCR_POC/ocr_module/ocr_processor.py

import logging
import os
import re
from typing import Dict, Any, List, Optional, Tuple

import cv2
import pytesseract
from PIL import Image

class OCRProcessor:
    """
    OCR processor class for healthcare documents using Tesseract.

    The processor binarises an image with OpenCV, runs Tesseract on it via
    pytesseract, and applies simple keyword heuristics to pull patient
    demographics and a document type out of the recognised text.
    """

    # Ordered (patient field, label pattern) pairs. For each line the first
    # pattern that matches decides which field is populated, mirroring an
    # if/elif chain. Short keywords are anchored with ``\b`` so they do not
    # fire inside longer words ("Paid:" must not count as "id:", "Inpatient:"
    # must not count as "patient:"). ``name`` is deliberately left unanchored
    # so compound labels such as "Surname:" keep matching. ``\s*`` tolerates
    # the stray space OCR often inserts before a colon.
    _PATIENT_FIELD_PATTERNS = (
        ("name", re.compile(r"(?:name|\bpatient)\s*:", re.IGNORECASE)),
        ("dob", re.compile(r"\b(?:dob|born)\s*:|birth", re.IGNORECASE)),
        ("id", re.compile(r"\b(?:id|mrn)\s*:|record", re.IGNORECASE)),
        ("gender", re.compile(r"\b(?:gender|sex)\s*:", re.IGNORECASE)),
    )

    # Whole-word matchers for document-type detection (applied to lowercased
    # text). Plain substring tests would find "lab" in "available" and
    # "card" in "cardiology".
    _LAB_PATTERN = re.compile(r"\blab(?:s|oratory|oratories)?\b")
    _CARD_PATTERN = re.compile(r"\bcards?\b")

    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Initialize the OCR processor.

        Args:
            tesseract_cmd: Optional path to Tesseract executable

        Raises:
            RuntimeError: If the Tesseract version cannot be queried, which
                is treated as Tesseract not being properly installed.
        """
        if tesseract_cmd:
            # NOTE: this sets a pytesseract module-level attribute, so it
            # applies to every pytesseract call in the process.
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # Set up logging
        self.logger = logging.getLogger(__name__)

        # Verify Tesseract installation
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            self.logger.error("Failed to initialize Tesseract OCR: %s", e)
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(
                f"Tesseract OCR not properly installed: {str(e)}"
            ) from e
        self.logger.info("Tesseract OCR initialized successfully")

    def preprocess_image(self, image):
        """
        Preprocess image to improve OCR accuracy.

        The image is reduced to a single grayscale channel and binarised
        with Otsu's method (dark pixels become 0, bright pixels 255).

        Args:
            image: OpenCV image object (array). BGR, BGRA, single-channel
                and already-grayscale 2-D inputs are accepted.

        Returns:
            Preprocessed (binarised, single-channel) image
        """
        # Convert to grayscale, skipping the conversion when the input is
        # already single-channel (cvtColor with BGR2GRAY rejects such input).
        if image.ndim == 2:
            gray = image
        elif image.shape[2] == 1:
            gray = image[:, :, 0]
        elif image.shape[2] == 4:
            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Otsu picks the threshold itself, so the explicit threshold
        # argument is ignored and 0 is passed by convention. THRESH_BINARY
        # yields the same result as THRESH_BINARY_INV followed by a manual
        # inversion.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )

        return binary

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image file and extract text using OCR.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary containing extracted text and metadata, with keys
            ``raw_text``, ``structured_data``, ``confidence`` and
            ``metadata`` (``source_file``, ``ocr_engine``, ``ocr_version``).

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            RuntimeError: If the image cannot be read or any OCR step fails;
                the original exception is attached as ``__cause__``.
        """
        if not os.path.exists(image_path):
            self.logger.error("Image file not found: %s", image_path)
            raise FileNotFoundError(f"Image file not found: {image_path}")

        try:
            # Read image using OpenCV; imread signals failure by returning
            # None rather than raising.
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Failed to read image: {image_path}")

            # Preprocess image
            preprocessed = self.preprocess_image(img)

            # Apply OCR
            raw_text = pytesseract.image_to_string(preprocessed)

            # Get detailed OCR data including confidence levels
            ocr_data = pytesseract.image_to_data(
                preprocessed, output_type=pytesseract.Output.DICT
            )

            # Extract structured data
            structured_data = self.extract_healthcare_data(raw_text)

            result = {
                "raw_text": raw_text,
                "structured_data": structured_data,
                "confidence": self._calculate_avg_confidence(ocr_data),
                "metadata": {
                    "source_file": image_path,
                    "ocr_engine": "Tesseract",
                    # Stored as a string so the result can be serialised
                    # (e.g. to JSON) without a custom encoder.
                    "ocr_version": str(pytesseract.get_tesseract_version()),
                },
            }

            self.logger.info("Successfully processed image: %s", image_path)
            return result

        except Exception as e:
            self.logger.error("OCR processing error: %s", e)
            raise RuntimeError(
                f"Failed to process image with OCR: {str(e)}"
            ) from e

    def extract_healthcare_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured healthcare data from OCR text.

        This is a simplified, keyword-based implementation that would need
        more sophisticated extraction logic for a real-world application.
        Each non-empty line is checked against the patient-field label
        patterns; when several lines match the same field, the last one wins.

        Args:
            text: Raw OCR text

        Returns:
            Dictionary containing extracted healthcare data fields:
            ``patient`` (``name``, ``dob``, ``id``, ``gender``; ``None`` when
            not found), ``document_type`` and ``extracted_fields`` (always
            empty in this implementation).
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        data = {
            "patient": {
                "name": None,
                "dob": None,
                "id": None,
                "gender": None
            },
            "document_type": self._detect_document_type(text),
            "extracted_fields": {}
        }

        # Simple extraction based on keywords (would need enhancement for production)
        for line in lines:
            for field, pattern in self._PATIENT_FIELD_PATTERNS:
                if pattern.search(line):
                    data["patient"][field] = self._extract_after_colon(line)
                    # Only the first matching field claims the line.
                    break

        return data

    def _detect_document_type(self, text: str) -> str:
        """
        Attempt to detect the type of healthcare document.

        Rules are checked in a fixed order and the first match wins:
        insurance card, prescription, lab result, discharge summary.

        Args:
            text: Raw OCR text

        Returns:
            Document type string: ``"insurance_card"``, ``"prescription"``,
            ``"lab_result"``, ``"discharge_summary"`` or ``"unknown"``.
        """
        text_lower = text.lower()

        if "insurance" in text_lower and (
            self._CARD_PATTERN.search(text_lower) or "policy" in text_lower
        ):
            return "insurance_card"
        if "prescription" in text_lower:
            return "prescription"
        if self._LAB_PATTERN.search(text_lower) and (
            "result" in text_lower or "report" in text_lower
        ):
            return "lab_result"
        if "discharge" in text_lower and "summary" in text_lower:
            return "discharge_summary"
        return "unknown"

    def _extract_after_colon(self, text: str) -> str:
        """
        Extract the content after the first colon in a string.

        Args:
            text: Line of text, typically of the form ``"Label: value"``

        Returns:
            The stripped text after the first colon, or the whole stripped
            string when it contains no colon.
        """
        if ":" in text:
            return text.split(":", 1)[1].strip()
        return text.strip()

    def _calculate_avg_confidence(self, ocr_data: Dict) -> float:
        """
        Calculate average confidence score from OCR data.

        Confidence values may arrive as ints, floats or numeric strings, so
        each is coerced with ``float``. Negative values (the ``-1`` sentinel
        this code filters out) and entries that cannot be parsed as numbers
        are ignored.

        Args:
            ocr_data: Dictionary containing OCR data from pytesseract

        Returns:
            Average confidence score as a percentage, or 0.0 when no usable
            confidence values are present.
        """
        scores = []
        for raw in ocr_data.get('conf', []):
            try:
                score = float(raw)
            except (TypeError, ValueError):
                continue
            if score >= 0:
                scores.append(score)

        if not scores:
            return 0.0
        return sum(scores) / len(scores)