import os import logging from typing import Dict, Any, List, Optional, Tuple import cv2 import pytesseract from PIL import Image class OCRProcessor: """ OCR processor class for healthcare documents using Tesseract. """ def __init__(self, tesseract_cmd: Optional[str] = None): """ Initialize the OCR processor. Args: tesseract_cmd: Optional path to Tesseract executable """ if tesseract_cmd: pytesseract.pytesseract.tesseract_cmd = tesseract_cmd # Set up logging self.logger = logging.getLogger(__name__) # Verify Tesseract installation try: pytesseract.get_tesseract_version() self.logger.info("Tesseract OCR initialized successfully") except Exception as e: self.logger.error(f"Failed to initialize Tesseract OCR: {str(e)}") raise RuntimeError(f"Tesseract OCR not properly installed: {str(e)}") def preprocess_image(self, image): """ Preprocess image to improve OCR accuracy. Args: image: OpenCV image object Returns: Preprocessed image """ # Convert to grayscale gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # Apply threshold to get black and white image _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Invert back binary = 255 - binary return binary def process_image(self, image_path: str) -> Dict[str, Any]: """ Process an image file and extract text using OCR. Args: image_path: Path to the image file Returns: Dictionary containing extracted text and metadata """ if not os.path.exists(image_path): self.logger.error(f"Image file not found: {image_path}") raise FileNotFoundError(f"Image file not found: {image_path}") try: # Read image using OpenCV img = cv2.imread(image_path) if img is None: raise ValueError(f"Failed to read image: {image_path}") # Preprocess image preprocessed = self.preprocess_image(img) # Apply OCR raw_text = pytesseract.image_to_string(preprocessed) # Get detailed OCR data including confidence levels ocr_data = pytesseract.image_to_data(preprocessed, output_type=pytesseract.Output.DICT) # Extract structured data structured_data = self.extract_healthcare_data(raw_text) result = { "raw_text": raw_text, "structured_data": structured_data, "confidence": self._calculate_avg_confidence(ocr_data), "metadata": { "source_file": image_path, "ocr_engine": "Tesseract", "ocr_version": pytesseract.get_tesseract_version() } } self.logger.info(f"Successfully processed image: {image_path}") return result except Exception as e: self.logger.error(f"OCR processing error: {str(e)}") raise RuntimeError(f"Failed to process image with OCR: {str(e)}") def extract_healthcare_data(self, text: str) -> Dict[str, Any]: """ Extract structured healthcare data from OCR text. Args: text: Raw OCR text Returns: Dictionary containing extracted healthcare data fields """ # This is a simplified implementation that would need to be enhanced # with more sophisticated extraction logic for a real-world application lines = [line.strip() for line in text.split('\n') if line.strip()] data = { "patient": { "name": None, "dob": None, "id": None, "gender": None }, "document_type": self._detect_document_type(text), "extracted_fields": {} } # Simple extraction based on keywords (would need enhancement for production) for line in lines: if "name:" in line.lower() or "patient:" in line.lower(): data["patient"]["name"] = self._extract_after_colon(line) elif "dob:" in line.lower() or "birth" in line.lower() or "born:" in line.lower(): data["patient"]["dob"] = self._extract_after_colon(line) elif "id:" in line.lower() or "mrn:" in line.lower() or "record" in line.lower(): data["patient"]["id"] = self._extract_after_colon(line) elif "gender:" in line.lower() or "sex:" in line.lower(): data["patient"]["gender"] = self._extract_after_colon(line) return data def _detect_document_type(self, text: str) -> str: """ Attempt to detect the type of healthcare document. Args: text: Raw OCR text Returns: Document type string """ text_lower = text.lower() if "insurance" in text_lower and ("card" in text_lower or "policy" in text_lower): return "insurance_card" elif "prescription" in text_lower: return "prescription" elif "lab" in text_lower and ("result" in text_lower or "report" in text_lower): return "lab_result" elif "discharge" in text_lower and "summary" in text_lower: return "discharge_summary" else: return "unknown" def _extract_after_colon(self, text: str) -> str: """Extract the content after a colon in a string.""" if ":" in text: return text.split(":", 1)[1].strip() return text.strip() def _calculate_avg_confidence(self, ocr_data: Dict) -> float: """ Calculate average confidence score from OCR data. Args: ocr_data: Dictionary containing OCR data from pytesseract Returns: Average confidence score as a percentage """ confidences = [conf for conf in ocr_data.get('conf', []) if conf != -1] if not confidences: return 0.0 return sum(confidences) / len(confidences)