You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
NORM/FHIR_OCR_POC/ocr_module/ocr_processor.py

185 lines
6.5 KiB
Python

import os
import logging
from typing import Dict, Any, List, Optional, Tuple
import cv2
import pytesseract
from PIL import Image
class OCRProcessor:
    """
    OCR processor class for healthcare documents using Tesseract.

    Wraps ``pytesseract`` and OpenCV: an image is read from disk, binarized,
    passed through Tesseract, and the resulting text is scanned for a few
    patient-related fields using simple keyword heuristics.
    """

    # Ordered (patient field, trigger substrings) pairs. For every line the
    # first field whose keyword occurs in the lower-cased line wins, so the
    # order here defines the matching priority.
    _PATIENT_FIELD_KEYWORDS = (
        ("name", ("name:", "patient:")),
        ("dob", ("dob:", "birth", "born:")),
        ("id", ("id:", "mrn:", "record")),
        ("gender", ("gender:", "sex:")),
    )

    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Initialize the OCR processor.

        Args:
            tesseract_cmd: Optional path to Tesseract executable. When given,
                it is written to the module-level ``pytesseract`` setting, so
                it affects every user of ``pytesseract`` in the process.

        Raises:
            RuntimeError: If the Tesseract version query fails.
        """
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # Set up logging
        self.logger = logging.getLogger(__name__)

        # Verify Tesseract installation by asking the binary for its version.
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            self.logger.error("Failed to initialize Tesseract OCR: %s", e)
            # Chain the cause so the original failure stays in the traceback.
            raise RuntimeError(
                f"Tesseract OCR not properly installed: {str(e)}"
            ) from e
        self.logger.info("Tesseract OCR initialized successfully")

    def preprocess_image(self, image):
        """
        Preprocess image to improve OCR accuracy.

        The image is reduced to a single gray channel and binarized with
        Otsu's method.

        Args:
            image: OpenCV image object (2-D grayscale, 3-channel BGR or
                4-channel BGRA array).

        Returns:
            Preprocessed (binarized) image
        """
        # Convert to grayscale; tolerate input that is already single-channel
        # or that carries an alpha channel instead of failing in cvtColor.
        if image.ndim == 2:
            gray = image
        elif image.shape[2] == 4:
            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Otsu picks the threshold itself, so the threshold argument is
        # ignored. THRESH_BINARY here is equivalent to thresholding with
        # THRESH_BINARY_INV and inverting the result afterwards.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return binary

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image file and extract text using OCR.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary containing extracted text and metadata, with the keys
            ``raw_text``, ``structured_data``, ``confidence`` and ``metadata``.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            RuntimeError: If reading, preprocessing or OCR fails; the
                original exception is attached as ``__cause__``.
        """
        if not os.path.exists(image_path):
            self.logger.error("Image file not found: %s", image_path)
            raise FileNotFoundError(f"Image file not found: {image_path}")

        try:
            # Read image using OpenCV; imread signals failure by returning
            # None rather than raising.
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Failed to read image: {image_path}")

            # Preprocess image
            preprocessed = self.preprocess_image(img)

            # Apply OCR
            raw_text = pytesseract.image_to_string(preprocessed)

            # Get detailed OCR data including confidence levels
            ocr_data = pytesseract.image_to_data(
                preprocessed, output_type=pytesseract.Output.DICT
            )

            # Extract structured data
            structured_data = self.extract_healthcare_data(raw_text)

            result = {
                "raw_text": raw_text,
                "structured_data": structured_data,
                "confidence": self._calculate_avg_confidence(ocr_data),
                "metadata": {
                    "source_file": image_path,
                    "ocr_engine": "Tesseract",
                    # NOTE(review): stored exactly as pytesseract returns it;
                    # callers that serialize the result may need str() here
                    # -- confirm against consumers.
                    "ocr_version": pytesseract.get_tesseract_version(),
                },
            }
            self.logger.info("Successfully processed image: %s", image_path)
            return result
        except Exception as e:
            self.logger.error("OCR processing error: %s", e)
            # Keep the RuntimeError contract but preserve the root cause.
            raise RuntimeError(
                f"Failed to process image with OCR: {str(e)}"
            ) from e

    def extract_healthcare_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured healthcare data from OCR text.

        Each non-blank line is lower-cased and checked against the keyword
        table; on a hit, the text after the first colon (or the whole line
        when there is no colon) is stored in the matching patient field.
        When several lines match the same field, the last one wins.

        Args:
            text: Raw OCR text

        Returns:
            Dictionary containing extracted healthcare data fields
        """
        # This is a simplified implementation that would need to be enhanced
        # with more sophisticated extraction logic for a real-world application
        patient = {
            "name": None,
            "dob": None,
            "id": None,
            "gender": None,
        }
        data = {
            "patient": patient,
            "document_type": self._detect_document_type(text),
            "extracted_fields": {},
        }

        # Simple extraction based on keywords (would need enhancement for
        # production)
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            lowered = line.lower()  # computed once per line
            for field, keywords in self._PATIENT_FIELD_KEYWORDS:
                if any(keyword in lowered for keyword in keywords):
                    patient[field] = self._extract_after_colon(line)
                    break  # at most one field per line, as in an if/elif chain
        return data

    def _detect_document_type(self, text: str) -> str:
        """
        Attempt to detect the type of healthcare document.

        Detection is a case-insensitive substring test; rules are tried in
        order and the first match is returned.

        Args:
            text: Raw OCR text

        Returns:
            Document type string: ``"insurance_card"``, ``"prescription"``,
            ``"lab_result"``, ``"discharge_summary"`` or ``"unknown"``.
        """
        text_lower = text.lower()
        if "insurance" in text_lower and (
            "card" in text_lower or "policy" in text_lower
        ):
            return "insurance_card"
        if "prescription" in text_lower:
            return "prescription"
        if "lab" in text_lower and (
            "result" in text_lower or "report" in text_lower
        ):
            return "lab_result"
        if "discharge" in text_lower and "summary" in text_lower:
            return "discharge_summary"
        return "unknown"

    def _extract_after_colon(self, text: str) -> str:
        """Extract the content after the first colon in a string.

        Returns the whole string, stripped, when it contains no colon.
        """
        if ":" in text:
            return text.split(":", 1)[1].strip()
        return text.strip()

    def _calculate_avg_confidence(self, ocr_data: Dict) -> float:
        """
        Calculate average confidence score from OCR data.

        Entries of ``ocr_data['conf']`` may be ints, floats or numeric
        strings; each is coerced to ``float``. Entries that cannot be parsed
        and negative entries (``-1`` marks boxes without recognized text)
        are left out of the average.

        Args:
            ocr_data: Dictionary containing OCR data from pytesseract

        Returns:
            Average confidence score as a percentage, or ``0.0`` when no
            usable confidence value is present.
        """
        scores = []
        for raw in ocr_data.get('conf', []):
            try:
                score = float(raw)
            except (TypeError, ValueError):
                continue  # non-numeric entry: ignore instead of crashing
            if score >= 0:
                scores.append(score)
        if not scores:
            return 0.0
        return sum(scores) / len(scores)