You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
185 lines
6.5 KiB
Python
185 lines
6.5 KiB
Python
import logging
import os
import re
from typing import Dict, Any, List, Optional, Tuple

import cv2
import pytesseract
from PIL import Image
|
|
|
|
class OCRProcessor:
    """
    OCR processor class for healthcare documents using Tesseract.

    Reads an image with OpenCV, binarizes it, runs Tesseract through
    ``pytesseract`` and applies simple keyword heuristics to pull patient
    fields and a document type out of the recognized text.
    """

    # Keyword patterns used by extract_healthcare_data, tried in this order
    # for every line (first match wins for that line).  The look-behind keeps
    # a keyword from matching in the middle of a longer word, e.g. "id:"
    # inside "Paid:".  Patterns are applied to lower-cased text.
    _PATIENT_FIELD_PATTERNS = (
        ("name", re.compile(r"(?<![a-z])(?:name|patient)\s*:")),
        ("dob", re.compile(r"(?<![a-z])(?:dob|born)\s*:|birth")),
        ("id", re.compile(r"(?<![a-z])(?:id|mrn)\s*:|record")),
        ("gender", re.compile(r"(?<![a-z])(?:gender|sex)\s*:")),
    )

    # Whole-word matchers for the two document-type keywords that are short
    # enough to appear inside unrelated words ("cardiology", "available").
    _CARD_WORD = re.compile(r"\bcards?\b")
    _LAB_WORD = re.compile(r"\blab(?:s|oratory|oratories)?\b")

    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Initialize the OCR processor.

        Args:
            tesseract_cmd: Optional path to Tesseract executable. When given
                it is assigned to ``pytesseract.pytesseract.tesseract_cmd``,
                which is a module-level setting.

        Raises:
            RuntimeError: If querying the Tesseract version fails.
        """
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # Set up logging
        self.logger = logging.getLogger(__name__)

        # Verify Tesseract installation
        try:
            pytesseract.get_tesseract_version()
        except Exception as exc:
            self.logger.error("Failed to initialize Tesseract OCR: %s", exc)
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(
                f"Tesseract OCR not properly installed: {str(exc)}"
            ) from exc
        self.logger.info("Tesseract OCR initialized successfully")

    def preprocess_image(self, image):
        """
        Preprocess image to improve OCR accuracy.

        Converts the image to grayscale (unless it is already a 2-D,
        single-channel array) and binarizes it with Otsu's threshold.

        Args:
            image: OpenCV image object

        Returns:
            Preprocessed (binary) image
        """
        # A 2-D array has no channel axis; cvtColor(BGR2GRAY) would reject it.
        if len(image.shape) == 2:
            gray = image
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # With THRESH_OTSU the threshold argument is ignored and computed
        # from the image.  THRESH_BINARY here yields the same result as the
        # previous THRESH_BINARY_INV followed by "255 - binary".
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return binary

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image file and extract text using OCR.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with the keys ``raw_text``, ``structured_data``,
            ``confidence`` and ``metadata``.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            RuntimeError: If reading, preprocessing or OCR fails; the
                underlying exception is attached as ``__cause__``.
        """
        if not os.path.exists(image_path):
            self.logger.error("Image file not found: %s", image_path)
            raise FileNotFoundError(f"Image file not found: {image_path}")

        try:
            # Read image using OpenCV; imread returns None instead of raising
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Failed to read image: {image_path}")

            preprocessed = self.preprocess_image(img)

            # Plain text for field extraction
            raw_text = pytesseract.image_to_string(preprocessed)

            # Detailed OCR data including per-box confidence levels
            ocr_data = pytesseract.image_to_data(
                preprocessed, output_type=pytesseract.Output.DICT
            )

            result = {
                "raw_text": raw_text,
                "structured_data": self.extract_healthcare_data(raw_text),
                "confidence": self._calculate_avg_confidence(ocr_data),
                "metadata": {
                    "source_file": image_path,
                    "ocr_engine": "Tesseract",
                    # NOTE(review): stored exactly as pytesseract returns it;
                    # confirm callers that serialize the result can handle
                    # a non-string version object.
                    "ocr_version": pytesseract.get_tesseract_version(),
                },
            }
        except Exception as exc:
            self.logger.error("OCR processing error: %s", exc)
            raise RuntimeError(
                f"Failed to process image with OCR: {str(exc)}"
            ) from exc

        self.logger.info("Successfully processed image: %s", image_path)
        return result

    def extract_healthcare_data(self, text: str) -> Dict[str, Any]:
        """
        Extract structured healthcare data from OCR text.

        This is a simplified keyword-based implementation that would need
        more sophisticated extraction logic for a real-world application.
        Each non-empty line is checked against the name, date-of-birth, id
        and gender keywords (in that order); the text after the first colon
        becomes the field value.  When several lines match the same field
        the last non-empty value is kept.

        Args:
            text: Raw OCR text

        Returns:
            Dictionary with ``patient`` (name, dob, id, gender; ``None`` when
            not found), ``document_type`` and an empty ``extracted_fields``
            dictionary.
        """
        patient: Dict[str, Any] = {
            "name": None,
            "dob": None,
            "id": None,
            "gender": None,
        }

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            lowered = line.lower()
            for field, pattern in self._PATIENT_FIELD_PATTERNS:
                if pattern.search(lowered):
                    value = self._extract_after_colon(line)
                    # A bare label such as "Name:" carries no value; do not
                    # let it wipe out something captured earlier.
                    if value:
                        patient[field] = value
                    break

        return {
            "patient": patient,
            "document_type": self._detect_document_type(text),
            "extracted_fields": {},
        }

    def _detect_document_type(self, text: str) -> str:
        """
        Attempt to detect the type of healthcare document.

        Checks are made in a fixed order and the first match wins:
        insurance card, prescription, lab result, discharge summary.

        Args:
            text: Raw OCR text

        Returns:
            One of ``"insurance_card"``, ``"prescription"``,
            ``"lab_result"``, ``"discharge_summary"`` or ``"unknown"``.
        """
        text_lower = text.lower()

        # "card" must be a whole word so "cardiology"/"cardiac" do not count.
        if "insurance" in text_lower and (
            self._CARD_WORD.search(text_lower) or "policy" in text_lower
        ):
            return "insurance_card"
        if "prescription" in text_lower:
            return "prescription"
        # "lab" must be a whole word so "available"/"label" do not count.
        if self._LAB_WORD.search(text_lower) and (
            "result" in text_lower or "report" in text_lower
        ):
            return "lab_result"
        if "discharge" in text_lower and "summary" in text_lower:
            return "discharge_summary"
        return "unknown"

    def _extract_after_colon(self, text: str) -> str:
        """Extract the content after the first colon in a string.

        Returns the whole string, stripped, when it contains no colon.
        """
        if ":" in text:
            return text.split(":", 1)[1].strip()
        return text.strip()

    def _calculate_avg_confidence(self, ocr_data: Dict) -> float:
        """
        Calculate average confidence score from OCR data.

        Confidence entries may arrive as ints, floats or numeric strings, so
        each one is converted to ``float`` first.  Negative entries (the -1
        placeholder) and entries that cannot be parsed are skipped.

        Args:
            ocr_data: Dictionary containing OCR data from pytesseract

        Returns:
            Average confidence score as a percentage, or 0.0 when there is
            no usable confidence value.
        """
        confidences = []
        for raw in ocr_data.get('conf', []):
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                confidences.append(value)

        if not confidences:
            return 0.0
        return sum(confidences) / len(confidences)