OCR Text Recognition for Multiple Targeted Image Regions
Implementation Approach
This solution leverages the Tesseract OCR engine to extract text from specific regions of an image, with multithreading implemented to optimize processing speed. The script accepts command-line inputs for the image path and target regions, formatted as follows:
# Command syntax: python ocr_extractor.py <image_path> <x1 y1 w1 h1> <x2 y2 w2 h2> ...
python ocr_extractor.py sample_image.png 130 478 456 60 195 560 480 60 195 640 480 60
Python Implementation
Dependencies and Setup
Install required Python packages:
pip install pillow
pip install pytesseract
Ensure Tesseract OCR is installed, including the `eng` and `chi_sim` language data used by the OCR configuration, and update the executable path in the code below.
Code and Explanation
import sys
import io
import concurrent.futures
from PIL import Image
import pytesseract
# Configure UTF-8 output: re-wrap stdout's underlying byte buffer so that printed
# OCR text (which may contain Chinese characters) is encoded as UTF-8 instead of
# the console's default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Set Tesseract OCR executable path (a Windows-style install location is
# hard-coded here; adjust it to match the local installation).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR configuration string handed to Tesseract on every call:
#   --oem 3         default OCR engine mode (chosen from what is available;
#                   this is NOT the legacy-only engine, which is --oem 0)
#   --psm 6         assume a single uniform block of text
#   -l eng+chi_sim  recognize English + Simplified Chinese
ocr_config = r'--oem 3 --psm 6 -l eng+chi_sim'
def extract_targeted_text(image_path, regions):
    """
    Run OCR over several rectangular regions of one image using a thread pool.

    :param image_path: Path to the target image
    :param regions: List of (x, y, width, height) tuples defining target regions
    :return: Dictionary mapping each region's index in ``regions`` to its extracted text
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Queue every region before waiting on any of them so the pool can
        # work on them concurrently.
        pending = [
            pool.submit(extract_single_region, image_path, box)
            for box in regions
        ]
        # Gather in submission order; result() blocks until the job finishes
        # and re-raises any exception raised inside the worker.
        return {position: job.result() for position, job in enumerate(pending)}
def extract_single_region(image_path, region):
    """
    OCR a single rectangular region of an image.

    :param image_path: Path to the target image
    :param region: (x, y, width, height) tuple describing the area to read
    :return: Recognized text with leading and trailing whitespace removed
    """
    left, top, width, height = region
    # crop() takes the box as (left, upper, right, lower) edges, so the
    # width/height pair is converted to the far edges here.
    crop_box = (left, top, left + width, top + height)
    with Image.open(image_path) as source:
        recognized = pytesseract.image_to_string(source.crop(crop_box), config=ocr_config)
    return recognized.strip()
def main():
    """
    Command-line entry point.

    Reads ``<image_path> <x1 y1 w1 h1> <x2 y2 w2 h2> ...`` from ``sys.argv``,
    runs OCR on every region, and prints one result per region in the order
    the regions were given.

    Prints the usage line and returns when the image path is missing. Prints
    "Invalid region coordinates." and returns when a coordinate is not an
    integer or when the coordinates do not form complete groups of four.
    """
    if len(sys.argv) < 2:
        print("Usage: python ocr_extractor.py <image_path> <x1 y1 w1 h1> <x2 y2 w2 h2> ...")
        return

    img_path = sys.argv[1]
    raw_values = sys.argv[2:]

    # Every region needs exactly four numbers. Without this check a trailing
    # partial group would escape as an uncaught IndexError instead of the
    # friendly error message.
    if len(raw_values) % 4 != 0:
        print("Invalid region coordinates.")
        return

    try:
        numbers = [int(value) for value in raw_values]
    except ValueError:
        print("Invalid region coordinates.")
        return

    # Group the flat list into (x, y, w, h) tuples.
    regions = [tuple(numbers[i:i + 4]) for i in range(0, len(numbers), 4)]

    results = extract_targeted_text(img_path, regions)
    for idx, _ in enumerate(regions):
        print(results[idx])


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Java Integration
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Launches the Python OCR script as a child process, echoes its output with
 * all whitespace removed from each line, and reports the exit status and the
 * elapsed wall-clock time.
 */
public class OCRExtractor {
    /**
     * Runs {@code ocr_extractor.py} against a fixed sample image and a fixed
     * list of regions.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();

        // Arguments are passed as separate tokens rather than one command
        // string, so a path containing spaces is not split apart.
        String[] command = {
            "python", "ocr_extractor.py", "C:\\test_images\\sample_image.png",
            "130", "478", "456", "60",
            "195", "560", "480", "60",
            "195", "640", "480", "60",
            "195", "727", "480", "60",
            "195", "820", "480", "60",
            "195", "905", "480", "60",
            "195", "995", "480", "60",
            "195", "1085", "480", "60",
            "195", "1176", "480", "60"
        };

        try {
            ProcessBuilder builder = new ProcessBuilder(command);
            // Send the child's stderr straight to this process's stderr.
            // Draining stdout to the end before touching stderr can deadlock
            // once the child fills the stderr pipe buffer.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process process = builder.start();

            // The Python script forces UTF-8 on its stdout, so decode with
            // UTF-8 explicitly instead of the platform default charset.
            try (BufferedReader outputReader = new BufferedReader(
                    new InputStreamReader(process.getInputStream(),
                            java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = outputReader.readLine()) != null) {
                    // Strip every whitespace character from each OCR line.
                    System.out.println(line.replaceAll("\\s", ""));
                }
            }

            int exitCode = process.waitFor();
            if (exitCode == 0) {
                System.out.println("OCR process completed successfully");
            } else {
                System.out.println("OCR process failed with error code: " + exitCode);
            }
            System.out.println((System.currentTimeMillis() - startTime) + "ms elapsed.");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}