What’s New?
- PDF Support – Seamlessly extract data from PDFs, just like images.
Inherent Limitations:
- The layout and page size of the items inside the PDF must be consistent.
- No caching process, so it’s not memory efficient. The program’s stability depends on your processor and available memory.
- The Tesseract library is quite old but still gets the job done—if the PDF quality is good, the output will be too.
Fair Warning:
The output will not be perfect, so it needs to be reviewed. If you want close to 100% accuracy, try adapting this code to use Azure Document Vision or AWS Textract. Both are billed on a pay-as-you-go basis and would cost around 10 USD per 1,000 pages.
Requirements:
pip install Pillow pytesseract pandas openpyxl clipboard opencv-python numpy pdf2image
- The packages listed in the pip command above
- Python 3.xx series with installation in PATH
- Tesseract OCR and Poppler 24.08.0 installed under the Program Files path, along with the Tesseract language models
- A reasonable computer (needs a 16:9 Full HD monitor) – I have not tested it on any other setup, so I cannot comment further on this requirement
- Some patience
Link to download the required data is already shared in the previous post
Code
import os
import re
import tempfile
import tkinter as tk
from tkinter import ttk, filedialog, messagebox

import clipboard
import cv2
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageTk, ImageEnhance
# Configure paths - UPDATE THESE TO YOUR INSTALLATION
# poppler_path is passed to pdf2image's convert_from_path when rendering PDFs;
# tesseract_cmd tells pytesseract which Tesseract executable to run.
poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
class OCRBatchApp:
    """Tkinter GUI for template-guided batch OCR over images and PDFs.

    The user loads a reference page, drags rectangles ("snips") over the
    fields of interest, and the app then locates every snip on each page of
    each imported file with OpenCV template matching and reads the matched
    region with Tesseract. Results are listed in a table and can be copied to
    the clipboard or exported to Excel.
    """

    # Minimum normalized correlation for a template match to be OCR'd.
    MATCH_THRESHOLD = 0.5
    # Selections narrower or shorter than this many pixels are rejected.
    MIN_SNIP_SIZE = 5
    # Resolution used when rasterizing PDF pages.
    PDF_DPI = 300

    # Typographic characters normalized to their ASCII equivalents.
    _CHAR_MAP = str.maketrans({
        '‘': "'", '’': "'", '“': '"', '”': '"', '—': '-',
    })
    # "Rs" / "rupee(s)" only when not embedded in a longer Latin word, so
    # "Rs500" is converted but "HRs" is left alone.
    _CURRENCY_RE = re.compile(r'(?<![A-Za-z])(?:Rs|rupees?)(?![A-Za-z])')

    def __init__(self, root):
        """Build the main window on *root* and initialize empty state."""
        self.root = root
        self.root.title("Smart OCR Tool")
        self.root.geometry("1400x900")
        self.root.minsize(1200, 800)

        # Application state
        self.templates = []  # (template_img, template_name, original_size)
        self.image_paths = []
        self.current_image = None
        self.current_path = None
        self.tk_image = None  # reference kept so Tk does not drop the photo
        self.selected_snip = None
        self.custom_config = r'--oem 3 --psm 6 -l eng+script/Devanagari'
        self.enhance_factor = 1.5  # Image enhancement factor

        # Rubber-band selection state
        self.rect = None
        self.start_x = 0
        self.start_y = 0

        # GUI initialization
        self.create_widgets()
        self.setup_styles()
        self.setup_menus()
        # Give the empty table its headings right away.
        self.update_table_columns()

    def create_widgets(self):
        """Create the canvas, results table, side lists, buttons and status bar."""
        # Main container
        main_pane = ttk.PanedWindow(self.root, orient=tk.HORIZONTAL)
        main_pane.pack(fill=tk.BOTH, expand=True)

        # Left Panel (Image and Results)
        left_pane = ttk.PanedWindow(main_pane, orient=tk.VERTICAL)
        main_pane.add(left_pane, weight=3)

        # Image Canvas with Scrollbars
        img_frame = ttk.Frame(left_pane)
        left_pane.add(img_frame, weight=2)
        self.canvas = tk.Canvas(img_frame, bg='white', cursor="cross")
        h_scroll = ttk.Scrollbar(img_frame, orient=tk.HORIZONTAL, command=self.canvas.xview)
        v_scroll = ttk.Scrollbar(img_frame, orient=tk.VERTICAL, command=self.canvas.yview)
        self.canvas.configure(xscrollcommand=h_scroll.set, yscrollcommand=v_scroll.set)
        self.canvas.grid(row=0, column=0, sticky='nsew')
        v_scroll.grid(row=0, column=1, sticky='ns')
        h_scroll.grid(row=1, column=0, sticky='ew')
        img_frame.grid_rowconfigure(0, weight=1)
        img_frame.grid_columnconfigure(0, weight=1)

        # Results Table
        table_frame = ttk.Frame(left_pane)
        left_pane.add(table_frame, weight=1)
        self.tree = ttk.Treeview(table_frame, columns=['File'], show='headings')
        tree_scroll_y = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=self.tree.yview)
        tree_scroll_x = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=self.tree.xview)
        self.tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set)
        self.tree.grid(row=0, column=0, sticky='nsew')
        tree_scroll_y.grid(row=0, column=1, sticky='ns')
        tree_scroll_x.grid(row=1, column=0, sticky='ew')
        table_frame.grid_rowconfigure(0, weight=1)
        table_frame.grid_columnconfigure(0, weight=1)

        # Right Panel (Snips and Files)
        right_pane = ttk.PanedWindow(main_pane, orient=tk.VERTICAL)
        main_pane.add(right_pane, weight=1)

        # Snips Panel
        snip_frame = ttk.Frame(right_pane)
        right_pane.add(snip_frame, weight=1)
        ttk.Label(snip_frame, text="Snips").pack(pady=5)
        self.snip_list = tk.Listbox(snip_frame, selectmode=tk.SINGLE, height=10)
        snip_scroll = ttk.Scrollbar(snip_frame, orient=tk.VERTICAL, command=self.snip_list.yview)
        self.snip_list.configure(yscrollcommand=snip_scroll.set)
        self.snip_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        snip_scroll.pack(side=tk.RIGHT, fill=tk.Y)

        # Files Panel
        file_frame = ttk.Frame(right_pane)
        right_pane.add(file_frame, weight=1)
        ttk.Label(file_frame, text="Files").pack(pady=5)
        self.file_list = tk.Listbox(file_frame, height=15)
        file_scroll = ttk.Scrollbar(file_frame, orient=tk.VERTICAL, command=self.file_list.yview)
        self.file_list.configure(yscrollcommand=file_scroll.set)
        self.file_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        file_scroll.pack(side=tk.RIGHT, fill=tk.Y)

        # Control Panel
        control_frame = ttk.Frame(self.root)
        control_frame.pack(fill=tk.X, padx=10, pady=5)
        ttk.Button(control_frame, text="Import Files", command=self.select_files).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="New Snip", command=self.start_new_snip).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Run OCR", command=self.process_batch).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Export Excel", command=self.export_to_excel).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Clear All", command=self.clear_all).pack(side=tk.LEFT)

        # Status Bar
        self.status = ttk.Label(self.root, text="Ready", relief=tk.SUNKEN)
        self.status.pack(side=tk.BOTTOM, fill=tk.X)

    def setup_styles(self):
        """Apply row height and button padding to the ttk styles."""
        style = ttk.Style()
        style.configure("Treeview", rowheight=25)
        style.configure("TButton", padding=6)

    def setup_menus(self):
        """Create the right-click menus and wire up all event bindings."""
        self.snip_menu = tk.Menu(self.root, tearoff=0)
        self.snip_menu.add_command(label="Delete", command=self.delete_selected_snip)
        self.snip_menu.add_command(label="Edit", command=self.edit_selected_snip)

        self.file_menu = tk.Menu(self.root, tearoff=0)
        self.file_menu.add_command(label="Delete", command=self.delete_selected_file)

        self.tree_menu = tk.Menu(self.root, tearoff=0)
        self.tree_menu.add_command(label="Copy", command=self.copy_to_clipboard)
        self.tree_menu.add_command(label="Export Excel", command=self.export_to_excel)

        # Event Bindings
        self.snip_list.bind("<<ListboxSelect>>", self.on_snip_select)
        self.snip_list.bind("<Button-3>", lambda e: self.snip_menu.post(e.x_root, e.y_root))
        self.file_list.bind("<Double-1>", self.load_selected_file)
        self.file_list.bind("<Button-3>", lambda e: self.file_menu.post(e.x_root, e.y_root))
        self.tree.bind("<Double-1>", self.on_tree_double_click)
        self.tree.bind("<Button-3>", lambda e: self.tree_menu.post(e.x_root, e.y_root))
        self.canvas.bind("<ButtonPress-1>", self.start_selection)
        self.canvas.bind("<B1-Motion>", self.update_selection)
        self.canvas.bind("<ButtonRelease-1>", self.finalize_selection)

    def select_files(self):
        """Ask for image/PDF files and append the new ones to the file list."""
        files = filedialog.askopenfilenames(filetypes=[
            ("Documents", "*.png *.jpg *.jpeg *.pdf"),
            ("All files", "*.*")
        ])
        if not files:
            return
        added = 0
        for f in files:
            if f not in self.image_paths:
                self.image_paths.append(f)
                self.file_list.insert(tk.END, os.path.basename(f))
                added += 1
        # Report what was actually added, not how many were picked.
        message = f"Loaded {added} files"
        skipped = len(files) - added
        if skipped:
            message += f" ({skipped} already in the list)"
        self.status.config(text=message)

    def delete_selected_file(self):
        """Remove the file highlighted in the Files list."""
        selection = self.file_list.curselection()
        if selection:
            index = selection[0]
            del self.image_paths[index]
            self.file_list.delete(index)
            self.status.config(text=f"Deleted file {index+1}")

    def load_selected_file(self, event):
        """Show the double-clicked file on the canvas."""
        selection = self.file_list.curselection()
        if selection:
            # load_image() already redraws the snip overlays.
            self.load_image(self.image_paths[selection[0]])

    def _open_pages(self, path, first_page_only=False):
        """Return the pages of *path* as a list of PIL images.

        PDFs are rasterized with pdf2image (all pages, or only page 1 when
        *first_page_only* is true); any other file is opened as one image.
        """
        if path.lower().endswith('.pdf'):
            options = {'poppler_path': poppler_path, 'dpi': self.PDF_DPI}
            if first_page_only:
                options.update(first_page=1, last_page=1)
            return convert_from_path(path, **options)
        # convert() loads the pixels, so the file handle can be closed here.
        with Image.open(path) as img:
            return [img.convert('RGB')]

    def _enhance(self, image):
        """Return *image* sharpened by ``self.enhance_factor``."""
        return ImageEnhance.Sharpness(image).enhance(self.enhance_factor)

    def load_image(self, path):
        """Load *path* (first page for PDFs), sharpen it and show it on the canvas."""
        try:
            pages = self._open_pages(path, first_page_only=True)
            image = self._enhance(pages[0].convert('RGB'))
            photo = ImageTk.PhotoImage(image)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load file: {str(e)}")
            return
        # Only commit the new state once the file has actually loaded.
        self.current_path = path
        self.current_image = image
        self.tk_image = photo
        self.canvas.config(scrollregion=(0, 0, *image.size))
        self.canvas.delete("all")
        self.canvas.create_image(0, 0, anchor=tk.NW, image=self.tk_image)
        self.redraw_snips()

    def _locate_template(self, gray_page, template, original_size, page_width):
        """Find *template* on a grayscale page.

        The template is rescaled by the ratio between *page_width* and the
        width of the page it was cut from. Returns ``(confidence, box)`` where
        *box* is ``(x1, y1, x2, y2)`` of the best match. OpenCV raises if the
        scaled template is larger than the page.
        """
        scale = page_width / original_size[0]
        if scale != 1.0:
            template = cv2.resize(
                template, None, fx=scale, fy=scale,
                interpolation=cv2.INTER_CUBIC
            )
        gray_template = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
        result = cv2.matchTemplate(gray_page, gray_template, cv2.TM_CCOEFF_NORMED)
        _, confidence, _, (x, y) = cv2.minMaxLoc(result)
        height, width = gray_template.shape[:2]
        return confidence, (x, y, x + width, y + height)

    def _ocr_page(self, pil_img):
        """Return one OCR result string per snip for the RGB page *pil_img*."""
        # Snips are cut from the sharpened preview, so match against a page
        # sharpened the same way; the OCR crop still comes from the raw page.
        gray_page = cv2.cvtColor(np.array(self._enhance(pil_img)), cv2.COLOR_RGB2GRAY)
        page_width = pil_img.size[0]
        cells = []
        for template, _name, original_size in self.templates:
            try:
                confidence, region = self._locate_template(
                    gray_page, template, original_size, page_width
                )
                if confidence < self.MATCH_THRESHOLD:
                    cells.append("<Low Confidence>")
                    continue
                processed_crop = self.preprocess_image(pil_img.crop(region))
                text = pytesseract.image_to_string(
                    processed_crop,
                    config=self.custom_config
                )
                text = self.clean_text(text)
                cells.append(text if text else "<No Text>")
            except Exception:
                # One bad snip must not abort the rest of the page.
                cells.append("<Error>")
        return cells

    def process_batch(self):
        """Run template matching and OCR over every page of every imported file."""
        if not self.templates:
            messagebox.showerror("Error", "No snips defined!")
            return
        self.tree.delete(*self.tree.get_children())
        # Reconfiguring the columns alone would drop the headings.
        self.update_table_columns()

        total = len(self.image_paths)
        failed = 0
        for position, path in enumerate(self.image_paths, start=1):
            file_name = os.path.basename(path)
            self.status.config(text=f"Processing {position}/{total}: {file_name}")
            self.root.update_idletasks()  # let the status bar repaint
            try:
                for page_num, page in enumerate(self._open_pages(path)):
                    cells = self._ocr_page(page.convert('RGB'))
                    row = [f"{file_name} (pg {page_num+1})", *cells, path]
                    self.tree.insert('', tk.END, values=row)
            except Exception as e:
                failed += 1
                messagebox.showerror("Error", f"Error processing {path}: {str(e)}")

        summary = f"Processed {total - failed} files"
        if failed:
            summary += f" ({failed} failed)"
        self.status.config(text=summary)

    def preprocess_image(self, image):
        """Return a binarized copy of the PIL *image* prepared for Tesseract."""
        # PIL arrays are RGB, so convert with the RGB (not BGR) code.
        img = np.array(image.convert('RGB'))
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        # Enhance contrast using CLAHE
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        # Denoise and threshold
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        _, threshold = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return Image.fromarray(threshold)

    def clean_text(self, text):
        """Normalize OCR output to a single clean line.

        Curly quotes and em dashes become ASCII, standalone "Rs"/"rupee(s)"
        become the rupee sign, a backslash-escaped dollar loses its backslash,
        and every run of whitespace collapses to one space.
        """
        text = text.translate(self._CHAR_MAP)
        text = self._CURRENCY_RE.sub('₹', text)
        text = text.replace('\\$', '$')
        # split() with no argument also strips the leading/trailing whitespace.
        return ' '.join(text.split())

    def start_new_snip(self):
        """Leave edit mode so the next drag creates a new snip."""
        self.selected_snip = None
        self.snip_list.selection_clear(0, tk.END)
        self.status.config(text="Ready to create new snip region")

    def redraw_snips(self):
        """Redraw every snip's best-match rectangle over the current image."""
        self.canvas.delete("snip")
        if self.current_image is None:
            return
        gray_page = cv2.cvtColor(np.array(self.current_image), cv2.COLOR_RGB2GRAY)
        page_width = self.current_image.size[0]
        for idx, (template, _name, original_size) in enumerate(self.templates):
            try:
                _, box = self._locate_template(
                    gray_page, template, original_size, page_width
                )
            except Exception:
                # Overlays are best-effort: skip a snip that cannot be matched
                # (e.g. it is larger than this page).
                continue
            self.canvas.create_rectangle(
                *box,
                outline='red' if idx == self.selected_snip else 'blue',
                width=2, tags="snip"
            )

    def on_snip_select(self, event):
        """Mark the clicked snip as the one to edit and highlight it."""
        selection = self.snip_list.curselection()
        if selection:
            self.selected_snip = selection[0]
            self.redraw_snips()
            self.status.config(text=f"Selected snip {self.selected_snip+1} for editing")

    def edit_selected_snip(self):
        """Prompt the user to redraw the currently selected snip."""
        if self.selected_snip is not None:
            self.status.config(text="Drag to edit selected snip region")
        else:
            messagebox.showinfo("Info", "Please select a snip to edit")

    def start_selection(self, event):
        """Begin a rubber-band rectangle at the mouse position."""
        self.canvas.delete("selection")  # drop any leftover rubber band
        self.start_x = self.canvas.canvasx(event.x)
        self.start_y = self.canvas.canvasy(event.y)
        self.rect = self.canvas.create_rectangle(
            self.start_x, self.start_y, self.start_x, self.start_y,
            outline='green', width=2, tags="selection"
        )

    def update_selection(self, event):
        """Stretch the rubber-band rectangle to the mouse position."""
        if self.rect is None:
            return
        self.canvas.coords(
            self.rect,
            self.start_x, self.start_y,
            self.canvas.canvasx(event.x),
            self.canvas.canvasy(event.y)
        )

    @staticmethod
    def _clamp_region(xa, ya, xb, yb, width, height):
        """Return ``(x1, y1, x2, y2)`` ordered and clipped to a width x height image."""
        x1, x2 = sorted((int(xa), int(xb)))
        y1, y2 = sorted((int(ya), int(yb)))
        x1, x2 = max(0, min(x1, width)), max(0, min(x2, width))
        y1, y2 = max(0, min(y1, height)), max(0, min(y2, height))
        return x1, y1, x2, y2

    def finalize_selection(self, event):
        """Turn the dragged rectangle into a new snip or replace the selected one."""
        try:
            if self.current_image is None:
                self.status.config(text="Load a file before creating a snip")
                return
            width, height = self.current_image.size
            # Clip to the image: a negative coordinate would wrap the slice.
            x1, y1, x2, y2 = self._clamp_region(
                self.start_x, self.start_y,
                self.canvas.canvasx(event.x), self.canvas.canvasy(event.y),
                width, height
            )
            if x2 - x1 < self.MIN_SNIP_SIZE or y2 - y1 < self.MIN_SNIP_SIZE:
                messagebox.showerror("Error", "Selection too small!")
                return

            cv_image = cv2.cvtColor(np.array(self.current_image), cv2.COLOR_RGB2BGR)
            # Copy so the snip does not keep the whole page array alive.
            template = cv_image[y1:y2, x1:x2].copy()
            original_size = self.current_image.size

            editing = (
                self.selected_snip is not None
                and self.selected_snip < len(self.templates)
            )
            if editing:
                # Update existing snip
                self.templates[self.selected_snip] = (
                    template, f"Snip {self.selected_snip+1}", original_size
                )
                self.status.config(text=f"Updated snip {self.selected_snip+1}")
            else:
                # Add new snip
                self.templates.append(
                    (template, f"Snip {len(self.templates)+1}", original_size)
                )
                self.status.config(text="Created new snip")

            self.update_snip_list()
            self.update_table_columns()
            self.redraw_snips()
            self.selected_snip = None
        except Exception as e:
            messagebox.showerror("Error", f"Snip creation failed: {str(e)}")
        finally:
            # Runs on every exit path, including the early returns above.
            self.canvas.delete("selection")
            self.rect = None

    def on_tree_double_click(self, event):
        """Show the file behind the double-clicked result row."""
        selected = self.tree.selection()
        if not selected:
            return
        values = self.tree.item(selected[0])['values']
        if not values:
            return
        path = str(values[-1])  # hidden FullPath column
        if path != self.current_path:
            self.load_image(path)  # also redraws the snips
        else:
            self.redraw_snips()

    def update_snip_list(self):
        """Refill the Snips listbox from ``self.templates``."""
        self.snip_list.delete(0, tk.END)
        for _, name, _ in self.templates:
            self.snip_list.insert(tk.END, name)

    def update_table_columns(self):
        """Rebuild the table columns: File, one per snip, and a hidden FullPath."""
        columns = ['File'] + [name for _, name, _ in self.templates] + ['FullPath']
        self.tree.config(columns=columns)
        for col in columns:
            self.tree.heading(col, text=col)
            self.tree.column(col, width=120, stretch=tk.YES)
        # FullPath only carries the path for double-click; keep it collapsed.
        self.tree.column("FullPath", width=0, minwidth=0, stretch=tk.NO)

    def delete_selected_snip(self):
        """Delete the snip highlighted in the Snips list and renumber the rest."""
        selection = self.snip_list.curselection()
        if selection:
            index = selection[0]
            del self.templates[index]
            # Names are positional; renumber so a later "Snip N" cannot
            # collide with an existing column name.
            self.templates = [
                (template, f"Snip {position}", size)
                for position, (template, _, size) in enumerate(self.templates, start=1)
            ]
            self.selected_snip = None  # the old index is no longer valid
            self.update_snip_list()
            self.update_table_columns()
            self.redraw_snips()
            self.status.config(text=f"Deleted snip {index+1}")

    def copy_to_clipboard(self):
        """Copy the selected result row (without FullPath) as tab-separated text."""
        selected = self.tree.selection()
        if selected:
            item = self.tree.item(selected[0])
            clipboard.copy("\t".join(str(v) for v in item['values'][:-1]))
            self.status.config(text="Copied to clipboard!")

    def export_to_excel(self):
        """Write the result table (without FullPath) to an .xlsx file."""
        rows = self.tree.get_children()
        if not rows:
            messagebox.showinfo("Info", "No results to export. Run OCR first.")
            return
        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")]
        )
        if not file_path:
            return
        try:
            columns = ['File'] + [name for _, name, _ in self.templates]
            data = [
                dict(zip(columns, self.tree.item(item)['values'][:-1]))
                for item in rows
            ]
            pd.DataFrame(data).to_excel(file_path, index=False)
            self.status.config(text=f"Exported to {os.path.basename(file_path)}")
        except Exception as e:
            messagebox.showerror("Export Error", f"Failed to export: {str(e)}")

    def clear_all(self):
        """Reset snips, files, results and the canvas to the initial state."""
        self.templates = []
        self.image_paths = []
        self.current_image = None
        # Without this, re-importing the same file would not reload it.
        self.current_path = None
        self.tk_image = None
        self.selected_snip = None
        self.tree.delete(*self.tree.get_children())
        self.update_table_columns()
        self.snip_list.delete(0, tk.END)
        self.file_list.delete(0, tk.END)
        self.canvas.delete("all")
        self.canvas.config(scrollregion=(0, 0, 0, 0))
        self.status.config(text="All data cleared")
if __name__ == "__main__":
    # Build the Tk root window, attach the app, and enter the event loop.
    root = tk.Tk()
    app = OCRBatchApp(root)
    root.mainloop()
Hi, Karthik here. Please feel free to comment. Please enter your name and email. The email is just a formality so that bots are filtered out. Your email won’t be disclosed.