Skip to content

Audit – Vouching Documentation AID

Hi all, this morning (22 March 2025) it struck me to build a small aid that assists with vouching.

Automating vouching can save us countless hours—this Python tool leverages Tesseract OCR to extract similar areas from structured invoices or documents, reducing manual effort and improving efficiency.

Back in 11th grade, I spent weeks building a Python-based website maker using Kivy and Requests. Fast forward to today, and I built a tool that extracts text from images, searches for similar patterns in a folder, and organizes the results.

What Does This Tool Do?

🔹 Helps verify EPF & ESI challans, export invoices, power bills, and other structured documents.
🔹 Extracts data and exports results to Excel or clipboard.
🔹 Works completely offline—no data leaks, ensuring full privacy.
🔹 (PDF support coming soon!)

Pre Requisites

  1. Python 3.xx series with installation in PATH
  2. Tesseract OCR software and Poppler, both installed in the Program Files folder
  3. The pytesseract, Pillow (PIL), pandas, clipboard, opencv-python, numpy and pdf2image libraries (tempfile is part of the Python standard library)
  4. A reasonable computer (a 16:9 Full HD monitor is needed) – I have not tested it on any other computer, so I cannot comment further on this requirement

Installing the requisites

  1. Python – from Python Releases for Windows | Python.org (ensure that PATH is enabled)
  2. Tesseract OCR installer (the engine that pytesseract calls) from https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe
  3. Install the required packages by running the pip command shown below this list (openpyxl is needed by pandas for the Excel export)
  4. Poppler – extract it into the Program Files folder – https://blog.withkarthik.com/wp-content/uploads/2025/03/poppler-24.08.0.zip (This is the link to the Poppler file)
pip install pillow pytesseract pandas clipboard opencv-python numpy pdf2image openpyxl

The Code

import os
import re
import tempfile
import tkinter as tk
from tkinter import ttk, filedialog, messagebox

import clipboard
import cv2
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageTk

# Poppler path - the folder that contains the Poppler binaries (used by pdf2image).
# Set the POPPLER_PATH environment variable to override the default below; if you
# edit the default instead, make sure the Poppler version in the path is correct.
poppler_path = os.environ.get(
    "POPPLER_PATH", r'C:\Program Files\poppler-24.08.0\Library\bin'
)
# Tesseract executable used by pytesseract. Set TESSERACT_CMD to override the
# default Windows install location.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

class OCRBatchApp:
    """Tkinter application that OCRs the same regions across a folder of images.

    Workflow: pick a folder, drag rectangles ("snips") on the preview image,
    then run the batch.  Each snip is stored as a BGR template; for every file
    the best template match is located with OpenCV, cropped, and passed to
    Tesseract.  Results are listed in a table and can be copied to the
    clipboard or exported to Excel.

    Attributes:
        folder_path: ``tk.StringVar`` bound to the folder entry box.
        templates: List of ``(bgr_array, name)`` tuples, one per snip.  Names
            are always ``"Snip <position>"`` so they stay unique.
        image_paths: Files collected from the selected folder.
        current_image: PIL image shown on the canvas, or ``None``.
        selected_snip: Index of the snip being edited, or ``None``.
        results: Maps a Treeview item id to the exact row list inserted for
            it.  Reading values back from the Treeview converts digit-only
            strings to ``int`` (dropping leading zeros), so copy/export use
            this store instead.
    """

    # File extensions collected by ``load_images``.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
    # Selections narrower or shorter than this many pixels are rejected.
    MIN_SNIP_SIZE = 5
    # Upper bound on per-file error lines shown in the batch summary dialog.
    MAX_ERRORS_SHOWN = 10

    # Common OCR artifacts: dash look-alikes become '-', underscores are dropped.
    _ARTIFACT_TABLE = str.maketrans({'—': '-', '~': '-', '_': None})
    # Currency words that are not part of a longer word ("USERs" must not match).
    _CURRENCY_WORD = re.compile(r'(?<![A-Za-z])(?:Rs|rupees|rupee)(?![A-Za-z])')

    def __init__(self, root):
        """Configure the main window and build the user interface.

        Args:
            root: The ``tk.Tk`` (or ``Toplevel``) window hosting the application.
        """
        self.root = root
        self.root.title("Smart OCR Tool")
        self.root.geometry("1400x900")
        self.root.minsize(1200, 800)

        # Application state
        self.folder_path = tk.StringVar()
        self.templates = []  # Stores (template_img, template_name)
        self.image_paths = []
        self.current_image = None
        self.tk_image = None  # Keeps the PhotoImage alive while it is displayed
        self.selected_snip = None  # Index of selected snip for editing
        self.results = {}  # Treeview item id -> row values as inserted

        # Rubber-band selection state (set while the mouse button is held)
        self.rect = None
        self.start_x = None
        self.start_y = None

        # GUI initialization
        self.create_widgets()
        self.setup_styles()

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _open_document(path):
        """Return ``path`` as a PIL image.

        A ``.pdf`` path is rendered with pdf2image and only its first page is
        returned.  Note that ``load_images`` collects raster images only, so
        the PDF branch is used only if PDF paths reach this helper by other
        means.

        Raises:
            ValueError: If a PDF yields no pages.
        """
        if path.lower().endswith('.pdf'):
            pages = convert_from_path(
                path, poppler_path=poppler_path, first_page=1, last_page=1
            )
            if not pages:
                raise ValueError("PDF has no pages")
            return pages[0]
        return Image.open(path)

    @staticmethod
    def _to_bgr(pil_img):
        """Convert a PIL image of any mode to a 3-channel BGR array.

        Normalising to RGB first makes grayscale, palette and RGBA images
        usable; converting their raw arrays directly would fail or mislead.
        """
        return cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)

    @staticmethod
    def _locate(gray_image, template):
        """Return the ``(x1, y1, x2, y2)`` box of the best match of ``template``.

        Args:
            gray_image: Single-channel image to search in.
            template: BGR template array.

        Raises:
            cv2.error: E.g. when the template is larger than the image.
        """
        gray_template = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
        scores = cv2.matchTemplate(gray_image, gray_template, cv2.TM_CCOEFF_NORMED)
        _, _, _, (x, y) = cv2.minMaxLoc(scores)
        height, width = gray_template.shape
        return (x, y, x + width, y + height)

    @staticmethod
    def _clamp_box(x1, y1, x2, y2, width, height):
        """Order two corner points and clamp them to a ``width`` x ``height`` image.

        Returns:
            ``(left, top, right, bottom)`` as ints within the image bounds.
        """
        left, right = sorted((x1, x2))
        top, bottom = sorted((y1, y2))
        left = min(max(int(left), 0), width)
        right = min(max(int(right), 0), width)
        top = min(max(int(top), 0), height)
        bottom = min(max(int(bottom), 0), height)
        return left, top, right, bottom

    def _table_columns(self):
        """Return the table column ids: file name, one per snip, full path."""
        return ['File'] + [name for _, name in self.templates] + ['FullPath']

    def _row_values(self, item_id):
        """Return the row stored for ``item_id``, falling back to the Treeview."""
        stored = self.results.get(item_id)
        if stored is not None:
            return stored
        return list(self.tree.item(item_id)['values'])

    def _clear_results(self):
        """Remove every row from the results table and the backing store."""
        self.tree.delete(*self.tree.get_children())
        self.results.clear()

    def clean_financial_text(self, text):
        """Post-process OCR results for financial data.

        Replaces em dashes and tildes with ``-``, drops underscores, and turns
        the stand-alone words ``Rs``, ``rupee`` and ``rupees`` into ``₹``.
        Matches inside longer words (e.g. ``USERs``) are left untouched.

        Args:
            text: Raw OCR text; ``None`` or empty yields ``""``.

        Returns:
            The cleaned string.
        """
        if not text:
            return ""
        text = text.translate(self._ARTIFACT_TABLE)
        return self._CURRENCY_WORD.sub('₹', text)

    # ------------------------------------------------------------------
    # GUI construction
    # ------------------------------------------------------------------
    def create_widgets(self):
        """Create and lay out all widgets and bind their events."""
        # Control Panel (Top) - packed first so it really sits above the panes
        control_frame = ttk.Frame(self.root)
        control_frame.pack(side=tk.TOP, fill=tk.X, padx=10, pady=5)

        ttk.Label(control_frame, text="Image Folder:").pack(side=tk.LEFT)
        self.entry_folder = ttk.Entry(control_frame, textvariable=self.folder_path, width=40)
        self.entry_folder.pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Browse", command=self.select_folder).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="New Snip", command=self.start_new_snip).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Run OCR", command=self.process_batch).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Clear All", command=self.clear_all).pack(side=tk.LEFT)

        # Status Bar - packed before the expanding pane so it keeps its space
        self.status = ttk.Label(self.root, text="Ready", relief=tk.SUNKEN)
        self.status.pack(side=tk.BOTTOM, fill=tk.X)

        # Main container
        main_pane = ttk.PanedWindow(self.root, orient=tk.HORIZONTAL)
        main_pane.pack(fill=tk.BOTH, expand=True)

        # Left Panel (Image and Results)
        left_pane = ttk.PanedWindow(main_pane, orient=tk.VERTICAL)
        main_pane.add(left_pane, weight=4)

        # Right Panel (Snip List)
        right_pane = ttk.Frame(main_pane)
        main_pane.add(right_pane, weight=1)

        # Image Preview Area
        img_container = ttk.Frame(left_pane)
        left_pane.add(img_container, weight=2)

        self.canvas = tk.Canvas(img_container, bg='white', cursor="cross")
        h_scroll = ttk.Scrollbar(img_container, orient=tk.HORIZONTAL, command=self.canvas.xview)
        v_scroll = ttk.Scrollbar(img_container, orient=tk.VERTICAL, command=self.canvas.yview)
        self.canvas.configure(xscrollcommand=h_scroll.set, yscrollcommand=v_scroll.set)

        self.canvas.grid(row=0, column=0, sticky='nsew')
        v_scroll.grid(row=0, column=1, sticky='ns')
        h_scroll.grid(row=1, column=0, sticky='ew')
        img_container.grid_rowconfigure(0, weight=1)
        img_container.grid_columnconfigure(0, weight=1)

        # Results Table
        table_container = ttk.Frame(left_pane)
        left_pane.add(table_container, weight=1)

        self.tree = ttk.Treeview(table_container, columns=['File'], show='headings')
        tree_scroll_y = ttk.Scrollbar(table_container, orient=tk.VERTICAL, command=self.tree.yview)
        tree_scroll_x = ttk.Scrollbar(table_container, orient=tk.HORIZONTAL, command=self.tree.xview)
        self.tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set)

        self.tree.grid(row=0, column=0, sticky='nsew')
        tree_scroll_y.grid(row=0, column=1, sticky='ns')
        tree_scroll_x.grid(row=1, column=0, sticky='ew')
        table_container.grid_rowconfigure(0, weight=1)
        table_container.grid_columnconfigure(0, weight=1)
        # Give the initial (snip-less) table its headings as well.
        self.update_table_columns()

        # Snip List Panel
        snip_container = ttk.Frame(right_pane)
        snip_container.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

        self.snip_list = tk.Listbox(snip_container, selectmode=tk.SINGLE)
        snip_scroll = ttk.Scrollbar(snip_container, orient=tk.VERTICAL, command=self.snip_list.yview)
        self.snip_list.configure(yscrollcommand=snip_scroll.set)

        self.snip_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        snip_scroll.pack(side=tk.RIGHT, fill=tk.Y)

        # Context Menus
        self.snip_context_menu = tk.Menu(self.root, tearoff=0)
        self.snip_context_menu.add_command(label="Delete Snip", command=self.delete_selected_snip)

        self.tree_context_menu = tk.Menu(self.root, tearoff=0)
        self.tree_context_menu.add_command(label="Copy to Clipboard", command=self.copy_to_clipboard)
        self.tree_context_menu.add_command(label="Export to Excel", command=self.export_to_excel)

        # Event Bindings
        self.snip_list.bind("<<ListboxSelect>>", self.on_snip_select)
        self.snip_list.bind("<Button-3>", lambda e: self.snip_context_menu.post(e.x_root, e.y_root))
        self.tree.bind("<Button-3>", lambda e: self.tree_context_menu.post(e.x_root, e.y_root))
        self.tree.bind("<Double-1>", self.on_tree_double_click)
        self.canvas.bind("<ButtonPress-1>", self.start_selection)
        self.canvas.bind("<B1-Motion>", self.update_selection)
        self.canvas.bind("<ButtonRelease-1>", self.finalize_selection)

    def setup_styles(self):
        """Apply ttk style tweaks (table row height, accent button colours)."""
        style = ttk.Style()
        style.configure("Treeview", rowheight=25)
        style.configure("Accent.TButton", foreground='white', background='#4a7abc')

    # ------------------------------------------------------------------
    # Folder / image loading
    # ------------------------------------------------------------------
    def select_folder(self):
        """Ask for a folder, load its images and report it in the status bar."""
        folder = filedialog.askdirectory()
        if folder:
            self.folder_path.set(folder)
            self.load_images()
            self.status.config(text=f"Loaded folder: {folder}")

    def load_images(self):
        """Collect the image files of the selected folder and preview the first.

        Files are taken in sorted name order so batch results are reproducible.
        """
        folder = self.folder_path.get()
        try:
            names = sorted(os.listdir(folder))
        except OSError as exc:
            self.image_paths = []
            messagebox.showerror("Error", f"Cannot read folder: {exc}")
            return

        self.image_paths = [
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        ]
        if self.image_paths:
            self.load_image(self.image_paths[0])

    def load_image(self, path):
        """Show the document at ``path`` on the canvas and redraw snip outlines.

        Any failure is reported in a dialog; the previously shown image is
        kept when the new one cannot be opened.
        """
        try:
            image = self._open_document(path)
            photo = ImageTk.PhotoImage(image)
            # Assign only after both steps succeeded so state stays consistent.
            self.current_image, self.tk_image = image, photo
            self.canvas.config(scrollregion=(0, 0, *image.size))
            self.canvas.delete("all")
            self.canvas.create_image(0, 0, anchor=tk.NW, image=photo)
            self.redraw_snips()
        except Exception as exc:
            messagebox.showerror("Error", f"Failed to load image: {exc}")

    def redraw_snips(self):
        """Outline the best match of every snip on the current image.

        The snip selected for editing is drawn in red, the others in blue.
        Snips that cannot be matched (e.g. larger than the image) are skipped.
        """
        self.canvas.delete("snip")
        if self.current_image is None or not self.templates:
            return

        # Convert once; the grayscale image is the same for every template.
        gray_image = cv2.cvtColor(self._to_bgr(self.current_image), cv2.COLOR_BGR2GRAY)
        for idx, (template, _name) in enumerate(self.templates):
            try:
                box = self._locate(gray_image, template)
            except cv2.error:
                continue
            self.canvas.create_rectangle(
                *box,
                outline='red' if idx == self.selected_snip else 'blue',
                width=2, tags="snip"
            )

    # ------------------------------------------------------------------
    # Snip management
    # ------------------------------------------------------------------
    def start_new_snip(self):
        """Leave edit mode so the next selection is added as a new snip."""
        self.selected_snip = None
        self.snip_list.selection_clear(0, tk.END)
        self.status.config(text="Ready to create new snip region")

    def on_snip_select(self, event):
        """Mark the clicked list entry as the snip to be replaced next."""
        selection = self.snip_list.curselection()
        if selection:
            self.selected_snip = selection[0]
            self.status.config(text=f"Selected snip {self.selected_snip+1} for editing")
            # Redraw so the selected snip is actually highlighted in red.
            self.redraw_snips()

    def delete_selected_snip(self):
        """Delete the snip selected in the list and renumber the remaining ones.

        Renumbering keeps names unique; otherwise the next new snip would
        reuse an existing name and collide as a table column / Excel key.
        Existing results are cleared because their columns no longer match.
        """
        selection = self.snip_list.curselection()
        if not selection:
            return

        index = selection[0]
        del self.templates[index]
        self.templates = [
            (template, f"Snip {position}")
            for position, (template, _old_name) in enumerate(self.templates, start=1)
        ]
        self.selected_snip = None
        self._clear_results()
        self.update_snip_list()
        self.update_table_columns()
        self.redraw_snips()
        self.status.config(text=f"Deleted snip {index+1}")

    def on_tree_double_click(self, event):
        """Preview the file belonging to the double-clicked result row."""
        selected = self.tree.selection()
        if selected:
            file_path = self._row_values(selected[0])[-1]
            # load_image already redraws the snip outlines.
            self.load_image(str(file_path))

    def start_selection(self, event):
        """Begin a rubber-band selection at the mouse position."""
        if self.current_image is None:
            return  # Nothing to snip from
        self.start_x = self.canvas.canvasx(event.x)
        self.start_y = self.canvas.canvasy(event.y)
        self.rect = self.canvas.create_rectangle(
            self.start_x, self.start_y, self.start_x, self.start_y,
            outline='green', width=2, tags="selection"
        )

    def update_selection(self, event):
        """Stretch the rubber-band rectangle to the current mouse position."""
        if self.rect is None:
            return
        self.canvas.coords(
            self.rect,
            self.start_x, self.start_y,
            self.canvas.canvasx(event.x),
            self.canvas.canvasy(event.y)
        )

    def finalize_selection(self, event):
        """Turn the finished selection into a new snip or replace the edited one.

        The selection is clamped to the image, rejected when smaller than
        ``MIN_SNIP_SIZE`` pixels, and the rubber-band rectangle is always
        removed afterwards.  Existing results are cleared because the set of
        columns changes.
        """
        try:
            if self.current_image is None or self.start_x is None:
                return

            img_width, img_height = self.current_image.size
            x1, y1, x2, y2 = self._clamp_box(
                self.start_x, self.start_y,
                self.canvas.canvasx(event.x), self.canvas.canvasy(event.y),
                img_width, img_height,
            )
            if x2 - x1 < self.MIN_SNIP_SIZE or y2 - y1 < self.MIN_SNIP_SIZE:
                messagebox.showerror("Error", "Selection area too small!")
                return

            # Copy so the template does not keep the whole image array alive.
            template = self._to_bgr(self.current_image)[y1:y2, x1:x2].copy()

            editing = (
                self.selected_snip is not None
                and 0 <= self.selected_snip < len(self.templates)
            )
            if editing:
                # Update existing snip
                self.templates[self.selected_snip] = (template, f"Snip {self.selected_snip+1}")
            else:
                # Add new snip
                self.selected_snip = None
                self.templates.append((template, f"Snip {len(self.templates)+1}"))

            self._clear_results()
            self.update_snip_list()
            self.update_table_columns()
            self.redraw_snips()
            self.selected_snip = None  # Reset selection after update

        except Exception as exc:
            messagebox.showerror("Error", f"Failed to create snip: {exc}")
        finally:
            self.canvas.delete("selection")
            self.rect = None
            self.start_x = self.start_y = None

    def update_snip_list(self):
        """Refill the snip list box from ``self.templates``."""
        self.snip_list.delete(0, tk.END)
        for _template, name in self.templates:
            self.snip_list.insert(tk.END, name)

    def update_table_columns(self):
        """Rebuild the table columns and headings to match the current snips.

        The trailing ``FullPath`` column carries the file path for each row and
        is collapsed to zero width.
        """
        columns = self._table_columns()
        self.tree.config(columns=columns)
        for col in columns:
            self.tree.heading(col, text=col)
            self.tree.column(col, width=120, stretch=tk.YES)
        self.tree.column("FullPath", width=0, stretch=tk.NO)

    # ------------------------------------------------------------------
    # Batch OCR
    # ------------------------------------------------------------------
    def _ocr_row(self, img_path, pil_img):
        """Build one result row: file name, OCR text per snip, full path.

        A snip that cannot be matched or read contributes ``"<OCR Error>"``
        instead of aborting the whole file (deliberate best effort).
        """
        gray_img = cv2.cvtColor(self._to_bgr(pil_img), cv2.COLOR_BGR2GRAY)
        row = [os.path.basename(img_path)]
        for template, _name in self.templates:
            try:
                box = self._locate(gray_img, template)
                text = pytesseract.image_to_string(pil_img.crop(box)).strip()
                row.append(text if text else "<No Text Found>")
            except Exception:
                row.append("<OCR Error>")
        row.append(img_path)
        return row

    def process_batch(self):
        """Run template matching + OCR for every loaded file and fill the table.

        Files that cannot be opened or processed are skipped and reported in
        one summary dialog at the end rather than one dialog per file.
        """
        if not self.templates:
            messagebox.showerror("Error", "No snips defined!")
            return

        self._clear_results()
        # Goes through update_table_columns so headings/widths are re-applied.
        self.update_table_columns()

        failures = []
        for img_path in self.image_paths:
            try:
                with self._open_document(img_path) as pil_img:
                    row = self._ocr_row(img_path, pil_img)
            except Exception as exc:
                failures.append(f"Error processing {img_path}: {exc}")
                continue
            item_id = self.tree.insert('', tk.END, values=row)
            self.results[item_id] = row

        self.status.config(text=f"Processed {len(self.image_paths)} images")

        if failures:
            shown = failures[:self.MAX_ERRORS_SHOWN]
            hidden = len(failures) - len(shown)
            if hidden:
                shown.append(f"... and {hidden} more")
            messagebox.showerror("Error", "\n".join(shown))

    # ------------------------------------------------------------------
    # Output
    # ------------------------------------------------------------------
    def copy_to_clipboard(self):
        """Copy the selected rows (tab-separated, one line each) to the clipboard.

        The hidden full-path value is not included.
        """
        selected = self.tree.selection()
        if not selected:
            return
        lines = [
            "\t".join(str(value) for value in self._row_values(item_id)[:-1])
            for item_id in selected
        ]
        clipboard.copy("\n".join(lines))
        self.status.config(text="Copied to clipboard!")

    def export_to_excel(self):
        """Ask for a target file and write all result rows to it with pandas.

        The hidden full-path value is not exported.  Failures are reported in
        a dialog.
        """
        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")]
        )
        if not file_path:
            return

        try:
            columns = self._table_columns()[:-1]
            data = [
                dict(zip(columns, self._row_values(item_id)[:-1]))
                for item_id in self.tree.get_children()
            ]
            # Passing columns keeps the header row even when there is no data.
            pd.DataFrame(data, columns=columns).to_excel(file_path, index=False)
            self.status.config(text=f"Exported to {os.path.basename(file_path)}")
        except Exception as exc:
            messagebox.showerror("Export Error", f"Failed to export: {exc}")

    def clear_all(self):
        """Remove all snips, results and the preview image.

        ``current_image`` is reset together with the canvas so a selection
        cannot be made against an image that is no longer visible.
        """
        self.templates = []
        self.selected_snip = None
        self.current_image = None
        self.tk_image = None
        self._clear_results()
        self.update_table_columns()
        self.snip_list.delete(0, tk.END)
        self.canvas.delete("all")
        self.status.config(text="All data cleared")

if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the OCR
    # application to it, and hand control to the Tk event loop.
    window = tk.Tk()
    application = OCRBatchApp(window)  # keep a reference for the app's lifetime
    window.mainloop()

Limitations

  1. This code imports the images in the selected folder. Only image files (PNG, JPG, JPEG) are supported.
  2. Since Tesseract is an older OCR engine, the OCR is not that great. Sometimes the rupee symbol is mistaken for an X or some other letter.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.