Audit – Vouching Documentation AID _V 2.0

What’s New?

PDF Support – Seamlessly extract data from PDFs, just like images.

Inherent Limitations:

The layout and page size of the items inside the PDF must be consistent.
No caching process, so it’s not memory efficient. The program’s stability depends on your processor and available memory.
The Tesseract library is quite old but still gets the job done—if the PDF quality is good, the output will be too.

Fair Warning:

The output of this code will not be perfect, so it needs to be reviewed. If you want 100% results, try tweaking this code to use Azure Document vision or AWS Textract. These are billed on a pay-as-you-go basis and would cost around 10 USD per 1,000 pages.

Requirements:

pip install Pillow pytesseract pandas clipboard opencv-python numpy pdf2image

The packages listed in the pip command above
Python 3.xx series with installation in PATH
Tesseract OCR and Poppler 24.08.0 installed under the Program Files path, along with the required models
A reasonable computer (needs a 16:9 FHD monitor) – I have not tested it on any other setup, so I cannot comment further on this requirement
Some patience

Link to download the required data is already shared in the previous post

Code

import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from PIL import Image, ImageTk, ImageEnhance
import pytesseract
import os
import pandas as pd
import clipboard
import cv2
import numpy as np
from pdf2image import convert_from_path
import tempfile

# Configure paths - UPDATE THESE TO YOUR INSTALLATION
# poppler_path is handed to pdf2image's convert_from_path() whenever a PDF is
# rendered; tesseract_cmd tells pytesseract which Tesseract executable to use.
# Both values below are Windows install locations and must exist on this machine.
poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class OCRBatchApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Smart OCR Tool")
        self.root.geometry("1400x900")
        self.root.minsize(1200, 800)

        # Application state
        self.templates = []  # (template_img, template_name, original_size)
        self.image_paths = []
        self.current_image = None
        self.current_path = None
        self.selected_snip = None
        self.custom_config = r'--oem 3 --psm 6 -l eng+script/Devanagari'
        self.enhance_factor = 1.5  # Image enhancement factor

        # GUI initialization
        self.create_widgets()
        self.setup_styles()
        self.setup_menus()

    def create_widgets(self):
        """Build every widget of the main window.

        Layout: a horizontal paned window holding a left pane (image canvas
        above the results table) and a right pane (snip list above the file
        list), followed by a row of action buttons and a status bar.

        The widgets other methods need later are stored on ``self``:
        ``canvas``, ``tree``, ``snip_list``, ``file_list`` and ``status``.
        """
        # Main container
        main_pane = ttk.PanedWindow(self.root, orient=tk.HORIZONTAL)
        main_pane.pack(fill=tk.BOTH, expand=True)

        # Left Panel (Image and Results)
        left_pane = ttk.PanedWindow(main_pane, orient=tk.VERTICAL)
        main_pane.add(left_pane, weight=3)

        # Image Canvas with Scrollbars
        img_frame = ttk.Frame(left_pane)
        left_pane.add(img_frame, weight=2)

        self.canvas = tk.Canvas(img_frame, bg='white', cursor="cross")
        h_scroll = ttk.Scrollbar(img_frame, orient=tk.HORIZONTAL, command=self.canvas.xview)
        v_scroll = ttk.Scrollbar(img_frame, orient=tk.VERTICAL, command=self.canvas.yview)
        self.canvas.configure(xscrollcommand=h_scroll.set, yscrollcommand=v_scroll.set)

        # Canvas fills cell (0, 0); only that cell grows when the frame is resized
        self.canvas.grid(row=0, column=0, sticky='nsew')
        v_scroll.grid(row=0, column=1, sticky='ns')
        h_scroll.grid(row=1, column=0, sticky='ew')
        img_frame.grid_rowconfigure(0, weight=1)
        img_frame.grid_columnconfigure(0, weight=1)

        # Results Table
        table_frame = ttk.Frame(left_pane)
        left_pane.add(table_frame, weight=1)

        # Starts with a single 'File' column; update_table_columns() replaces
        # the column set whenever the snips change
        self.tree = ttk.Treeview(table_frame, columns=['File'], show='headings')
        tree_scroll_y = ttk.Scrollbar(table_frame, orient=tk.VERTICAL, command=self.tree.yview)
        tree_scroll_x = ttk.Scrollbar(table_frame, orient=tk.HORIZONTAL, command=self.tree.xview)
        self.tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set)

        self.tree.grid(row=0, column=0, sticky='nsew')
        tree_scroll_y.grid(row=0, column=1, sticky='ns')
        tree_scroll_x.grid(row=1, column=0, sticky='ew')
        table_frame.grid_rowconfigure(0, weight=1)
        table_frame.grid_columnconfigure(0, weight=1)

        # Right Panel (Snips and Files)
        right_pane = ttk.PanedWindow(main_pane, orient=tk.VERTICAL)
        main_pane.add(right_pane, weight=1)

        # Snips Panel
        snip_frame = ttk.Frame(right_pane)
        right_pane.add(snip_frame, weight=1)

        ttk.Label(snip_frame, text="Snips").pack(pady=5)
        self.snip_list = tk.Listbox(snip_frame, selectmode=tk.SINGLE, height=10)
        snip_scroll = ttk.Scrollbar(snip_frame, orient=tk.VERTICAL, command=self.snip_list.yview)
        self.snip_list.configure(yscrollcommand=snip_scroll.set)
        self.snip_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        snip_scroll.pack(side=tk.RIGHT, fill=tk.Y)

        # Files Panel
        file_frame = ttk.Frame(right_pane)
        right_pane.add(file_frame, weight=1)

        ttk.Label(file_frame, text="Files").pack(pady=5)
        self.file_list = tk.Listbox(file_frame, height=15)
        file_scroll = ttk.Scrollbar(file_frame, orient=tk.VERTICAL, command=self.file_list.yview)
        self.file_list.configure(yscrollcommand=file_scroll.set)
        self.file_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        file_scroll.pack(side=tk.RIGHT, fill=tk.Y)

        # Control Panel
        control_frame = ttk.Frame(self.root)
        control_frame.pack(fill=tk.X, padx=10, pady=5)

        # One button per top-level action, laid out left to right
        ttk.Button(control_frame, text="Import Files", command=self.select_files).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="New Snip", command=self.start_new_snip).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Run OCR", command=self.process_batch).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Export Excel", command=self.export_to_excel).pack(side=tk.LEFT, padx=5)
        ttk.Button(control_frame, text="Clear All", command=self.clear_all).pack(side=tk.LEFT)

        # Status Bar
        self.status = ttk.Label(self.root, text="Ready", relief=tk.SUNKEN)
        self.status.pack(side=tk.BOTTOM, fill=tk.X)

    def setup_styles(self):
        """Apply the ttk style tweaks: taller table rows and padded buttons."""
        theme = ttk.Style()
        style_options = (
            ("Treeview", {"rowheight": 25}),
            ("TButton", {"padding": 6}),
        )
        for style_name, options in style_options:
            theme.configure(style_name, **options)

    def setup_menus(self):
        """Create the three right-click menus and bind all widget events."""

        def build_menu(entries):
            # One tear-off-less popup menu with a command per (label, handler) pair
            menu = tk.Menu(self.root, tearoff=0)
            for label, handler in entries:
                menu.add_command(label=label, command=handler)
            return menu

        def popup(menu_attr):
            # Handler that posts the named menu at the pointer's screen position;
            # the menu is looked up on self when the event fires
            return lambda e: getattr(self, menu_attr).post(e.x_root, e.y_root)

        self.snip_menu = build_menu((
            ("Delete", self.delete_selected_snip),
            ("Edit", self.edit_selected_snip),
        ))
        self.file_menu = build_menu((
            ("Delete", self.delete_selected_file),
        ))
        self.tree_menu = build_menu((
            ("Copy", self.copy_to_clipboard),
            ("Export Excel", self.export_to_excel),
        ))

        # Event Bindings: (widget, event sequence, handler)
        bindings = (
            (self.snip_list, "<<ListboxSelect>>", self.on_snip_select),
            (self.snip_list, "<Button-3>", popup("snip_menu")),
            (self.file_list, "<Double-1>", self.load_selected_file),
            (self.file_list, "<Button-3>", popup("file_menu")),
            (self.tree, "<Double-1>", self.on_tree_double_click),
            (self.tree, "<Button-3>", popup("tree_menu")),
            (self.canvas, "<ButtonPress-1>", self.start_selection),
            (self.canvas, "<B1-Motion>", self.update_selection),
            (self.canvas, "<ButtonRelease-1>", self.finalize_selection),
        )
        for widget, sequence, handler in bindings:
            widget.bind(sequence, handler)

    def select_files(self):
        """Ask the user for image/PDF files and add the new ones to the file list.

        Files that are already imported are skipped. The status bar reports
        the number of files that were actually added, not the number that
        was picked in the dialog.
        """
        chosen = filedialog.askopenfilenames(filetypes=[
            ("Documents", "*.png *.jpg *.jpeg *.pdf"),
            ("All files", "*.*")
        ])
        if not chosen:
            return

        added = 0
        for path in chosen:
            if path in self.image_paths:
                continue
            # image_paths and file_list are kept index-aligned
            self.image_paths.append(path)
            self.file_list.insert(tk.END, os.path.basename(path))
            added += 1
        self.status.config(text=f"Loaded {added} files")

    def delete_selected_file(self):
        selection = self.file_list.curselection()
        if selection:
            index = selection[0]
            del self.image_paths[index]
            self.file_list.delete(index)
            self.status.config(text=f"Deleted file {index+1}")

    def load_selected_file(self, event):
        selection = self.file_list.curselection()
        if selection:
            path = self.image_paths[selection[0]]
            self.load_image(path)
            self.redraw_snips()

    def load_image(self, path):
        """Load an image, or the first page of a PDF, and show it on the canvas.

        The image is converted to RGB and sharpened by ``self.enhance_factor``
        before display. ``current_path`` and ``current_image`` are only
        updated once loading has succeeded, so a failed load leaves the
        previously displayed file as the current one. Any failure is reported
        in an error dialog instead of being raised.

        Args:
            path: Path of the image or PDF file to display.
        """
        try:
            if path.lower().endswith('.pdf'):
                # Only page 1 is rendered for the preview
                pages = convert_from_path(
                    path,
                    poppler_path=poppler_path,
                    dpi=300,
                    first_page=1,
                    last_page=1
                )
                if not pages:
                    raise ValueError("PDF contains no pages")
                image = pages[0].convert('RGB')
            else:
                # convert() reads the pixel data, so the file can be closed right away
                with Image.open(path) as source:
                    image = source.convert('RGB')

            # Enhance image quality
            image = ImageEnhance.Sharpness(image).enhance(self.enhance_factor)
            tk_image = ImageTk.PhotoImage(image)

            # Everything loaded: commit the new state and refresh the canvas
            self.current_path = path
            self.current_image = image
            # Keep a reference on self so the PhotoImage is not garbage-collected
            self.tk_image = tk_image
            self.canvas.config(scrollregion=(0, 0, *image.size))
            self.canvas.delete("all")
            self.canvas.create_image(0, 0, anchor=tk.NW, image=self.tk_image)
            self.redraw_snips()
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load file: {str(e)}")

    def process_batch(self):
        """Run template matching and OCR for every snip on every page of every file.

        The results table is cleared and refilled with one row per page:
        the file name with page number, one cell per snip, and the full path
        in the last (hidden) column. A file that cannot be processed is
        reported in an error dialog and the batch continues with the next one.
        """
        if not self.templates:
            messagebox.showerror("Error", "No snips defined!")
            return

        self.tree.delete(*self.tree.get_children())
        # Go through the shared helper so headings and the hidden FullPath
        # column are configured the same way as when snips are edited,
        # rather than only re-assigning the column list.
        self.update_table_columns()

        for path in self.image_paths:
            try:
                for page_num, pil_img in enumerate(self._load_pages(path)):
                    pil_img = pil_img.convert('RGB')
                    gray_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)

                    row = [f"{os.path.basename(path)} (pg {page_num+1})"]
                    row.extend(
                        self._ocr_snip(pil_img, gray_img, template, original_size)
                        for template, _name, original_size in self.templates
                    )
                    row.append(path)
                    self.tree.insert('', tk.END, values=row)

            except Exception as e:
                messagebox.showerror("Error", f"Error processing {path}: {str(e)}")

        self.status.config(text=f"Processed {len(self.image_paths)} files")

    def _load_pages(self, path):
        """Return the pages of *path* as a list of PIL images (one entry for a plain image)."""
        if path.lower().endswith('.pdf'):
            return convert_from_path(
                path,
                poppler_path=poppler_path,
                dpi=300
            )
        # convert() reads the pixel data, so the file handle can be released here
        with Image.open(path) as source:
            return [source.convert('RGB')]

    def _ocr_snip(self, pil_img, gray_img, template, original_size):
        """Locate one snip template on a page and return the OCR text of that region.

        Args:
            pil_img: The page as an RGB PIL image (used for cropping).
            gray_img: The same page as a grayscale array (used for matching).
            template: BGR template array captured when the snip was drawn.
            original_size: (width, height) of the image the snip was drawn on.

        Returns:
            The cleaned text, or one of the markers "<Low Confidence>",
            "<No Text>" or "<Error>".
        """
        try:
            # Scale the template by the width ratio between this page and the
            # image the snip was originally drawn on
            scale_factor = pil_img.size[0] / original_size[0]
            scaled_template = cv2.resize(
                template,
                None,
                fx=scale_factor,
                fy=scale_factor,
                interpolation=cv2.INTER_CUBIC
            )

            res = cv2.matchTemplate(
                gray_img,
                cv2.cvtColor(scaled_template, cv2.COLOR_BGR2GRAY),
                cv2.TM_CCOEFF_NORMED
            )
            _, conf, _, (x, y) = cv2.minMaxLoc(res)
            if conf < 0.5:
                return "<Low Confidence>"

            h, w = scaled_template.shape[:2]
            cropped = pil_img.crop((x, y, x + w, y + h))
            text = pytesseract.image_to_string(
                self.preprocess_image(cropped),
                config=self.custom_config
            ).strip()
            return self.clean_text(text) or "<No Text>"
        except Exception:
            # Best effort: a snip that fails must not abort the rest of the page
            return "<Error>"

    def preprocess_image(self, image):
        """Prepare a cropped region for OCR and return it as a binarised image.

        Steps: grayscale conversion, CLAHE contrast enhancement, non-local
        means denoising and Otsu thresholding.

        Args:
            image: PIL image of the region. RGB, RGBA and single-channel
                images are accepted.

        Returns:
            A PIL image built from the thresholded array.
        """
        img = np.array(image)
        if img.ndim == 2:
            # Already single-channel
            gray = img
        elif img.shape[2] == 4:
            gray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
        else:
            # Arrays made from PIL images are in RGB order; BGR2GRAY would
            # apply the red and blue luminance weights the wrong way round
            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        # Enhance contrast using CLAHE
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Denoise and threshold
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        _, threshold = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        return Image.fromarray(threshold)

    def clean_text(self, text):
        replacements = {
            '‘': "'", '’': "'", '“': '"', '”': '"', '—': '-',
            'Rs': '₹', 'rupees': '₹', 'rupee': '₹', '\\$': '$',
            '\n': ' ', '\t': ' ', '  ': ' '
        }
        for k, v in replacements.items():
            text = text.replace(k, v)
        return text.strip()

    def start_new_snip(self):
        """Leave edit mode so that the next drag on the canvas creates a new snip."""
        # Drop any highlighted entry in the snip list, then forget the edit target
        self.snip_list.selection_clear(0, tk.END)
        self.selected_snip = None
        self.status.config(text="Ready to create new snip region")

    def redraw_snips(self):
        """Re-locate every snip on the current image and outline it on the canvas.

        Each template is scaled by the width ratio between the current image
        and the image it was captured from, matched against the current
        image, and outlined at the best match position. The snip selected
        for editing is drawn in red, all others in blue. Nothing is drawn
        when no image is loaded.
        """
        self.canvas.delete("snip")
        if self.current_image is None:
            return

        # The page is the same for every snip, so convert it to grayscale once
        gray_image = cv2.cvtColor(np.array(self.current_image), cv2.COLOR_RGB2GRAY)
        page_width = self.current_image.size[0]

        for idx, (template, _name, original_size) in enumerate(self.templates):
            try:
                scale_factor = page_width / original_size[0]
                scaled_template = cv2.resize(
                    template,
                    None,
                    fx=scale_factor,
                    fy=scale_factor,
                    interpolation=cv2.INTER_CUBIC
                )

                gray_template = cv2.cvtColor(scaled_template, cv2.COLOR_BGR2GRAY)
                res = cv2.matchTemplate(gray_image, gray_template, cv2.TM_CCOEFF_NORMED)
                _, _, _, (x, y) = cv2.minMaxLoc(res)

                h, w = scaled_template.shape[:2]
                self.canvas.create_rectangle(
                    x, y, x + w, y + h,
                    outline='red' if idx == self.selected_snip else 'blue',
                    width=2, tags="snip"
                )
            except Exception:
                # Best effort: a snip that cannot be matched on this image is
                # skipped so the remaining snips are still drawn
                continue

    def on_snip_select(self, event):
        selection = self.snip_list.curselection()
        if selection:
            self.selected_snip = selection[0]
            self.redraw_snips()
            self.status.config(text=f"Selected snip {self.selected_snip+1} for editing")

    def edit_selected_snip(self):
        if self.selected_snip is not None:
            self.status.config(text="Drag to edit selected snip region")
        else:
            messagebox.showinfo("Info", "Please select a snip to edit")

    def start_selection(self, event):
        self.start_x = self.canvas.canvasx(event.x)
        self.start_y = self.canvas.canvasy(event.y)
        self.rect = self.canvas.create_rectangle(
            self.start_x, self.start_y, self.start_x, self.start_y,
            outline='green', width=2, tags="selection"
        )

    def update_selection(self, event):
        self.canvas.coords(
            self.rect,
            self.start_x, self.start_y,
            self.canvas.canvasx(event.x),
            self.canvas.canvasy(event.y)
        )

    def finalize_selection(self, event):
        """Turn the rectangle dragged on the canvas into a snip template.

        The rectangle is clamped to the current image and must be at least
        5 px in both directions. When a snip is selected for editing, its
        template is replaced and its name is kept; otherwise a new snip is
        appended under a name that is not already in use. The snip list,
        table columns and overlays are refreshed afterwards and edit mode is
        left. The temporary selection rectangle is removed on every exit path.

        Args:
            event: Button-release event carrying the final mouse position.
        """
        try:
            if self.current_image is None:
                messagebox.showerror("Error", "No image loaded!")
                return

            end_x = self.canvas.canvasx(event.x)
            end_y = self.canvas.canvasy(event.y)
            img_w, img_h = self.current_image.size

            # Clamp to the image: dragging past an edge would otherwise yield
            # negative indices (which wrap around when slicing) or an empty crop
            x1 = max(0, int(min(self.start_x, end_x)))
            y1 = max(0, int(min(self.start_y, end_y)))
            x2 = min(img_w, int(max(self.start_x, end_x)))
            y2 = min(img_h, int(max(self.start_y, end_y)))

            if x2 - x1 < 5 or y2 - y1 < 5:
                messagebox.showerror("Error", "Selection too small!")
                return

            try:
                cv_image = cv2.cvtColor(np.array(self.current_image), cv2.COLOR_RGB2BGR)
                # copy() so the stored template does not keep the whole page array alive
                template = cv_image[y1:y2, x1:x2].copy()
                original_size = self.current_image.size

                if self.selected_snip is not None:
                    # Update existing snip, keeping its name so table columns stay stable
                    _, name, _ = self.templates[self.selected_snip]
                    self.templates[self.selected_snip] = (template, name, original_size)
                    self.status.config(text=f"Updated snip {self.selected_snip+1}")
                else:
                    # Add new snip; skip numbers still in use after earlier deletions
                    used_names = {existing for _, existing, _ in self.templates}
                    number = len(self.templates) + 1
                    while f"Snip {number}" in used_names:
                        number += 1
                    self.templates.append((template, f"Snip {number}", original_size))
                    self.status.config(text="Created new snip")

                self.update_snip_list()
                self.update_table_columns()
                # Redraw before leaving edit mode so the edited snip is still highlighted
                self.redraw_snips()
                self.selected_snip = None
            except Exception as e:
                messagebox.showerror("Error", f"Snip creation failed: {str(e)}")
        finally:
            self.canvas.delete("selection")

    def on_tree_double_click(self, event):
        selected = self.tree.selection()
        if selected:
            item = self.tree.item(selected[0])
            path = item['values'][-1]
            if path != self.current_path:
                self.load_image(path)
            self.redraw_snips()

    def update_snip_list(self):
        """Rebuild the snip list widget from the names stored in self.templates."""
        self.snip_list.delete(0, tk.END)
        snip_names = [entry[1] for entry in self.templates]
        if snip_names:
            # A single insert call appends all names in order
            self.snip_list.insert(tk.END, *snip_names)

    def update_table_columns(self):
        """Set the results table columns to 'File', one per snip, and 'FullPath'.

        Every column gets a heading equal to its name and a stretchable
        width of 120; the trailing 'FullPath' column is then set to zero
        width and made non-stretchable.
        """
        snip_names = [entry[1] for entry in self.templates]
        column_ids = ['File', *snip_names, 'FullPath']
        self.tree.config(columns=column_ids)

        for column_id in column_ids:
            self.tree.heading(column_id, text=column_id)
            self.tree.column(column_id, width=120, stretch=tk.YES)

        # The path column only carries data for on_tree_double_click
        self.tree.column("FullPath", width=0, stretch=tk.NO)

    def delete_selected_snip(self):
        selection = self.snip_list.curselection()
        if selection:
            index = selection[0]
            del self.templates[index]
            self.update_snip_list()
            self.update_table_columns()
            self.redraw_snips()
            self.status.config(text=f"Deleted snip {index+1}")

    def copy_to_clipboard(self):
        selected = self.tree.selection()
        if selected:
            item = self.tree.item(selected[0])
            clipboard.copy("\t".join(str(v) for v in item['values'][:-1]))
            self.status.config(text="Copied to clipboard!")

    def export_to_excel(self):
        """Ask for a target file and write the results table to it as an Excel sheet.

        The sheet has a 'File' column followed by one column per snip; the
        trailing path value of each row is not exported. Rows are passed to
        pandas positionally, so snips that share a name keep separate
        columns and an empty table still produces the header row. Failures
        are reported in an error dialog.
        """
        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")]
        )
        if not file_path:
            return

        try:
            columns = ['File'] + [name for _, name, _ in self.templates]
            width = len(columns)
            rows = []
            for item in self.tree.get_children():
                cells = list(self.tree.item(item)['values'])[:-1]
                # Rows written before the snip set changed may be shorter or
                # longer than the current columns: pad or trim to fit
                rows.append((cells + [""] * width)[:width])

            pd.DataFrame(rows, columns=columns).to_excel(file_path, index=False)
            self.status.config(text=f"Exported to {os.path.basename(file_path)}")
        except Exception as e:
            messagebox.showerror("Export Error", f"Failed to export: {str(e)}")

    def clear_all(self):
        """Reset the application: drop all snips, files, results and the displayed image."""
        self.templates = []
        self.image_paths = []
        self.current_image = None
        # Forget the displayed file too: on_tree_double_click skips reloading
        # when a row's path equals current_path, which would leave the canvas
        # empty if the same file is imported again after a clear.
        self.current_path = None
        self.selected_snip = None
        self.tree.delete(*self.tree.get_children())
        self.snip_list.delete(0, tk.END)
        self.file_list.delete(0, tk.END)
        self.canvas.delete("all")
        # Release the image that was held only for display on the canvas
        self.tk_image = None
        self.status.config(text="All data cleared")

# Script entry point: create the Tk root window, attach the application to it
# and start the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    app = OCRBatchApp(root)
    root.mainloop()

Audit – Vouching Documentation AID _V 2.0

Fair Warning:

Requirements:

Link to download the required data is already shared in the previous post

Code

Leave a Reply Cancel reply