Problem:
- there were many big files around 200-300 mb with around 7 Lakh line items which needed to be pivotted and v looked up
- Excel crashed
- Power query took 30 minutes to load and the moment i put another filter, start again- takes another 30 minutes
Solution:
- Data Bases are faster and more optimised for large data
- Duck DB is really good for loading big files in a faster manner
Code
import sys
import pandas as pd
import duckdb
from pathlib import Path
from datetime import datetime
from PyQt5.QtWidgets import (
QApplication, QMainWindow, QTabWidget, QWidget, QVBoxLayout,
QHBoxLayout, QPushButton, QFileDialog, QTableWidget, QTableWidgetItem,
QComboBox, QLabel, QListWidget, QAbstractItemView, QMessageBox,
QGroupBox, QScrollArea, QLineEdit, QDialog, QDialogButtonBox, QInputDialog,
QCheckBox, QSpinBox, QTextEdit
)
from PyQt5.QtCore import Qt
# --- Step Logger Class ---
class StepLogger:
"""Records data transformation steps in a human-readable format."""
def __init__(self):
self.steps = []
def log(self, message: str):
"""Adds a timestamped step to the log."""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_entry = f"[{timestamp}] {message}"
self.steps.append(log_entry)
def get_steps_report(self) -> str:
"""Returns the full log as a single string."""
if not self.steps:
return "No transformation steps were recorded."
return "\n".join(self.steps)
def clear(self):
"""Clears all recorded steps."""
self.steps = []
# --- Helper function to display DataFrame in QTableWidget ---
def display_df_in_table(df: pd.DataFrame, table_widget: QTableWidget, limit_rows: bool, max_rows: int):
"""Populates a QTableWidget with data from a pandas DataFrame."""
table_widget.clear()
if df is None or df.empty:
table_widget.setRowCount(0)
table_widget.setColumnCount(0)
return
# Limit rows for UI performance if enabled
display_df = df.head(max_rows) if limit_rows else df
table_widget.setRowCount(display_df.shape[0])
table_widget.setColumnCount(display_df.shape[1])
table_widget.setHorizontalHeaderLabels(display_df.columns)
for row_idx, row in enumerate(display_df.itertuples(index=False)):
for col_idx, cell_data in enumerate(row):
item = QTableWidgetItem(str(cell_data))
item.setFlags(item.flags() & ~Qt.ItemIsEditable) # Read-only
table_widget.setItem(row_idx, col_idx, item)
table_widget.resizeColumnsToContents()
class ExcelSheetDialog(QDialog):
"""A dialog to select a sheet from an Excel file."""
def __init__(self, sheet_names, parent=None):
super().__init__(parent)
self.setWindowTitle("Select Excel Sheet")
layout = QVBoxLayout(self)
self.sheet_combo = QComboBox()
self.sheet_combo.addItems(sheet_names)
layout.addWidget(QLabel("Please select the sheet to load:"))
layout.addWidget(self.sheet_combo)
buttons = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
buttons.accepted.connect(self.accept)
buttons.rejected.connect(self.reject)
layout.addWidget(buttons)
def selected_sheet(self):
return self.sheet_combo.currentText()
class ExportDialog(QDialog):
"""A custom dialog for exporting data with an option to include steps."""
def __init__(self, parent=None):
super().__init__(parent)
self.setWindowTitle("Export Data")
self.layout = QVBoxLayout(self)
self.path_edit = QLineEdit()
self.browse_btn = QPushButton("Browse...")
self.browse_btn.clicked.connect(self.browse)
path_layout = QHBoxLayout()
path_layout.addWidget(QLabel("Save to:"))
path_layout.addWidget(self.path_edit)
path_layout.addWidget(self.browse_btn)
self.layout.addLayout(path_layout)
self.include_steps_checkbox = QCheckBox("Include transformation steps in a separate .txt file")
self.include_steps_checkbox.setChecked(True)
self.layout.addWidget(self.include_steps_checkbox)
buttons = QDialogButtonBox(QDialogButtonBox.Save | QDialogButtonBox.Cancel)
buttons.accepted.connect(self.accept)
buttons.rejected.connect(self.reject)
self.layout.addWidget(buttons)
def browse(self):
filepath, _ = QFileDialog.getSaveFileName(self, "Save File", "", "CSV Files (*.csv);;Excel Files (*.xlsx)")
if filepath:
self.path_edit.setText(filepath)
def get_export_options(self):
return {
"filepath": self.path_edit.text(),
"include_steps": self.include_steps_checkbox.isChecked()
}
class DataMasterApp(QMainWindow):
def __init__(self):
super().__init__()
self.setWindowTitle("High-Performance Data Analyzer (DuckDB Edition)")
self.setGeometry(100, 100, 1400, 900)
self.logger = StepLogger()
self.con = duckdb.connect(database=':memory:', read_only=False)
self.loaded_tables = {}
self.filtered_df, self.pivot_df, self.merge_df, self.append_df = None, None, None, None
central_widget = QWidget()
self.setCentralWidget(central_widget)
main_layout = QVBoxLayout(central_widget)
self.tabs = QTabWidget()
main_layout.addWidget(self.tabs)
self.load_tab, self.filter_tab, self.pivot_tab, self.merge_tab, self.append_tab, self.history_tab = QWidget(), QWidget(), QWidget(), QWidget(), QWidget(), QWidget()
self.tabs.addTab(self.load_tab, "1. Load & Manage Data")
self.tabs.addTab(self.filter_tab, "2. Filter Data")
self.tabs.addTab(self.pivot_tab, "3. Pivot Data")
self.tabs.addTab(self.merge_tab, "4. Merge Queries (Join)")
self.tabs.addTab(self.append_tab, "5. Append Queries (Union)")
self.tabs.addTab(self.history_tab, "Transformation History")
self.init_load_tab()
self.init_filter_tab()
self.init_pivot_tab()
self.init_merge_tab()
self.init_append_tab()
self.init_history_tab()
self.init_display_settings(main_layout)
self.log_step("Application started.")
def init_display_settings(self, main_layout):
display_group = QGroupBox("Display Settings")
display_layout = QHBoxLayout(display_group)
self.limit_rows_checkbox = QCheckBox("Limit displayed rows")
self.limit_rows_checkbox.setChecked(True)
self.limit_rows_spinbox = QSpinBox()
self.limit_rows_spinbox.setRange(1, 100000)
self.limit_rows_spinbox.setValue(1000)
display_layout.addWidget(self.limit_rows_checkbox)
display_layout.addWidget(self.limit_rows_spinbox)
display_layout.addStretch()
main_layout.addWidget(display_group)
def init_history_tab(self):
layout = QVBoxLayout(self.history_tab)
layout.addWidget(QLabel("Live log of all data transformation steps:"))
self.history_text_edit = QTextEdit()
self.history_text_edit.setReadOnly(True)
self.history_text_edit.setFontFamily("Courier")
layout.addWidget(self.history_text_edit)
def init_load_tab(self):
layout = QHBoxLayout(self.load_tab)
left_panel, right_panel = QWidget(), QWidget()
left_layout, right_layout = QVBoxLayout(left_panel), QVBoxLayout(right_panel)
left_layout.setAlignment(Qt.AlignTop)
self.load_file_btn = QPushButton("Load CSV / Excel File")
self.load_file_btn.clicked.connect(self.load_file)
left_layout.addWidget(self.load_file_btn)
left_layout.addWidget(QLabel("Loaded Tables (Double-click to rename):"))
self.loaded_tables_list = QListWidget()
self.loaded_tables_list.itemSelectionChanged.connect(self.on_table_select)
self.loaded_tables_list.itemDoubleClicked.connect(self.rename_table_dialog)
left_layout.addWidget(self.loaded_tables_list)
self.delete_table_btn = QPushButton("Delete Selected Table")
self.delete_table_btn.clicked.connect(self.delete_table)
left_layout.addWidget(self.delete_table_btn)
type_group = QGroupBox("Change Column Data Types (Select rows to apply)")
type_layout = QVBoxLayout(type_group)
self.type_table = QTableWidget()
self.type_table.setColumnCount(3)
self.type_table.setHorizontalHeaderLabels(["Column", "Current Type", "New Type"])
self.type_table.setSelectionBehavior(QAbstractItemView.SelectRows)
type_layout.addWidget(self.type_table)
self.apply_types_btn = QPushButton("Apply Type Changes to Selected")
self.apply_types_btn.clicked.connect(self.apply_type_changes)
type_layout.addWidget(self.apply_types_btn)
left_layout.addWidget(type_group)
right_layout.addWidget(QLabel("Data Preview:"))
self.preview_table = QTableWidget()
right_layout.addWidget(self.preview_table)
layout.addWidget(left_panel, 1)
layout.addWidget(right_panel, 3)
def init_filter_tab(self):
layout = QVBoxLayout(self.filter_tab)
top_controls = QHBoxLayout()
top_controls.addWidget(QLabel("Select Table to Filter:"))
self.filter_table_combo = QComboBox()
top_controls.addWidget(self.filter_table_combo, 1)
self.add_filter_btn = QPushButton("Add Filter Condition")
self.add_filter_btn.clicked.connect(self.add_filter_condition)
top_controls.addWidget(self.add_filter_btn)
self.apply_filters_btn = QPushButton("Apply All Filters")
self.apply_filters_btn.clicked.connect(self.apply_filters)
top_controls.addWidget(self.apply_filters_btn)
self.export_filtered_btn = QPushButton("Export Result")
self.export_filtered_btn.clicked.connect(lambda: self.export_data(self.filtered_df))
top_controls.addWidget(self.export_filtered_btn)
layout.addLayout(top_controls)
self.filter_scroll_area = QScrollArea()
self.filter_scroll_area.setWidgetResizable(True)
self.filter_conditions_widget = QWidget()
self.filter_conditions_layout = QVBoxLayout(self.filter_conditions_widget)
self.filter_conditions_layout.setAlignment(Qt.AlignTop)
self.filter_scroll_area.setWidget(self.filter_conditions_widget)
layout.addWidget(self.filter_scroll_area, 1)
layout.addWidget(QLabel("Filtered Result:"), 0)
self.filter_result_table = QTableWidget()
layout.addWidget(self.filter_result_table, 3)
self.filter_conditions = []
def init_pivot_tab(self):
layout = QHBoxLayout(self.pivot_tab)
left_panel, right_panel = QVBoxLayout(), QVBoxLayout()
left_panel.setAlignment(Qt.AlignTop)
left_panel.addWidget(QLabel("Select Table to Pivot:"))
self.pivot_table_combo = QComboBox()
self.pivot_table_combo.currentTextChanged.connect(self.populate_pivot_columns)
left_panel.addWidget(self.pivot_table_combo)
left_panel.addWidget(QLabel("Group By (Rows - Ctrl+Click for multiple):"))
self.pivot_index_list = QListWidget()
self.pivot_index_list.setSelectionMode(QAbstractItemView.MultiSelection)
left_panel.addWidget(self.pivot_index_list)
left_panel.addWidget(QLabel("Values (for Aggregation):"))
self.pivot_values_list = QListWidget()
self.pivot_values_list.setSelectionMode(QAbstractItemView.MultiSelection)
left_panel.addWidget(self.pivot_values_list)
left_panel.addWidget(QLabel("Aggregation Function:"))
self.pivot_agg_combo = QComboBox()
self.pivot_agg_combo.addItems(['SUM', 'AVG', 'COUNT', 'MIN', 'MAX'])
left_panel.addWidget(self.pivot_agg_combo)
self.generate_pivot_btn = QPushButton("Generate Pivot Table")
self.generate_pivot_btn.clicked.connect(self.generate_pivot)
left_panel.addWidget(self.generate_pivot_btn)
self.export_pivot_btn = QPushButton("Export Pivot Result")
self.export_pivot_btn.clicked.connect(lambda: self.export_data(self.pivot_df))
left_panel.addWidget(self.export_pivot_btn)
right_panel.addWidget(QLabel("Pivot Result:"))
self.pivot_result_table = QTableWidget()
right_panel.addWidget(self.pivot_result_table)
layout.addLayout(left_panel, 1)
layout.addLayout(right_panel, 3)
def init_merge_tab(self):
layout = QVBoxLayout(self.merge_tab)
top_controls = QHBoxLayout()
join_group = QGroupBox("Join Configuration")
join_group_layout = QHBoxLayout(join_group)
join_group_layout.addWidget(QLabel("Join Type:"))
self.merge_type_combo = QComboBox()
self.merge_type_combo.addItems(['INNER', 'LEFT', 'RIGHT', 'FULL', 'LEFT ANTI', 'RIGHT ANTI'])
join_group_layout.addWidget(self.merge_type_combo)
self.perform_merge_btn = QPushButton("Perform Merge")
self.perform_merge_btn.clicked.connect(self.perform_merge)
join_group_layout.addWidget(self.perform_merge_btn)
self.export_merge_btn = QPushButton("Export Merge Result")
self.export_merge_btn.clicked.connect(lambda: self.export_data(self.merge_df))
join_group_layout.addWidget(self.export_merge_btn)
top_controls.addWidget(join_group)
layout.addLayout(top_controls)
key_selection_layout = QHBoxLayout()
left_group, right_group = QGroupBox("Left Table"), QGroupBox("Right Table")
left_group_layout, right_group_layout = QVBoxLayout(left_group), QVBoxLayout(right_group)
left_group_layout.addWidget(QLabel("Select Left Table:"))
self.merge_left_table_combo = QComboBox()
self.merge_left_table_combo.currentTextChanged.connect(lambda: self.populate_merge_keys('left'))
left_group_layout.addWidget(self.merge_left_table_combo)
left_group_layout.addWidget(QLabel("Left Join Keys (Ctrl+Click for multiple):"))
self.merge_left_key_list = QListWidget()
self.merge_left_key_list.setSelectionMode(QAbstractItemView.MultiSelection)
left_group_layout.addWidget(self.merge_left_key_list)
right_group_layout.addWidget(QLabel("Select Right Table:"))
self.merge_right_table_combo = QComboBox()
self.merge_right_table_combo.currentTextChanged.connect(lambda: self.populate_merge_keys('right'))
right_group_layout.addWidget(self.merge_right_table_combo)
right_group_layout.addWidget(QLabel("Right Join Keys (Ctrl+Click for multiple):"))
self.merge_right_key_list = QListWidget()
self.merge_right_key_list.setSelectionMode(QAbstractItemView.MultiSelection)
right_group_layout.addWidget(self.merge_right_key_list)
key_selection_layout.addWidget(left_group)
key_selection_layout.addWidget(right_group)
layout.addLayout(key_selection_layout)
layout.addWidget(QLabel("Merge Result:"))
self.merge_result_table = QTableWidget()
layout.addWidget(self.merge_result_table)
def init_append_tab(self):
layout = QHBoxLayout(self.append_tab)
left_panel, right_panel = QVBoxLayout(), QVBoxLayout()
left_panel.setAlignment(Qt.AlignTop)
left_panel.addWidget(QLabel("Select tables to append (must have same columns):"))
self.append_tables_list = QListWidget()
self.append_tables_list.setSelectionMode(QAbstractItemView.MultiSelection)
left_panel.addWidget(self.append_tables_list)
self.perform_append_btn = QPushButton("Append Selected Tables")
self.perform_append_btn.clicked.connect(self.perform_append)
left_panel.addWidget(self.perform_append_btn)
self.export_append_btn = QPushButton("Export Append Result")
self.export_append_btn.clicked.connect(lambda: self.export_data(self.append_df))
left_panel.addWidget(self.export_append_btn)
right_panel.addWidget(QLabel("Append Result:"))
self.append_result_table = QTableWidget()
right_panel.addWidget(self.append_result_table)
layout.addLayout(left_panel, 1)
layout.addLayout(right_panel, 3)
def log_step(self, message):
self.logger.log(message)
self.history_text_edit.append(self.logger.steps[-1])
def load_file(self):
filepath, _ = QFileDialog.getOpenFileName(self, "Open Data File", "", "Data Files (*.csv *.xlsx)")
if not filepath:
return
try:
base_name = "".join(filter(str.isalnum, Path(filepath).stem))
table_name = base_name
count = 1
while table_name in self.loaded_tables:
table_name = f"{base_name}_{count}"
count += 1
if filepath.endswith('.csv'):
self.con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM read_csv_auto('{filepath}', header=true)")
elif filepath.endswith('.xlsx'):
xls = pd.ExcelFile(filepath)
sheet_name = xls.sheet_names[0]
if len(xls.sheet_names) > 1:
dialog = ExcelSheetDialog(xls.sheet_names, self)
if dialog.exec_() == QDialog.Accepted:
sheet_name = dialog.selected_sheet()
else:
return
df = pd.read_excel(filepath, sheet_name=sheet_name)
self.con.register(table_name, df)
self.loaded_tables[table_name] = self.con.execute(f"SELECT * FROM {table_name} LIMIT 10").fetchdf()
self.update_all_table_widgets()
self.loaded_tables_list.setCurrentRow(self.loaded_tables_list.count() - 1)
self.log_step(f"Loaded file '{Path(filepath).name}' into table '{table_name}'.")
QMessageBox.information(self, "Success", f"Loaded '{filepath}' as table '{table_name}'.")
except Exception as e:
QMessageBox.critical(self, "Error", f"Failed to load file: {e}")
def on_table_select(self):
selected_items = self.loaded_tables_list.selectedItems()
if not selected_items:
self.preview_table.clear()
self.type_table.setRowCount(0)
return
table_name = selected_items[0].text()
df_schema = self.con.execute(f"DESCRIBE {table_name}").fetchdf()
preview_df = self.con.execute(f"SELECT * FROM {table_name}").fetchdf()
display_df_in_table(preview_df, self.preview_table, self.limit_rows_checkbox.isChecked(), self.limit_rows_spinbox.value())
self.type_table.setRowCount(0)
self.type_table.setRowCount(len(df_schema))
valid_types = ['VARCHAR', 'INTEGER', 'BIGINT', 'FLOAT', 'DOUBLE', 'DATE', 'TIMESTAMP', 'BOOLEAN']
for row, record in enumerate(df_schema.itertuples()):
self.type_table.setItem(row, 0, QTableWidgetItem(record.column_name))
self.type_table.setItem(row, 1, QTableWidgetItem(record.column_type))
combo = QComboBox()
combo.addItems(valid_types)
self.type_table.setCellWidget(row, 2, combo)
self.type_table.resizeColumnsToContents()
def apply_type_changes(self):
selected_table_items = self.loaded_tables_list.selectedItems()
if not selected_table_items:
QMessageBox.warning(self, "Warning", "Please select a table first.")
return
table_name = selected_table_items[0].text()
selected_rows = self.type_table.selectionModel().selectedRows()
if not selected_rows:
QMessageBox.warning(self, "Warning", "Please select one or more columns in the type table to apply changes.")
return
try:
log_messages = []
for index in selected_rows:
row = index.row()
col_name = self.type_table.item(row, 0).text()
new_type = self.type_table.cellWidget(row, 2).currentText()
self.con.execute(f'ALTER TABLE "{table_name}" ALTER "{col_name}" TYPE {new_type}')
log_messages.append(f"Changed type of column '{col_name}' to {new_type}")
self.log_step(f"Applied type changes to table '{table_name}': {'; '.join(log_messages)}.")
QMessageBox.information(self, "Success", "Data types for selected columns updated successfully.")
self.on_table_select()
except Exception as e:
QMessageBox.critical(self, "Error", f"Could not apply type changes: {e}")
def rename_table_dialog(self, item):
old_name = item.text()
new_name, ok = QInputDialog.getText(self, "Rename Table", "Enter new table name:", text=old_name)
if ok and new_name and new_name != old_name:
if new_name in self.loaded_tables:
QMessageBox.critical(self, "Error", f"Table name '{new_name}' already exists.")
return
try:
self.con.execute(f'ALTER TABLE "{old_name}" RENAME TO "{new_name}"')
self.loaded_tables[new_name] = self.loaded_tables.pop(old_name)
self.update_all_table_widgets()
self.log_step(f"Renamed table '{old_name}' to '{new_name}'.")
QMessageBox.information(self, "Success", f"Table '{old_name}' renamed to '{new_name}'.")
except Exception as e:
QMessageBox.critical(self, "Error", f"Could not rename table: {e}")
def delete_table(self):
selected_items = self.loaded_tables_list.selectedItems()
if not selected_items:
QMessageBox.warning(self, "Warning", "Please select a table to delete.")
return
table_name = selected_items[0].text()
reply = QMessageBox.question(self, 'Confirm Delete', f"Are you sure you want to permanently delete the table '{table_name}'?",
QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
if reply == QMessageBox.Yes:
try:
self.con.execute(f'DROP TABLE "{table_name}"')
del self.loaded_tables[table_name]
self.update_all_table_widgets()
self.log_step(f"Deleted table '{table_name}'.")
QMessageBox.information(self, "Success", f"Table '{table_name}' has been deleted.")
except Exception as e:
QMessageBox.critical(self, "Error", f"Could not delete table: {e}")
def add_filter_condition(self):
table_name = self.filter_table_combo.currentText()
if not table_name:
QMessageBox.warning(self, "Warning", "Please select a table to filter first.")
return
condition_widget = QWidget()
layout = QHBoxLayout(condition_widget)
columns = self.con.execute(f"DESCRIBE {table_name}").fetchdf()['column_name'].tolist()
col_combo = QComboBox()
col_combo.addItems(columns)
op_combo = QComboBox()
op_combo.addItems(['=', '!=', '>', '<', '>=', '<=', 'LIKE', 'NOT LIKE', 'IS NULL', 'IS NOT NULL'])
val_edit = QLineEdit()
remove_btn = QPushButton("X")
remove_btn.setFixedWidth(30)
layout.addWidget(col_combo)
layout.addWidget(op_combo)
layout.addWidget(val_edit)
layout.addWidget(remove_btn)
self.filter_conditions_layout.addWidget(condition_widget)
condition_ref = {'widget': condition_widget, 'col': col_combo, 'op': op_combo, 'val': val_edit}
self.filter_conditions.append(condition_ref)
remove_btn.clicked.connect(lambda: self.remove_filter_condition(condition_ref))
def remove_filter_condition(self, condition_ref):
condition_ref['widget'].deleteLater()
self.filter_conditions.remove(condition_ref)
def apply_filters(self):
table_name = self.filter_table_combo.currentText()
if not table_name: return
where_clauses = []
log_clauses = []
for cond in self.filter_conditions:
col = f'"{cond["col"].currentText()}"'
op = cond["op"].currentText()
val = cond["val"].text()
if op in ['IS NULL', 'IS NOT NULL']:
where_clauses.append(f"{col} {op}")
log_clauses.append(f"{col} {op}")
else:
# Properly quote string values for query
val_quoted = val.replace("'", "''")
where_clauses.append(f"{col} {op} '{val_quoted}'")
log_clauses.append(f"{col} {op} '{val}'")
if not where_clauses:
QMessageBox.information(self, "Info", "No filter conditions to apply.")
return
query = f'SELECT * FROM "{table_name}" WHERE {" AND ".join(where_clauses)}'
try:
self.filtered_df = self.con.execute(query).fetchdf()
display_df_in_table(self.filtered_df, self.filter_result_table, self.limit_rows_checkbox.isChecked(), self.limit_rows_spinbox.value())
self.log_step(f"Filtered table '{table_name}' with conditions: {' AND '.join(log_clauses)}.")
except Exception as e:
self.filtered_df = None
QMessageBox.critical(self, "Query Error", f"Failed to apply filters: {e}")
def generate_pivot(self):
table_name = self.pivot_table_combo.currentText()
group_by_cols = [item.text() for item in self.pivot_index_list.selectedItems()]
agg_func = self.pivot_agg_combo.currentText()
value_cols = [item.text() for item in self.pivot_values_list.selectedItems()]
if not all([table_name, group_by_cols, value_cols]):
QMessageBox.warning(self, "Input Error", "Please select a table, at least one grouping column, and at least one value column.")
return
group_by_string = ", ".join([f'"{col}"' for col in group_by_cols])
selections = [f'"{col}"' for col in group_by_cols]
for col in value_cols:
sanitized_alias = f"{agg_func}_{col}".replace('"', '""')
selections.append(f'{agg_func}("{col}") AS "{sanitized_alias}"')
query = f'SELECT {", ".join(selections)} FROM "{table_name}" GROUP BY {group_by_string}'
try:
self.pivot_df = self.con.execute(query).fetchdf()
display_df_in_table(self.pivot_df, self.pivot_result_table, self.limit_rows_checkbox.isChecked(), self.limit_rows_spinbox.value())
log_msg = f"Created Pivot on table '{table_name}'. Grouped by: {', '.join(group_by_cols)}. Aggregated: {', '.join(value_cols)} using {agg_func}."
self.log_step(log_msg)
except Exception as e:
self.pivot_df = None
QMessageBox.critical(self, "Pivot Error", f"Failed to generate pivot table: {e}\nQuery: {query}")
def perform_merge(self):
left_table = self.merge_left_table_combo.currentText()
right_table = self.merge_right_table_combo.currentText()
left_keys = [item.text() for item in self.merge_left_key_list.selectedItems()]
right_keys = [item.text() for item in self.merge_right_key_list.selectedItems()]
join_type = self.merge_type_combo.currentText()
if not (left_table and right_table and left_keys and right_keys):
QMessageBox.warning(self, "Input Error", "Please select left/right tables and at least one join key for each.")
return
if len(left_keys) != len(right_keys):
QMessageBox.warning(self, "Input Error", "The number of join keys for the left and right tables must be equal.")
return
on_clauses = [f'"{left_table}"."{lk}" = "{right_table}"."{rk}"' for lk, rk in zip(left_keys, right_keys)]
on_string = " AND ".join(on_clauses)
query = f'SELECT * FROM "{left_table}" {join_type} JOIN "{right_table}" ON {on_string}'
try:
self.merge_df = self.con.execute(query).fetchdf()
display_df_in_table(self.merge_df, self.merge_result_table, self.limit_rows_checkbox.isChecked(), self.limit_rows_spinbox.value())
log_msg = f"Merged table '{left_table}' and '{right_table}' using a {join_type} JOIN on keys: ({', '.join(left_keys)}) = ({', '.join(right_keys)})."
self.log_step(log_msg)
except Exception as e:
self.merge_df = None
QMessageBox.critical(self, "Merge Error", f"Failed to perform merge: {e}")
def perform_append(self):
tables_to_append = [item.text() for item in self.append_tables_list.selectedItems()]
if len(tables_to_append) < 2:
QMessageBox.warning(self, "Input Error", "Please select at least two tables to append.")
return
try:
base_schema = self.con.execute(f"DESCRIBE {tables_to_append[0]}").fetchdf()['column_name'].tolist()
for table in tables_to_append[1:]:
current_schema = self.con.execute(f"DESCRIBE {table}").fetchdf()['column_name'].tolist()
if base_schema != current_schema:
QMessageBox.critical(self, "Schema Mismatch", f"Table '{table}' does not have the same columns as '{tables_to_append[0]}'.")
return
except Exception as e:
QMessageBox.critical(self, "Error", f"Could not verify schemas: {e}")
return
union_query = " UNION ALL ".join([f'SELECT * FROM "{table}"' for table in tables_to_append])
try:
self.append_df = self.con.execute(union_query).fetchdf()
display_df_in_table(self.append_df, self.append_result_table, self.limit_rows_checkbox.isChecked(), self.limit_rows_spinbox.value())
self.log_step(f"Appended tables: {', '.join(tables_to_append)}.")
except Exception as e:
self.append_df = None
QMessageBox.critical(self, "Append Error", f"Failed to append tables: {e}")
def export_data(self, df):
if df is None or df.empty:
QMessageBox.warning(self, "No Data", "There is no result data to export.")
return
dialog = ExportDialog(self)
if dialog.exec_() == QDialog.Accepted:
options = dialog.get_export_options()
filepath = options["filepath"]
include_steps = options["include_steps"]
if not filepath:
QMessageBox.warning(self, "Input Error", "File path cannot be empty.")
return
try:
if filepath.endswith('.csv'):
df.to_csv(filepath, index=False)
elif filepath.endswith('.xlsx'):
df.to_excel(filepath, index=False)
else:
QMessageBox.critical(self, "Export Error", "Unsupported file type. Please use .csv or .xlsx.")
return
QMessageBox.information(self, "Success", f"Data successfully exported to {filepath}")
self.log_step(f"Exported result to '{Path(filepath).name}'.")
if include_steps:
steps_filepath = Path(filepath).with_suffix('.steps.txt')
with open(steps_filepath, 'w', encoding='utf-8') as f:
f.write(self.logger.get_steps_report())
QMessageBox.information(self, "Success", f"Transformation steps saved to {steps_filepath}")
self.log_step(f"Exported steps to '{steps_filepath.name}'.")
except Exception as e:
QMessageBox.critical(self, "Export Error", f"Failed to export file: {e}")
def update_all_table_widgets(self):
table_names = list(self.loaded_tables.keys())
current_filter_selection = self.filter_table_combo.currentText()
for combo in [self.filter_table_combo, self.pivot_table_combo, self.merge_left_table_combo, self.merge_right_table_combo]:
current_selection = combo.currentText()
combo.blockSignals(True)
combo.clear()
combo.addItems(table_names)
if current_selection in table_names:
combo.setCurrentText(current_selection)
combo.blockSignals(False)
self.loaded_tables_list.clear()
self.loaded_tables_list.addItems(table_names)
self.append_tables_list.clear()
self.append_tables_list.addItems(table_names)
if self.filter_table_combo.currentText() != current_filter_selection:
self.filter_conditions.clear()
while self.filter_conditions_layout.count():
child = self.filter_conditions_layout.takeAt(0)
if child.widget():
child.widget().deleteLater()
self.populate_pivot_columns()
self.populate_merge_keys('left')
self.populate_merge_keys('right')
def populate_pivot_columns(self):
table_name = self.pivot_table_combo.currentText()
self.pivot_index_list.clear()
self.pivot_values_list.clear()
if not table_name: return
schema = self.con.execute(f"DESCRIBE {table_name}").fetchdf()
all_cols = schema['column_name'].tolist()
numeric_cols = schema[schema['column_type'].isin(['INTEGER', 'BIGINT', 'FLOAT', 'DOUBLE', 'DECIMAL'])]['column_name'].tolist()
self.pivot_index_list.addItems(all_cols)
self.pivot_values_list.addItems(numeric_cols)
def populate_merge_keys(self, side):
if side == 'left':
list_widget = self.merge_left_key_list
table_name = self.merge_left_table_combo.currentText()
else:
list_widget = self.merge_right_key_list
table_name = self.merge_right_table_combo.currentText()
list_widget.clear()
if not table_name: return
cols = self.con.execute(f"DESCRIBE {table_name}").fetchdf()['column_name'].tolist()
list_widget.addItems(cols)
if __name__ == '__main__':
app = QApplication(sys.argv)
main_win = DataMasterApp()
main_win.show()
sys.exit(app.exec_())