"""SDA Database Report Analysis Script.

This script analyzes all available test data on the machine and reports
comprehensive database metrics: test and file counts, table detection,
worksheet structure, and processing success rates.
"""

import sys
import time

import pandas as pd

from sda.api import list_all_files, list_all_tests
from sda.api.database_report import (
    analyze_database_files,
    create_summary_dataframe,
    save_excel_with_formatting,
)

# Main analysis execution
print("🔍 SDA Test Data Comprehensive Analysis")
print("=" * 60)
print("Analyzing all available test data on this machine...\n")

start_time = time.time()

# 1. Discover all tests and files
print("📊 DISCOVERY PHASE")
print("-" * 30)

all_tests = list_all_tests()
print(f"✅ Found {len(all_tests)} tests")

all_files = list_all_files(verbose=0)
print(f"✅ Found {len(all_files)} total files")

# Filter Excel files for analysis
excel_files = [f for f in all_files if f.suffix.lower() in [".xlsx", ".xlsm", ".xls"]]
print(f"📋 Excel files for analysis: {len(excel_files)}")

if not excel_files:
    print("❌ No Excel files found for analysis. Exiting.")
    sys.exit()

print()

# 2. Perform comprehensive database analysis
print("⚡ DATABASE ANALYSIS PHASE")
print("-" * 40)

results_df = analyze_database_files(
    file_filter="*.xls*",
    max_files=None,  # Analyze ALL files
    verbose=True,
    include_details=False,
)
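
# Optional sanity check (an assumption, not part of the library API): the
# "*.xls*" glob passed above should match the extension filter used during
# discovery, so the row count should track the discovered Excel files.
if len(results_df) != len(excel_files):
    print(
        f"⚠️  Analyzer returned {len(results_df)} rows for "
        f"{len(excel_files)} discovered Excel files"
    )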

if results_df.empty:
    print("❌ No files were successfully analyzed.")
    sys.exit()

# 3. Generate comprehensive metrics
print("\n📈 COMPREHENSIVE METRICS")
print("=" * 60)

# Basic counts
total_files = len(results_df)
successful_files = len(results_df[results_df["status"] == "Success"])
failed_files = len(results_df[results_df["status"] == "Error"])

# Table detection counts (Excel files only)
excel_files_df = results_df[
    results_df["file_name"].str.endswith((".xlsx", ".xlsm", ".xls"), na=False)
]
total_excel_files = len(excel_files_df)
# Boolean-mask selection yields 0 rows for an empty frame, so no guard is needed
tables_detected_count = len(excel_files_df[excel_files_df["tables_detected"]])
tables_openable_count = len(excel_files_df[excel_files_df["table_openable"]])

# Data metrics
successful_df = results_df[results_df["status"] == "Success"]
total_data_points = successful_df["num_points"].sum() if len(successful_df) > 0 else 0

# Performance metrics
total_parse_time = (
    successful_df["parse_time_seconds"].sum() if len(successful_df) > 0 else 0
)
avg_parse_time = (
    successful_df["parse_time_seconds"].mean() if len(successful_df) > 0 else 0
)
avg_parsing_rate = (
    successful_df["points_per_second"].mean() if len(successful_df) > 0 else 0
)

print("📊 FILE ANALYSIS:")
print(f"   • Total tests discovered: {len(all_tests)}")
print(f"   • Total files discovered: {len(all_files)}")
print(f"   • Excel files analyzed: {total_files}")
print(
    f"   • Files successfully read: {successful_files} ({successful_files / total_files * 100:.1f}%)"
)
print(
    f"   • Files that failed to read: {failed_files} ({failed_files / total_files * 100:.1f}%)"
)
print()

print("📋 TABLE DETECTION:")
if total_excel_files > 0:
    print(
        f"   • Tests with tables detected: {tables_detected_count} "
        f"({tables_detected_count / total_excel_files * 100:.1f}%)"
    )
    assert successful_files == tables_openable_count
    print(
        f"   • Tests with tables openable: {tables_openable_count} "
        f"({tables_openable_count / tables_detected_count * 100:.1f}% of detected)"
    )
else:
    print("   • No Excel files to analyze for table detection")
print()

print("📈 DATA METRICS:")
print(f"   • Total data points (run points): {total_data_points:,}")
print(
    f"   • Average data points per file: {total_data_points / successful_files:.0f}"
    if successful_files > 0
    else "   • Average data points per file: 0"
)
print(
    f"   • Total file size: {successful_df['file_size_mb'].sum():.1f} MB"
    if len(successful_df) > 0
    else "   • Total file size: 0 MB"
)
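
# Optional per-test rollup: a sketch that assumes every successful row
# carries the "test_name" column used later for the failed-files sheet;
# if the analyzer omits it, the block is skipped.
if "test_name" in successful_df.columns and len(successful_df) > 0:
    top_tests = successful_df.groupby("test_name")["num_points"].sum().nlargest(3)
    print("   • Top 3 tests by data points:")
    for name, points in top_tests.items():
        print(f"       - {name}: {points:,.0f} points")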
print()

print("⏱️  PERFORMANCE METRICS:")
print(f"   • Total parsing time: {total_parse_time:.1f} seconds")
print(f"   • Average opening time per file: {avg_parse_time:.3f} seconds")
print(f"   • Average parsing rate: {avg_parsing_rate:.0f} points/second")

if len(successful_df) > 0:
    fastest_file = successful_df.loc[successful_df["parse_time_seconds"].idxmin()]
    slowest_file = successful_df.loc[successful_df["parse_time_seconds"].idxmax()]
    largest_file = successful_df.loc[successful_df["num_points"].idxmax()]

    print("🏆 Performance Highlights:")
    print(
        f"   • Fastest file: {fastest_file['file_name']} ({fastest_file['parse_time_seconds']:.3f}s)"
    )
    print(
        f"   • Slowest file: {slowest_file['file_name']} ({slowest_file['parse_time_seconds']:.3f}s)"
    )
    print(
        f"   • Largest file: {largest_file['file_name']} ({largest_file['num_points']:,} points)"
    )
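
# Optional spread of parse times: Series.quantile gives the p50/p90/p99
# picture a single mean can hide; uses only the parse_time_seconds column
# already referenced above.
if len(successful_df) > 0:
    q = successful_df["parse_time_seconds"].quantile([0.5, 0.9, 0.99])
    print(
        f"   • Parse time p50/p90/p99: {q[0.5]:.3f}s / {q[0.9]:.3f}s / {q[0.99]:.3f}s"
    )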

# 4. Create summary DataFrame using library function
print("\n📋 SUMMARY DATAFRAME")
print("=" * 60)

summary_df = create_summary_dataframe(results_df, all_tests, all_files)

# Format the DataFrame for display
summary_display = summary_df.copy()
summary_display["Value"] = summary_display["Value"].apply(
    lambda x: f"{x:,.1f}" if isinstance(x, float) else f"{x:,}"
)
summary_display["Percentage"] = summary_display["Percentage"].apply(
    lambda x: f"{x:.1f}%" if x is not None else ""
)

print(summary_display.to_string(index=False))

# 5. Save comprehensive results
analysis_time = time.time() - start_time

print("\n💾 SAVING RESULTS")
print("=" * 60)

# Prepare data for Excel export
output_file = "sda_comprehensive_analysis.xlsx"

# Create a copy for Excel export and convert URLs to hyperlinks
excel_export_df = results_df.copy()
if "plume_url" in excel_export_df.columns:
    excel_export_df["plume_url"] = excel_export_df["plume_url"].apply(
        lambda url: f'=HYPERLINK("{url}", "{url}")' if pd.notna(url) and url else ""
    )

# Create data dictionary for Excel export
excel_data = {"Summary": summary_df, "Detailed Results": excel_export_df}

# Add failed files analysis
if failed_files > 0:
    failed_analysis = excel_export_df[excel_export_df["status"] == "Error"][
        ["test_name", "file_name", "plume_url", "error_message", "file_size_mb"]
    ]
    excel_data["Failed Files"] = failed_analysis

# Save with formatting using library function
success = save_excel_with_formatting(excel_data, output_file)

if not success:
    print("❌ Failed to save Excel report")

# Final summary
print("\n🎯 ANALYSIS COMPLETE")
print("=" * 60)
print(f"⏱️  Total analysis time: {analysis_time:.1f} seconds")
print(f"📊 Files analyzed: {total_files}")
print(f"📊 Test Files (with Tables) found: {tables_detected_count}")
print(
    f"✅ Success rate: {successful_files / tables_detected_count * 100:.1f}%"
    if total_files > 0
    else "✅ Success rate: 0%"
)
print(f"📈 Total data points: {total_data_points:,}")
print(f"💾 Results saved to: {output_file}")