.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/database_report_analysis.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        You can download the full example code
        :ref:`below <sphx_glr_download_auto_examples_database_report_analysis.py>`
        and run it online in
        `Codespaces <https://codespaces.new/spark-cleantech-l3/sda-copy?quickstart=1>`__.

        .. image:: https://github.com/codespaces/badge.svg
            :target: https://codespaces.new/spark-cleantech-l3/sda-copy?quickstart=1

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_database_report_analysis.py:


SDA Database Report Analysis Script
===================================

This script analyzes all test data available on the machine and reports
comprehensive database metrics: test and file counts, table detection,
worksheet structure, and processing success rates.

.. GENERATED FROM PYTHON SOURCE LINES 9-224

.. code-block:: Python


    import time

    import pandas as pd

    from sda.api import list_all_files, list_all_tests
    from sda.api.database_report import (
        analyze_database_files,
        create_summary_dataframe,
        save_excel_with_formatting,
    )

    # Main analysis execution
    print("šŸ” SDA Test Data Comprehensive Analysis")
    print("=" * 60)
    print("Analyzing all available test data on this machine...\n")

    start_time = time.time()

    # 1. Discover all tests and files
    print("šŸ“Š DISCOVERY PHASE")
    print("-" * 30)

    all_tests = list_all_tests()
    print(f"āœ… Found {len(all_tests)} tests")

    all_files = list_all_files(verbose=0)
    print(f"āœ… Found {len(all_files)} total files")

    # Filter Excel files for analysis
    excel_files = [f for f in all_files if f.suffix.lower() in [".xlsx", ".xlsm", ".xls"]]
    print(f"šŸ“‹ Excel files for analysis: {len(excel_files)}")

    if not excel_files:
        print("āŒ No Excel files found for analysis. Exiting.")
        exit()

    print()

    # 2. Perform comprehensive database analysis
    print("⚔ DATABASE ANALYSIS PHASE")
    print("-" * 40)

    results_df = analyze_database_files(
        file_filter="*.xls*",
        max_files=None,  # Analyze ALL files
        verbose=True,
        include_details=False,
    )

    if results_df.empty:
        print("āŒ No files were successfully analyzed.")
        exit()
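
    # --- Illustrative aside (not part of the original example) ---------------
    # The metric computations below rely on a handful of columns in
    # ``results_df``. The exact schema returned by analyze_database_files() is
    # an assumption here, inferred from the fields this script uses; a cheap
    # sanity check before computing metrics might look like this:
    expected_cols = {
        "status", "file_name", "num_points", "parse_time_seconds",
        "points_per_second", "tables_detected", "table_openable", "file_size_mb",
    }
    missing_cols = expected_cols - set(results_df.columns)
    if missing_cols:
        print(f"āš ļø Missing expected columns: {sorted(missing_cols)}")
    # --------------------------------------------------------------------------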

    # 3. Generate comprehensive metrics
    print("\nšŸ“ˆ COMPREHENSIVE METRICS")
    print("=" * 60)

    # Basic counts
    total_files = len(results_df)
    successful_files = len(results_df[results_df["status"] == "Success"])
    failed_files = len(results_df[results_df["status"] == "Error"])

    # Table detection counts (Excel files only)
    excel_files_df = results_df[
        results_df["file_name"].str.endswith((".xlsx", ".xlsm", ".xls"), na=False)
    ]
    total_excel_files = len(excel_files_df)
    tables_detected_count = (
        len(excel_files_df[excel_files_df["tables_detected"]])
        if total_excel_files > 0
        else 0
    )
    tables_openable_count = (
        len(excel_files_df[excel_files_df["table_openable"]])
        if total_excel_files > 0
        else 0
    )

    # Data metrics
    successful_df = results_df[results_df["status"] == "Success"]
    total_data_points = successful_df["num_points"].sum() if len(successful_df) > 0 else 0

    # Performance metrics
    total_parse_time = (
        successful_df["parse_time_seconds"].sum() if len(successful_df) > 0 else 0
    )
    avg_parse_time = (
        successful_df["parse_time_seconds"].mean() if len(successful_df) > 0 else 0
    )
    avg_parsing_rate = (
        successful_df["points_per_second"].mean() if len(successful_df) > 0 else 0
    )

    print("šŸ“Š FILE ANALYSIS:")
    print(f"   • Total tests discovered: {len(all_tests)}")
    print(f"   • Total files discovered: {len(all_files)}")
    print(f"   • Excel files analyzed: {total_files}")
    print(
        f"   • Files successfully read: {successful_files} ({successful_files / total_files * 100:.1f}%)"
    )
    print(
        f"   • Files that failed to read: {failed_files} ({failed_files / total_files * 100:.1f}%)"
    )
    print()

    print("šŸ“‹ TABLE DETECTION:")
    if total_excel_files > 0:
        print(
            f"   • Tests with tables detected: {tables_detected_count} "
            f"({tables_detected_count / total_excel_files * 100:.1f}%)"
        )
        # Every successfully read file should also have an openable table
        assert successful_files == tables_openable_count
        if tables_detected_count > 0:  # Guard against division by zero
            print(
                f"   • Tests with tables openable: {tables_openable_count} "
                f"({tables_openable_count / tables_detected_count * 100:.1f}% of detected)"
            )
    else:
        print("   • No Excel files to analyze for table detection")
    print()

    print("šŸ“ˆ DATA METRICS:")
    print(f"   • Total data points (run points): {total_data_points:,}")
    print(
        f"   • Average data points per file: {total_data_points / successful_files:.0f}"
        if successful_files > 0
        else "   • Average data points per file: 0"
    )
    print(
        f"   • Total file size: {successful_df['file_size_mb'].sum():.1f} MB"
        if len(successful_df) > 0
        else "   • Total file size: 0 MB"
    )
    print()

    print("ā±ļø PERFORMANCE METRICS:")
    print(f"   • Total parsing time: {total_parse_time:.1f} seconds")
    print(f"   • Average opening time per file: {avg_parse_time:.3f} seconds")
    print(f"   • Average parsing rate: {avg_parsing_rate:.0f} points/second")

    if len(successful_df) > 0:
        fastest_file = successful_df.loc[successful_df["parse_time_seconds"].idxmin()]
        slowest_file = successful_df.loc[successful_df["parse_time_seconds"].idxmax()]
        largest_file = successful_df.loc[successful_df["num_points"].idxmax()]
        print("šŸ† Performance Highlights:")
        print(
            f"   • Fastest file: {fastest_file['file_name']} ({fastest_file['parse_time_seconds']:.3f}s)"
        )
        print(
            f"   • Slowest file: {slowest_file['file_name']} ({slowest_file['parse_time_seconds']:.3f}s)"
        )
        print(
            f"   • Largest file: {largest_file['file_name']} ({largest_file['num_points']:,} points)"
        )
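
    # --- Illustrative aside (not part of the original example) ---------------
    # A per-extension breakdown can make the aggregate parse-time figures
    # easier to interpret; the column names are the same assumptions as above.
    if len(successful_df) > 0:
        by_ext = (
            successful_df.assign(
                ext=successful_df["file_name"].str.rsplit(".", n=1).str[-1].str.lower()
            )
            .groupby("ext")[["parse_time_seconds", "num_points"]]
            .sum()
        )
        print("\nšŸ“‚ Per-extension totals (illustrative):")
        print(by_ext.to_string())
    # --------------------------------------------------------------------------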

    # 4. Create summary DataFrame using library function
    print("\nšŸ“‹ SUMMARY DATAFRAME")
    print("=" * 60)

    summary_df = create_summary_dataframe(results_df, all_tests, all_files)

    # Format the DataFrame for display
    summary_display = summary_df.copy()
    summary_display["Value"] = summary_display["Value"].apply(
        lambda x: f"{x:,.1f}" if isinstance(x, float) else f"{x:,}"
    )
    summary_display["Percentage"] = summary_display["Percentage"].apply(
        lambda x: f"{x:.1f}%" if x is not None else ""
    )
    print(summary_display.to_string(index=False))

    # 5. Save comprehensive results
    analysis_time = time.time() - start_time

    print("\nšŸ’¾ SAVING RESULTS")
    print("=" * 60)

    # Prepare data for Excel export
    output_file = "sda_comprehensive_analysis.xlsx"

    # Create a copy for Excel export and convert URLs to hyperlinks
    excel_export_df = results_df.copy()
    if "plume_url" in excel_export_df.columns:
        excel_export_df["plume_url"] = excel_export_df["plume_url"].apply(
            lambda url: f'=HYPERLINK("{url}", "{url}")' if pd.notna(url) and url else ""
        )

    # Create data dictionary for Excel export
    excel_data = {"Summary": summary_df, "Detailed Results": excel_export_df}

    # Add failed files analysis
    if failed_files > 0:
        failed_analysis = excel_export_df[excel_export_df["status"] == "Error"][
            ["test_name", "file_name", "plume_url", "error_message", "file_size_mb"]
        ]
        excel_data["Failed Files"] = failed_analysis

    # Save with formatting using library function
    success = save_excel_with_formatting(excel_data, output_file)
    if not success:
        print("āŒ Failed to save Excel report")

    # Final summary
    print("\nšŸŽÆ ANALYSIS COMPLETE")
    print("=" * 60)
    print(f"ā±ļø Total analysis time: {analysis_time:.1f} seconds")
    print(f"šŸ“Š Files analyzed: {total_files}")
    print(f"šŸ“Š Test files (with tables) found: {tables_detected_count}")
    print(
        f"āœ… Success rate: {successful_files / tables_detected_count * 100:.1f}%"
        if tables_detected_count > 0  # Guard matches the divisor, not total_files
        else "āœ… Success rate: 0%"
    )
    print(f"šŸ“ˆ Total data points: {total_data_points:,}")
    print(f"šŸ’¾ Results saved to: {output_file}")


.. _sphx_glr_download_auto_examples_database_report_analysis.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: database_report_analysis.ipynb <database_report_analysis.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: database_report_analysis.py <database_report_analysis.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: database_report_analysis.zip <database_report_analysis.zip>`