Load all test data.

This example shows how to load multiple test data files using the library’s centralized functions for batch operations: `list_all_files()` and `parse_files()`.

First, we import the required libraries.

import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from sda.api.load import list_all_files, parse_files

# Silence every UserWarning so the example output stays readable.
# Background (openpyxl "Data Validation extension is not supported" warning):
# https://stackoverflow.com/questions/53965596/python-3-openpyxl-userwarning-data-validation-extension-not-supported
warnings.filterwarnings("ignore", category=UserWarning)

Discover all available test files.

# Ask the library for every file matching the Excel glob pattern
# (".xls", ".xlsx", ".xlsm", ...).
file_paths = list_all_files(filter="*.xls*")

# Show what was found, one numbered path per line.
print(f"Found {len(file_paths)} files. Here are the paths:")
for index, file_path in enumerate(file_paths):
    print(f"{index}: {file_path}")

Load test data using the universal parse_files() function.

# Only these columns are requested from each parsed file.
columns_to_keep = ["plasma_power", "CH4_conversion"]

# File names (not full paths) that this example skips. Commented-out entries
# are files that are deliberately still parsed; uncomment one to exclude it.
excluded_file_names = {
    "XP_001_Explication_par_jour.xlsx",
    "T110_DATA_validation_torch_V7C1.xlsx",
    # "T111_Generateur_pilote_reception.xlsx",
    # "T116_DATA_validation_torch_V6I.xlsx",
    # "T126_DATA.xlsx",
    "T127_validation_CA1B.xlsx",
    # "T132_Generateur_pilote_reception_avec_template_SOLO3.xlsx",
    # "T132_Generateur_pilote_reception_avec_template_SOLO4.xlsx",
    "T162.xlsx",
    "T097B_test_data_wrong_table_name.xlsx",  # Test file with invalid table names
}

# Drop the excluded files, matching on the bare file name.
file_paths = [
    path for path in file_paths if Path(path).name not in excluded_file_names
]

# Report the count *after* the exclusions so the message reflects the files
# that are actually handed to parse_files().
print(f"Found {len(file_paths)} files to parse")

df = parse_files(
    file_paths,
    command={},  # Legacy parameters - now uses automatic Excel table detection
    columns_to_keep=columns_to_keep,
    verbose=2,
    column_not_found="warn",
    table_not_found="warn",  # don't raise an error if no Data Table is found in a file
)

# Reverse the order of the file paths in place.
# NOTE(review): this runs after parse_files(), so it only affects later uses of
# `file_paths`; presumably it does not change the row order of `df` - confirm
# whether the reversal was meant to happen before parsing.
file_paths.reverse()

Add generator type for analysis.

# Label every row with its generator type; "NRP" is the default.
df["Generator Type"] = "NRP"

# Test identifiers whose files were recorded with a "DC" generator instead.
DC_tests = ["T157", "T173", "T196", "T197", "T234", "T268", "T281"]

# One case-insensitive regex alternation finds every row whose "file" value
# contains any of the DC test identifiers; missing file names never match.
mask = df["file"].str.contains("|".join(DC_tests), case=False, na=False)
df.loc[mask, "Generator Type"] = "DC"

Filter and prepare data for plotting.

# Report how many rows were loaded before any filtering.
print(f"Number of points before filtering: {len(df)}")

# Pull out the plotted columns; anything non-numeric is coerced to NaN.
plasma_power = pd.to_numeric(df["plasma_power value"], errors="coerce")
ch4_conversion = pd.to_numeric(df["CH4_conversion value"], errors="coerce")
generator_type = df["Generator Type"]

# Keep a point only if the plasma power is strictly positive and the CH4
# conversion lies strictly between 0 and 1 (i.e. neither 0% nor 100%) ...
in_range = plasma_power.gt(0) & ch4_conversion.gt(0) & ch4_conversion.lt(1)
# ... and none of the three values is missing.
is_complete = (
    plasma_power.notna() & ch4_conversion.notna() & generator_type.notna()
)
mask = in_range & is_complete

# Apply the same row selection to all three series so they stay aligned.
plasma_power = plasma_power[mask]
ch4_conversion = ch4_conversion[mask]
generator_type = generator_type[mask]

# Report how many rows survived the filtering.
print(f"Number of points after filtering: {len(plasma_power)}")

Plot the results.

fig, ax = plt.subplots(figsize=(10, 6))

# One scatter series per generator type, each in its own colour; generator
# types without an entry in the colour table are drawn in gray.
colors = {"NRP": "blue", "DC": "orange"}
for gen_type in generator_type.unique():
    mask = generator_type.eq(gen_type)
    power_values = plasma_power[mask]
    conversion_percent = ch4_conversion[mask].mul(100)  # fraction -> percent
    series_color = colors.get(gen_type, "gray")
    ax.scatter(
        power_values,
        conversion_percent,
        label=gen_type,
        color=series_color,
        alpha=0.5,
    )

# Axis labels and title.
ax.set(
    xlabel="Plasma Power (W)",
    ylabel="$CH_4$ Conversion (%)",
    title="Plasma Power vs $CH_4$ Conversion",
)
ax.legend(title="Generator Type")
ax.set_ylim(0, 100)  # percentages live in [0, 100]

plt.show()