—
Load all test data
This example shows how to load multiple test data files using the library’s
centralized functions for batch operations: list_all_files() and
parse_files().
First, we import the required libraries.
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from sda.api.load import list_all_files, parse_files
# Silence every UserWarning so the example output stays readable (for
# instance the openpyxl "data validation extension not supported" warning
# discussed at the link below).
# See https://stackoverflow.com/questions/53965596/python-3-openpyxl-userwarning-data-validation-extension-not-supported
warnings.simplefilter("ignore", UserWarning)
Discover all available test files.
# Let the library enumerate the available files; the "*.xls*" filter is
# presumably a glob pattern (it would match .xls, .xlsx, .xlsm, ...).
file_paths = list_all_files(filter="*.xls*")

# Show an indexed listing of everything that was discovered.
print(f"Found {len(file_paths)} files. Here are the paths:")
for index, file_path in enumerate(file_paths):
    print(f"{index}: {file_path}")
Load test data using the universal parse_files() function.
# Only these columns are requested from each parsed file.
columns_to_keep = ["plasma_power", "CH4_conversion"]

# Workbooks to leave out of the batch, matched on file name only.
# Commented-out entries are kept so they can be toggled back in quickly.
excluded_names = {
    "XP_001_Explication_par_jour.xlsx",
    "T110_DATA_validation_torch_V7C1.xlsx",
    # "T111_Generateur_pilote_reception.xlsx",
    # "T116_DATA_validation_torch_V6I.xlsx",
    # "T126_DATA.xlsx",
    "T127_validation_CA1B.xlsx",
    # "T132_Generateur_pilote_reception_avec_template_SOLO3.xlsx",
    # "T132_Generateur_pilote_reception_avec_template_SOLO4.xlsx",
    "T162.xlsx",
    "T097B_test_data_wrong_table_name.xlsx",  # Test file with invalid table names
}
file_paths = [path for path in file_paths if Path(path).name not in excluded_names]

# Report the count only after the exclusions above, so the number printed
# matches the files that are actually handed to parse_files().
print(f"Found {len(file_paths)} files to parse")

df = parse_files(
    file_paths,
    command={},  # Legacy parameters - now uses automatic Excel table detection
    columns_to_keep=columns_to_keep,
    verbose=2,
    column_not_found="warn",
    table_not_found="warn",  # don't raise an error if no Data Table is found in a file
)

# Reverse the order of the file paths (in place).
# NOTE(review): this runs after parse_files() has already returned, so within
# this script it does not change `df`; confirm whether it was meant to
# happen before parsing.
file_paths.reverse()
Add generator type for analysis.
# Every row starts out labelled "NRP"; rows from specific tests are then
# relabelled "DC" based on the name stored in the "file" column.
df["Generator Type"] = "NRP"

# Test identifiers whose generator type is "DC".
DC_tests = ["T157", "T173", "T196", "T197", "T234", "T268", "T281"]

# A single case-insensitive alternation pattern flags every row whose file
# name contains any of the identifiers; missing file names never match.
dc_pattern = "|".join(DC_tests)
is_dc = df["file"].str.contains(dc_pattern, case=False, na=False)
df.loc[is_dc, "Generator Type"] = "DC"
Filter and prepare data for plotting.
# Report the sample size ahead of any filtering.
print(f"Number of points before filtering: {len(df)}")

# Pull out the plotting columns; the two measurement columns are coerced to
# numbers, with anything unparsable turned into NaN.
plasma_power = pd.to_numeric(df["plasma_power value"], errors="coerce")
ch4_conversion = pd.to_numeric(df["CH4_conversion value"], errors="coerce")
generator_type = df["Generator Type"]

# A row is kept when the plasma power is strictly positive, the CH4
# conversion lies strictly between 0 and 1 (i.e. neither 0% nor 100%), and
# none of the three values is missing.
valid = (
    (plasma_power > 0)
    & (ch4_conversion > 0)
    & (ch4_conversion < 1)
    & plasma_power.notna()
    & ch4_conversion.notna()
    & generator_type.notna()
)
plasma_power = plasma_power[valid]
ch4_conversion = ch4_conversion[valid]
generator_type = generator_type[valid]

# Report the sample size that survives the filtering.
print(f"Number of points after filtering: {len(plasma_power)}")
Plot the results.
fig, ax = plt.subplots(figsize=(10, 6))

# Draw one scatter series per generator type so that each gets its own
# colour and legend entry; types without a configured colour fall back to gray.
colors = {"NRP": "blue", "DC": "orange"}
for gen_type in generator_type.unique():
    selected = generator_type == gen_type
    ax.scatter(
        plasma_power[selected],
        ch4_conversion[selected] * 100,  # fraction -> percentage
        label=gen_type,
        color=colors.get(gen_type, "gray"),
        alpha=0.5,
    )

# Axis labelling; the y-axis is pinned to [0, 100] because it shows a percentage.
ax.set(
    xlabel="Plasma Power (W)",
    ylabel="$CH_4$ Conversion (%)",
    title="Plasma Power vs $CH_4$ Conversion",
    ylim=(0, 100),
)
ax.legend(title="Generator Type")
plt.show()