"""Convert PEtab version 1 problems to version 2."""
from __future__ import annotations
import re
import shutil
import warnings
from contextlib import suppress
from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse
from uuid import uuid4
import pandas as pd
from pandas.io.common import get_handle, is_url
from .. import v1, v2
from ..v1.math import sympify_petab
from ..v1.yaml import get_path_prefix, load_yaml, validate
from ..versions import get_major_version
from .models import MODEL_TYPE_SBML
__all__ = ["petab1to2"]
[docs]
def petab1to2(
    yaml_config: Path | str, output_dir: Path | str | None = None
) -> v2.Problem | None:
    """Convert from PEtab 1.0 to PEtab 2.0 format.

    Convert a PEtab problem from PEtab 1.0 to PEtab 2.0 format.

    .. note::

        Some aspects of PEtab v1 were not well-defined. For example, model
        initialization order (e.g., applying initial assignments before or
        after condition table overrides) and the impact of compartment size
        changes were not specified. In such cases, we made assumptions that
        are consistent with the clarified PEtab v2 specifications,
        the PEtab test suite, or common practice.
        Therefore, it is recommended to carefully review the generated
        PEtab v2 problem to ensure it aligns with the expected behavior.

    :param yaml_config:
        The YAML file of the PEtab v1 problem. The file name is reused for
        the converted YAML file, so an in-memory dictionary is not accepted
        here.
    :param output_dir:
        The output directory to save the converted PEtab problem, or ``None``,
        to return a :class:`petab.v2.Problem` instance.
    :return:
        ``None`` if ``output_dir`` was given (the converted files are written
        there), otherwise the converted problem loaded from a temporary
        directory.
    :raises ValueError:
        If the input is invalid or does not pass linting or if the generated
        files do not pass linting.
    """
    if output_dir is not None:
        # files are the result; nothing is returned in this mode
        petab_files_1to2(yaml_config, output_dir)
        return None

    # no output directory requested: convert into a scratch directory and
    # load the result before the directory is removed again
    with TemporaryDirectory() as tmp_dir:
        petab_files_1to2(yaml_config, tmp_dir)
        return v2.Problem.from_yaml(Path(tmp_dir, Path(yaml_config).name))
def petab_files_1to2(yaml_config: Path | str | dict, output_dir: Path | str):
    """Convert PEtab files from PEtab 1.0 to PEtab 2.0.

    :param yaml_config:
        The PEtab problem as dictionary or YAML file name. If a dictionary
        is passed, file names in it are used as given and the converted
        YAML file is written as ``problem.yaml``.
    :param output_dir:
        The output directory to save the converted PEtab problem. It is
        created if it does not exist yet.
    :raises ValueError:
        If the input is invalid or does not pass linting or if the generated
        files do not pass linting.
    """
    if isinstance(yaml_config, Path | str):
        yaml_file = str(yaml_config)
        path_prefix = get_path_prefix(yaml_file)
        yaml_config = load_yaml(yaml_config)
    else:
        yaml_file = None
        path_prefix = None

    output_dir = Path(output_dir)

    def get_src_path(filename):
        # file names in a YAML file are relative to that file's location;
        # for an in-memory configuration they are used as given
        if yaml_file is None:
            return filename
        return f"{path_prefix}/{filename}"

    def get_dest_path(filename):
        return f"{output_dir}/{filename}"

    # Validate the original PEtab problem
    validate(yaml_config, path_prefix=path_prefix)
    if get_major_version(yaml_config) != 1:
        raise ValueError("PEtab problem is not version 1.")
    petab_problem = v1.Problem.from_yaml(yaml_file or yaml_config)
    # TODO: move to mapping table
    # get rid of conditionName column if present (unsupported in v2)
    petab_problem.condition_df = petab_problem.condition_df.drop(
        columns=[v1.C.CONDITION_NAME], errors="ignore"
    )
    if v1.lint_problem(petab_problem):
        raise ValueError("Provided PEtab problem does not pass linting.")

    # only touch the file system once the input is known to be valid
    output_dir.mkdir(parents=True, exist_ok=True)

    # Update YAML file
    new_yaml_config = _update_yaml(yaml_config)
    new_yaml_config = v2.ProblemConfig(**new_yaml_config)

    # Update tables
    # parameter table
    parameter_df = v1v2_parameter_df(petab_problem.parameter_df.copy())
    v2.write_parameter_df(
        parameter_df, get_dest_path(new_yaml_config.parameter_files[0])
    )

    # copy files that don't need conversion: models
    for model in new_yaml_config.model_files.values():
        _copy_file(
            get_src_path(model.location),
            Path(get_dest_path(model.location)),
        )

    # Update observable table
    for observable_file in new_yaml_config.observable_files:
        observable_df = v1.get_observable_df(get_src_path(observable_file))
        observable_df = v1v2_observable_df(observable_df)
        v2.write_observable_df(observable_df, get_dest_path(observable_file))

    # Update condition table
    for condition_file in new_yaml_config.condition_files:
        condition_df = v1.get_condition_df(get_src_path(condition_file))
        condition_df = v1v2_condition_df(condition_df, petab_problem.model)
        v2.write_condition_df(condition_df, get_dest_path(condition_file))

    # records for the experiment table to be created
    experiments = []
    # (simulation condition ID, pre-equilibration condition ID) -> experiment
    # ID; guarantees that the experiment table and the measurement tables
    # agree on the ID of each condition combination
    experiment_ids: dict[tuple[str, str], str] = {}

    def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
        """Return the experiment ID for a condition combination.

        Returns an empty string if no experiment is to be created.
        """
        if not sim_cond_id and not preeq_cond_id:
            return ""

        key = (sim_cond_id, preeq_cond_id)
        if key in experiment_ids:
            return experiment_ids[key]

        # check whether the conditions will exist in the v2 condition table
        sim_cond_exists = (
            petab_problem.condition_df.loc[sim_cond_id].notna().any()
        )
        preeq_cond_exists = (
            preeq_cond_id
            and petab_problem.condition_df.loc[preeq_cond_id].notna().any()
        )
        if not sim_cond_exists and not preeq_cond_exists:
            # if we have only all-NaN conditions, we don't create a new
            # experiment
            experiment_ids[key] = ""
            return ""

        preeq_part = f"{preeq_cond_id}_" if preeq_cond_id else ""
        base_id = f"experiment__{preeq_part}__{sim_cond_id}"

        # different condition combinations may map to the same string;
        # append a counter until the ID is unique
        used_ids = set(experiment_ids.values())
        exp_id = base_id
        counter = 1
        while exp_id in used_ids:
            exp_id = f"{base_id}_{counter}"
            counter += 1

        experiment_ids[key] = exp_id
        return exp_id

    measured_experiments = (
        petab_problem.get_simulation_conditions_from_measurement_df()
    )
    for _, row in measured_experiments.iterrows():
        # generate a new experiment for each simulation / pre-eq condition
        # combination
        sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID]
        preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "")
        if pd.isna(preeq_cond_id):
            # a missing pre-equilibration condition must not be treated as
            # a (truthy) condition ID
            preeq_cond_id = ""

        exp_id = create_experiment_id(sim_cond_id, preeq_cond_id)
        if not exp_id:
            continue

        if preeq_cond_id:
            experiments.append(
                {
                    v2.C.EXPERIMENT_ID: exp_id,
                    v2.C.TIME: v2.C.TIME_PREEQUILIBRATION,
                    v2.C.CONDITION_ID: preeq_cond_id,
                }
            )
        experiments.append(
            {
                v2.C.EXPERIMENT_ID: exp_id,
                v2.C.TIME: 0,
                v2.C.CONDITION_ID: sim_cond_id,
            }
        )

    if experiments:
        exp_table_path = output_dir / "experiments.tsv"
        if exp_table_path.exists():
            raise ValueError(
                f"Experiment table file {exp_table_path} already exists."
            )
        new_yaml_config.experiment_files.append("experiments.tsv")
        v2.write_experiment_df(
            v2.get_experiment_df(pd.DataFrame(experiments)), exp_table_path
        )

    for measurement_file in new_yaml_config.measurement_files:
        measurement_df = v1.get_measurement_df(get_src_path(measurement_file))
        # if there is already an experiment ID column, we rename it
        if v2.C.EXPERIMENT_ID in measurement_df.columns:
            measurement_df.rename(
                columns={v2.C.EXPERIMENT_ID: f"experiment_id_{uuid4()}"},
                inplace=True,
            )
        # add pre-eq condition id if not present or convert to string
        # for simplicity
        if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
            measurement_df.fillna(
                {v1.C.PREEQUILIBRATION_CONDITION_ID: ""}, inplace=True
            )
        else:
            measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""

        if petab_problem.condition_df is not None and not (
            set(petab_problem.condition_df.columns) - {v1.C.CONDITION_NAME}
        ):
            # we can't have "empty" conditions with no overrides in v2,
            # therefore, we drop the respective condition ID completely
            # TODO: or can we?
            # TODO: this needs to be checked condition-wise, not globally
            measurement_df[v1.C.SIMULATION_CONDITION_ID] = ""
            measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""

        # condition IDs to experiment IDs
        measurement_df.insert(
            0,
            v2.C.EXPERIMENT_ID,
            measurement_df.apply(
                lambda row: create_experiment_id(
                    row[v1.C.SIMULATION_CONDITION_ID],
                    row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, ""),
                ),
                axis=1,
            ),
        )
        del measurement_df[v1.C.SIMULATION_CONDITION_ID]
        del measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID]
        v2.write_measurement_df(
            measurement_df, get_dest_path(measurement_file)
        )

    # Write the new YAML file; an in-memory configuration has no file name
    # to reuse, so a default name is chosen
    yaml_name = Path(yaml_file).name if yaml_file is not None else "problem.yaml"
    new_yaml_file = output_dir / yaml_name
    new_yaml_config.to_yaml(new_yaml_file)

    # validate updated Problem
    validation_issues = v2.lint_problem(new_yaml_file)
    if validation_issues:
        sev = v2.lint.ValidationIssueSeverity
        validation_issues.log(max_level=sev.WARNING)
        # anything above WARNING makes the conversion fail
        errors = "\n".join(
            str(issue)
            for issue in validation_issues
            if issue.level > sev.WARNING
        )
        if errors:
            raise ValueError(
                "The generated PEtab v2 problem did not pass linting: "
                f"{errors}"
            )
def _update_yaml(yaml_config: dict) -> dict:
    """Update PEtab 1.0 YAML to PEtab 2.0 format.

    :param yaml_config:
        The PEtab v1 problem configuration. It is not modified.
    :return:
        A new dictionary in the PEtab v2 layout: the per-problem SBML files
        become top-level model files, the per-problem condition,
        measurement and observable file lists are moved to the top level,
        and the parameter file entry becomes a list of parameter files.
    """
    from copy import deepcopy

    # A shallow copy would share the nested problem dictionaries with the
    # caller, and those are modified (`del`) below.
    yaml_config = deepcopy(yaml_config)

    # Update format_version
    yaml_config[v2.C.FORMAT_VERSION] = "2.0.0"

    # Add extensions
    yaml_config[v2.C.EXTENSIONS] = {}

    # Move models and set IDs (filename for now)
    yaml_config[v2.C.MODEL_FILES] = {}
    for problem in yaml_config[v1.C.PROBLEMS]:
        models = {}
        for sbml_file in problem[v1.C.SBML_FILES]:
            # model ID: base name of the file up to the first "."
            model_id = sbml_file.split("/")[-1].split(".")[0]
            models[model_id] = {
                v2.C.MODEL_LANGUAGE: MODEL_TYPE_SBML,
                v2.C.MODEL_LOCATION: sbml_file,
            }
        yaml_config[v2.C.MODEL_FILES] |= models
        del problem[v1.C.SBML_FILES]

        for file_type in (
            v1.C.CONDITION_FILES,
            v1.C.MEASUREMENT_FILES,
            v1.C.OBSERVABLE_FILES,
        ):
            if file_type in problem:
                yaml_config[file_type] = problem[file_type]
                del problem[file_type]
    del yaml_config[v1.C.PROBLEMS]

    # parameter_file -> parameter_files
    par_files = yaml_config.pop(v1.C.PARAMETER_FILE, [])
    if not isinstance(par_files, list):
        par_files = [par_files]
    yaml_config[v2.C.PARAMETER_FILES] = par_files

    return yaml_config
def _copy_file(src: Path | str, dest: Path):
    """Copy a file from a local path or URL to a local destination.

    :param src:
        Local path, ``file:`` URL without host, or any URL recognised by
        :func:`pandas.io.common.is_url`.
    :param dest:
        Destination file. Missing parent directories are created. An
        existing file is overwritten unless it is the same file as ``src``.
    """
    from urllib.request import url2pathname

    # src might be a URL - convert to Path if local
    src_url = urlparse(str(src))
    if not src_url.scheme:
        src = Path(src)
    elif src_url.scheme == "file" and not src_url.netloc:
        # use the parsed (and percent-decoded) path component instead of
        # cutting off a fixed prefix, which breaks for `file:/abs/path`
        src = Path(url2pathname(src_url.path))

    dest.parent.mkdir(parents=True, exist_ok=True)

    if is_url(src):
        # get_handle decodes as UTF-8 by default; write with the same
        # encoding rather than the platform default
        with (
            get_handle(src, mode="r") as src_handle,
            open(dest, "w", encoding="utf-8") as dest_handle,
        ):
            dest_handle.write(src_handle.handle.read())
        return

    # Copying a file onto itself is an error in shutil; skip that case.
    # samefile raises FileNotFoundError if `dest` does not exist yet, in
    # which case we simply copy.
    with suppress(FileNotFoundError):
        if dest.samefile(src):
            return
    shutil.copy(str(src), str(dest))
def v1v2_condition_df(
    condition_df: pd.DataFrame, model: v1.Model
) -> pd.DataFrame:
    """Convert condition table from petab v1 to v2.

    The wide v1 table (one column per overridden entity) is reshaped into
    the long v2 layout with one row per condition/target pair; pairs
    without a value are omitted.

    :param condition_df:
        The PEtab v1 condition table. It is not modified.
    :param model:
        Not used by this function.
    :return:
        The condition table in long format, or an empty table with the
        condition ID, target ID and target value columns if no values
        remain.
    """
    wide_df = condition_df.copy().reset_index()
    # conditionName was dropped in PEtab v2
    wide_df = wide_df.drop(columns=[v1.C.CONDITION_NAME], errors="ignore")

    long_df = wide_df.melt(
        id_vars=[v1.C.CONDITION_ID],
        var_name=v2.C.TARGET_ID,
        value_name=v2.C.TARGET_VALUE,
    )
    long_df = long_df.dropna(subset=[v2.C.TARGET_VALUE])

    if not long_df.empty:
        return long_df

    # This happens if there weren't any condition-specific changes
    empty_columns = [
        v2.C.CONDITION_ID,
        v2.C.TARGET_ID,
        v2.C.TARGET_VALUE,
    ]
    return pd.DataFrame(columns=empty_columns)
def v1v2_observable_df(observable_df: pd.DataFrame) -> pd.DataFrame:
    """Convert observable table from petab v1 to v2.

    Perform all updates that can be done solely on the observable table:

    * drop observableTransformation, update noiseDistribution
    * update placeholder parameters

    :param observable_df:
        The PEtab v1 observable table. It is not modified.
    :return:
        The converted observable table (with the index reset to a column).
    :raises NotImplementedError:
        If the combination of observable transformation and noise
        distribution is not among the PEtab v2 noise distributions.
    """
    df = observable_df.copy().reset_index()

    # drop observableTransformation, update noiseDistribution
    # if there is no observableTransformation, no need to update
    if v1.C.OBSERVABLE_TRANSFORMATION in df.columns:
        df[v1.C.OBSERVABLE_TRANSFORMATION] = df[
            v1.C.OBSERVABLE_TRANSFORMATION
        ].fillna(v1.C.LIN)

        if v1.C.NOISE_DISTRIBUTION in df:
            df[v1.C.NOISE_DISTRIBUTION] = df[v1.C.NOISE_DISTRIBUTION].fillna(
                v1.C.NORMAL
            )
        else:
            df[v1.C.NOISE_DISTRIBUTION] = v1.C.NORMAL

        # merge observableTransformation into noiseDistribution
        def update_noise_dist(row):
            dist = row.get(v1.C.NOISE_DISTRIBUTION)
            trans = row.get(v1.C.OBSERVABLE_TRANSFORMATION)

            new_dist = dist if trans == v1.C.LIN else f"{trans}-{dist}"

            if new_dist == "log10-normal":
                warnings.warn(
                    f"Noise distribution `{new_dist}' for "
                    f"observable `{row[v1.C.OBSERVABLE_ID]}'"
                    f" is not supported in PEtab v2. "
                    "Using `log-normal` instead.",
                    # call to `petab1to2`
                    stacklevel=9,
                )
                new_dist = v2.C.LOG_NORMAL

            if new_dist not in v2.C.NOISE_DISTRIBUTIONS:
                raise NotImplementedError(
                    f"Noise distribution `{new_dist}' for "
                    f"observable `{row[v1.C.OBSERVABLE_ID]}'"
                    f" is not supported in PEtab v2."
                )
            # without this, the whole column would be filled with None
            return new_dist

        df[v2.C.NOISE_DISTRIBUTION] = df.apply(update_noise_dist, axis=1)
        df.drop(columns=[v1.C.OBSERVABLE_TRANSFORMATION], inplace=True)

    def extract_placeholders(row: pd.Series, type_: str) -> str:
        """Extract placeholders from observable formula."""
        if type_ == "observable":
            formula = row[v1.C.OBSERVABLE_FORMULA]
        elif type_ == "noise":
            formula = row[v1.C.NOISE_FORMULA]
        else:
            raise ValueError(f"Unknown placeholder type: {type_}")

        if pd.isna(formula):
            return ""

        prefix = f"{re.escape(type_)}Parameter"
        obs_id = re.escape(row[v1.C.OBSERVABLE_ID])
        # e.g. observableParameter3_<observableId>; the number is captured
        # for ordering
        pattern = re.compile(rf"{prefix}(\d+)_{obs_id}")

        expr = sympify_petab(formula)
        numbered = []
        for sym in expr.free_symbols:
            if sym.is_Symbol and (match := pattern.fullmatch(str(sym))):
                numbered.append((int(match.group(1)), str(sym)))

        # Sort by placeholder number, not lexicographically, so that the
        # order still matches the overrides in the measurement table when
        # there are 10 or more placeholders.
        return v2.C.PARAMETER_SEPARATOR.join(
            name for _, name in sorted(numbered)
        )

    df[v2.C.OBSERVABLE_PLACEHOLDERS] = df.apply(
        extract_placeholders, args=("observable",), axis=1
    )
    df[v2.C.NOISE_PLACEHOLDERS] = df.apply(
        extract_placeholders, args=("noise",), axis=1
    )
    return df
def v1v2_parameter_df(
    parameter_df: pd.DataFrame,
) -> pd.DataFrame:
    """Convert parameter table from petab v1 to v2.

    Do all the necessary conversions to the parameter table that can
    be done with the parameter table alone.

    :param parameter_df:
        The PEtab v1 parameter table. It is not modified.
    :return:
        The converted parameter table (with the index reset to a column).
    :raises NotImplementedError:
        If a parameter-scale prior cannot be expressed as one of the PEtab
        v2 prior distributions.
    """
    df = parameter_df.copy().reset_index()

    # parameter.estimate: int -> bool
    df[v2.C.ESTIMATE] = df[v1.C.ESTIMATE].apply(
        lambda x: str(bool(int(x))).lower()
    )

    def update_prior(row):
        """Convert prior to v2 format."""
        prior_type = row.get(v1.C.OBJECTIVE_PRIOR_TYPE)
        if pd.isna(prior_type):
            prior_type = v1.C.UNIFORM

        pscale = row.get(v1.C.PARAMETER_SCALE)
        if pd.isna(pscale):
            pscale = v1.C.LIN

        if prior_type not in v1.C.PARAMETER_SCALE_PRIOR_TYPES:
            return prior_type

        # parameterScale* priors: fold the parameter scale into the name of
        # the distribution
        new_prior_type = prior_type.removeprefix("parameterScale").lower()
        if pscale != v1.C.LIN:
            new_prior_type = f"{pscale}-{new_prior_type}"

        if new_prior_type == "log10-normal":
            warnings.warn(
                f"Prior distribution `{new_prior_type}' for parameter "
                f"`{row[v1.C.PARAMETER_ID]}' is not supported in PEtab v2. "
                "Using `log-normal` instead.",
                # call to `petab1to2`
                stacklevel=9,
            )
            new_prior_type = v2.C.LOG_NORMAL

        if new_prior_type not in v2.C.PRIOR_DISTRIBUTIONS:
            raise NotImplementedError(
                f"PEtab v2 does not support prior type `{new_prior_type}' "
                f"required for parameter `{row[v1.C.PARAMETER_ID]}'."
            )

        return new_prior_type

    # update parameterScale*-priors
    if v1.C.OBJECTIVE_PRIOR_TYPE in df.columns:
        df[v1.C.OBJECTIVE_PRIOR_TYPE] = df.apply(update_prior, axis=1)

    # rename objectivePrior* to prior*
    df.rename(
        columns={
            v1.C.OBJECTIVE_PRIOR_TYPE: v2.C.PRIOR_DISTRIBUTION,
            v1.C.OBJECTIVE_PRIOR_PARAMETERS: v2.C.PRIOR_PARAMETERS,
        },
        inplace=True,
        errors="ignore",
    )

    # some columns were dropped in PEtab v2
    if v1.C.INITIALIZATION_PRIOR_TYPE in df and (
        df[v1.C.INITIALIZATION_PRIOR_TYPE].notna().any()
    ):
        warnings.warn(
            "Initialisation priors in parameter table are not supported "
            "in PEtab v2.",
            stacklevel=9,
        )

    # Consistent with `update_prior`: a missing column or a missing value
    # means linear scale and therefore needs no warning.
    if v1.C.PARAMETER_SCALE in df:
        scales = df[v1.C.PARAMETER_SCALE]
        if (scales.notna() & (scales != v1.C.LIN)).any():
            warnings.warn(
                "Parameter scales are not supported in PEtab v2.",
                stacklevel=9,
            )

    df.drop(
        columns=[
            v1.C.INITIALIZATION_PRIOR_TYPE,
            v1.C.INITIALIZATION_PRIOR_PARAMETERS,
            v1.C.PARAMETER_SCALE,
        ],
        inplace=True,
        errors="ignore",
    )

    # if uniform, we need to explicitly set the parameters
    def update_prior_pars(row):
        prior_type = row.get(v2.C.PRIOR_DISTRIBUTION)
        prior_pars = row.get(v2.C.PRIOR_PARAMETERS)

        if prior_type in (v2.C.UNIFORM, v2.C.LOG_UNIFORM) and pd.isna(
            prior_pars
        ):
            # default to the parameter bounds
            return (
                f"{row[v2.C.LOWER_BOUND]}{v2.C.PARAMETER_SEPARATOR}"
                f"{row[v2.C.UPPER_BOUND]}"
            )

        return prior_pars

    df[v2.C.PRIOR_PARAMETERS] = df.apply(update_prior_pars, axis=1)

    return df