Source code for petab.v2.petab1to2

"""Convert PEtab version 1 problems to version 2."""

from __future__ import annotations

import re
import shutil
import warnings
from contextlib import suppress
from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse
from uuid import uuid4

import pandas as pd
from pandas.io.common import get_handle, is_url

from .. import v1, v2
from ..v1.math import sympify_petab
from ..v1.yaml import get_path_prefix, load_yaml, validate
from ..versions import get_major_version
from .models import MODEL_TYPE_SBML

__all__ = ["petab1to2"]



[docs]
def petab1to2(
    yaml_config: Path | str, output_dir: Path | str = None
) -> v2.Problem | None:
    """Convert from PEtab 1.0 to PEtab 2.0 format.

    Convert a PEtab problem from PEtab 1.0 to PEtab 2.0 format.

    .. note::

       Some aspects of PEtab v1 were not well-defined. For example, model
       initialization order (e.g., applying initial assignments before or
       after condition table overrides) and the impact of compartment size
       changes were not specified. In such cases, we made assumptions that are
       consistent with the clarified PEtab v2 specifications,
       the PEtab test suite, or common practice.
       Therefore, it is recommended to carefully review the generated PEtab v2
       problem to ensure it aligns with the expected behavior.

    :param yaml_config:
        The PEtab problem as dictionary or YAML file name.
    :param output_dir:
        The output directory to save the converted PEtab problem, or ``None``,
        to return a :class:`petab.v2.Problem` instance.

    :raises ValueError:
        If the input is invalid or does not pass linting or if the generated
        files do not pass linting.
    """
    if output_dir is not None:
        return petab_files_1to2(yaml_config, output_dir)

    with TemporaryDirectory() as tmp_dir:
        petab_files_1to2(yaml_config, tmp_dir)
        return v2.Problem.from_yaml(Path(tmp_dir, Path(yaml_config).name))



def petab_files_1to2(yaml_config: Path | str | dict, output_dir: Path | str):
    """Convert PEtab files from PEtab 1.0 to PEtab 2.0.


    :param yaml_config:
        The PEtab problem as dictionary or YAML file name.
    :param output_dir:
        The output directory to save the converted PEtab problem.

    :raises ValueError:
        If the input is invalid or does not pass linting or if the generated
        files do not pass linting.
    """
    if isinstance(yaml_config, Path | str):
        yaml_file = str(yaml_config)
        path_prefix = get_path_prefix(yaml_file)
        yaml_config = load_yaml(yaml_config)
        get_src_path = lambda filename: f"{path_prefix}/{filename}"  # noqa: E731
    else:
        yaml_file = None
        path_prefix = None
        get_src_path = lambda filename: filename  # noqa: E731

    get_dest_path = lambda filename: f"{output_dir}/{filename}"  # noqa: E731

    # Validate the original PEtab problem
    validate(yaml_config, path_prefix=path_prefix)
    if get_major_version(yaml_config) != 1:
        raise ValueError("PEtab problem is not version 1.")
    petab_problem = v1.Problem.from_yaml(yaml_file or yaml_config)
    # TODO: move to mapping table
    # get rid of conditionName column if present (unsupported in v2)
    petab_problem.condition_df = petab_problem.condition_df.drop(
        columns=[v1.C.CONDITION_NAME], errors="ignore"
    )
    if v1.lint_problem(petab_problem):
        raise ValueError("Provided PEtab problem does not pass linting.")

    output_dir = Path(output_dir)

    # Update YAML file
    new_yaml_config = _update_yaml(yaml_config)
    new_yaml_config = v2.ProblemConfig(**new_yaml_config)

    # Update tables

    # parameter table
    parameter_df = v1v2_parameter_df(petab_problem.parameter_df.copy())
    v2.write_parameter_df(
        parameter_df, get_dest_path(new_yaml_config.parameter_files[0])
    )

    # copy files that don't need conversion: models
    for file in (
        model.location for model in new_yaml_config.model_files.values()
    ):
        _copy_file(get_src_path(file), Path(get_dest_path(file)))

    # Update observable table
    for observable_file in new_yaml_config.observable_files:
        observable_df = v1.get_observable_df(get_src_path(observable_file))
        observable_df = v1v2_observable_df(
            observable_df,
        )
        v2.write_observable_df(observable_df, get_dest_path(observable_file))

    # Update condition table
    for condition_file in new_yaml_config.condition_files:
        condition_df = v1.get_condition_df(get_src_path(condition_file))
        condition_df = v1v2_condition_df(condition_df, petab_problem.model)
        v2.write_condition_df(condition_df, get_dest_path(condition_file))

    # records for the experiment table to be created
    experiments = []

    def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
        if not sim_cond_id and not preeq_cond_id:
            return ""
        # check whether the conditions will exist in the v2 condition table
        sim_cond_exists = (
            petab_problem.condition_df.loc[sim_cond_id].notna().any()
        )
        preeq_cond_exists = (
            preeq_cond_id
            and petab_problem.condition_df.loc[preeq_cond_id].notna().any()
        )
        if not sim_cond_exists and not preeq_cond_exists:
            # if we have only all-NaN conditions, we don't create a new
            #  experiment
            return ""

        if preeq_cond_id:
            preeq_cond_id = f"{preeq_cond_id}_"
        exp_id = f"experiment__{preeq_cond_id}__{sim_cond_id}"
        if exp_id in experiments:  # noqa: B023
            i = 1
            while f"{exp_id}_{i}" in experiments:  # noqa: B023
                i += 1
            exp_id = f"{exp_id}_{i}"
        return exp_id

    measured_experiments = (
        petab_problem.get_simulation_conditions_from_measurement_df()
    )
    for (
        _,
        row,
    ) in measured_experiments.iterrows():
        # generate a new experiment for each simulation / pre-eq condition
        #  combination
        sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID]
        preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "")
        exp_id = create_experiment_id(sim_cond_id, preeq_cond_id)
        if not exp_id:
            continue
        if preeq_cond_id:
            experiments.append(
                {
                    v2.C.EXPERIMENT_ID: exp_id,
                    v2.C.TIME: v2.C.TIME_PREEQUILIBRATION,
                    v2.C.CONDITION_ID: preeq_cond_id,
                }
            )
        experiments.append(
            {
                v2.C.EXPERIMENT_ID: exp_id,
                v2.C.TIME: 0,
                v2.C.CONDITION_ID: sim_cond_id,
            }
        )
    if experiments:
        exp_table_path = output_dir / "experiments.tsv"
        if exp_table_path.exists():
            raise ValueError(
                f"Experiment table file {exp_table_path} already exists."
            )
        new_yaml_config.experiment_files.append("experiments.tsv")
        v2.write_experiment_df(
            v2.get_experiment_df(pd.DataFrame(experiments)), exp_table_path
        )

    for measurement_file in new_yaml_config.measurement_files:
        measurement_df = v1.get_measurement_df(get_src_path(measurement_file))
        # if there is already an experiment ID column, we rename it
        if v2.C.EXPERIMENT_ID in measurement_df.columns:
            measurement_df.rename(
                columns={v2.C.EXPERIMENT_ID: f"experiment_id_{uuid4()}"},
                inplace=True,
            )
        # add pre-eq condition id if not present or convert to string
        #  for simplicity
        if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
            measurement_df.fillna(
                {v1.C.PREEQUILIBRATION_CONDITION_ID: ""}, inplace=True
            )
        else:
            measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""

        if (
            petab_problem.condition_df is not None
            and len(
                set(petab_problem.condition_df.columns) - {v1.C.CONDITION_NAME}
            )
            == 0
        ):
            # we can't have "empty" conditions with no overrides in v2,
            #  therefore, we drop the respective condition ID completely
            #   TODO: or can we?
            # TODO: this needs to be checked condition-wise, not globally
            measurement_df[v1.C.SIMULATION_CONDITION_ID] = ""
            if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
                measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
        # condition IDs to experiment IDs
        measurement_df.insert(
            0,
            v2.C.EXPERIMENT_ID,
            measurement_df.apply(
                lambda row: create_experiment_id(
                    row[v1.C.SIMULATION_CONDITION_ID],
                    row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, ""),
                ),
                axis=1,
            ),
        )
        del measurement_df[v1.C.SIMULATION_CONDITION_ID]
        del measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID]
        v2.write_measurement_df(
            measurement_df, get_dest_path(measurement_file)
        )

    # Write the new YAML file
    new_yaml_file = output_dir / Path(yaml_file).name
    new_yaml_config.to_yaml(new_yaml_file)

    # validate updated Problem
    validation_issues = v2.lint_problem(new_yaml_file)

    if validation_issues:
        sev = v2.lint.ValidationIssueSeverity
        validation_issues.log(max_level=sev.WARNING)
        errors = "\n".join(
            map(
                str,
                (i for i in validation_issues if i.level > sev.WARNING),
            )
        )
        if errors:
            raise ValueError(
                "The generated PEtab v2 problem did not pass linting: "
                f"{errors}"
            )


def _update_yaml(yaml_config: dict) -> dict:
    """Update PEtab 1.0 YAML to PEtab 2.0 format."""
    yaml_config = yaml_config.copy()

    # Update format_version
    yaml_config[v2.C.FORMAT_VERSION] = "2.0.0"

    # Add extensions
    yaml_config[v2.C.EXTENSIONS] = {}

    # Move models and set IDs (filename for now)
    yaml_config[v2.C.MODEL_FILES] = {}
    for problem in yaml_config[v1.C.PROBLEMS]:
        models = {}
        for sbml_file in problem[v1.C.SBML_FILES]:
            model_id = sbml_file.split("/")[-1].split(".")[0]
            models[model_id] = {
                v2.C.MODEL_LANGUAGE: MODEL_TYPE_SBML,
                v2.C.MODEL_LOCATION: sbml_file,
            }
            yaml_config[v2.C.MODEL_FILES] |= models
        del problem[v1.C.SBML_FILES]

        for file_type in (
            v1.C.CONDITION_FILES,
            v1.C.MEASUREMENT_FILES,
            v1.C.OBSERVABLE_FILES,
        ):
            if file_type in problem:
                yaml_config[file_type] = problem[file_type]
                del problem[file_type]
    del yaml_config[v1.C.PROBLEMS]

    # parameter_file -> parameter_files
    if not isinstance(
        (par_files := yaml_config.pop(v1.C.PARAMETER_FILE, [])), list
    ):
        par_files = [par_files]
    yaml_config[v2.C.PARAMETER_FILES] = par_files

    return yaml_config


def _copy_file(src: Path | str, dest: Path):
    """Copy file."""
    # src might be a URL - convert to Path if local
    src_url = urlparse(src)
    if not src_url.scheme:
        src = Path(src)
    elif src_url.scheme == "file" and not src_url.netloc:
        src = Path(src.removeprefix("file:/"))

    if is_url(src):
        with get_handle(src, mode="r") as src_handle:
            with open(dest, "w") as dest_handle:
                dest_handle.write(src_handle.handle.read())
        return

    try:
        if dest.samefile(src):
            return
    except FileNotFoundError:
        shutil.copy(str(src), str(dest))


def v1v2_condition_df(
    condition_df: pd.DataFrame, model: v1.Model
) -> pd.DataFrame:
    """Convert condition table from petab v1 to v2."""
    condition_df = condition_df.copy().reset_index()
    with suppress(KeyError):
        # conditionName was dropped in PEtab v2
        condition_df.drop(columns=[v1.C.CONDITION_NAME], inplace=True)

    condition_df = condition_df.melt(
        id_vars=[v1.C.CONDITION_ID],
        var_name=v2.C.TARGET_ID,
        value_name=v2.C.TARGET_VALUE,
    ).dropna(subset=[v2.C.TARGET_VALUE])

    if condition_df.empty:
        # This happens if there weren't any condition-specific changes
        return pd.DataFrame(
            columns=[
                v2.C.CONDITION_ID,
                v2.C.TARGET_ID,
                v2.C.TARGET_VALUE,
            ]
        )

    return condition_df


def v1v2_observable_df(observable_df: pd.DataFrame) -> pd.DataFrame:
    """Convert observable table from petab v1 to v2.

    Perform all updates that can be done solely on the observable table:
    * drop observableTransformation, update noiseDistribution
    * update placeholder parameters
    """
    df = observable_df.copy().reset_index()

    # drop observableTransformation, update noiseDistribution
    #  if there is no observableTransformation, no need to update
    if v1.C.OBSERVABLE_TRANSFORMATION in df.columns:
        df[v1.C.OBSERVABLE_TRANSFORMATION] = df[
            v1.C.OBSERVABLE_TRANSFORMATION
        ].fillna(v1.C.LIN)

        if v1.C.NOISE_DISTRIBUTION in df:
            df[v1.C.NOISE_DISTRIBUTION] = df[v1.C.NOISE_DISTRIBUTION].fillna(
                v1.C.NORMAL
            )
        else:
            df[v1.C.NOISE_DISTRIBUTION] = v1.C.NORMAL

        # merge observableTransformation into noiseDistribution
        def get_noise_dist(row):
            dist = row.get(v1.C.NOISE_DISTRIBUTION)
            trans = row.get(v1.C.OBSERVABLE_TRANSFORMATION)

            if trans == v1.C.LIN:
                new_dist = dist
            else:
                new_dist = f"{trans}-{dist}"

            if new_dist == "log10-normal":
                warnings.warn(
                    f"Noise distribution `{new_dist}' for "
                    f"observable `{row[v1.C.OBSERVABLE_ID]}'"
                    f" is not supported in PEtab v2. "
                    "Using `log-normal` instead.",
                    # call to `petab1to2`
                    stacklevel=9,
                )
                new_dist = v2.C.LOG_NORMAL

            if new_dist not in v2.C.NOISE_DISTRIBUTIONS:
                raise NotImplementedError(
                    f"Noise distribution `{new_dist}' for "
                    f"observable `{row[v1.C.OBSERVABLE_ID]}'"
                    f" is not supported in PEtab v2."
                )

            return new_dist

        df[v2.C.NOISE_DISTRIBUTION] = df.apply(get_noise_dist, axis=1)
        df.drop(columns=[v1.C.OBSERVABLE_TRANSFORMATION], inplace=True)

    def extract_placeholders(row: pd.Series, type_: str) -> str:
        """Extract placeholders from observable formula."""
        if type_ == "observable":
            formula = row[v1.C.OBSERVABLE_FORMULA]
        elif type_ == "noise":
            formula = row[v1.C.NOISE_FORMULA]
        else:
            raise ValueError(f"Unknown placeholder type: {type_}")

        if pd.isna(formula):
            return ""

        t = f"{re.escape(type_)}Parameter"
        o = re.escape(row[v1.C.OBSERVABLE_ID])

        pattern = re.compile(rf"(?:^|\W)({t}\d+_{o})(?=\W|$)")

        expr = sympify_petab(formula)
        # for 10+ placeholders, the current lexicographical sorting will result
        #  in incorrect ordering of the placeholder IDs, so that they don't
        #  align with the overrides in the measurement table, but who does
        #  that anyway?
        return v2.C.PARAMETER_SEPARATOR.join(
            sorted(
                str(sym)
                for sym in expr.free_symbols
                if sym.is_Symbol and pattern.match(str(sym))
            )
        )

    df[v2.C.OBSERVABLE_PLACEHOLDERS] = df.apply(
        extract_placeholders, args=("observable",), axis=1
    )
    df[v2.C.NOISE_PLACEHOLDERS] = df.apply(
        extract_placeholders, args=("noise",), axis=1
    )

    return df


def v1v2_parameter_df(
    parameter_df: pd.DataFrame,
) -> pd.DataFrame:
    """Convert parameter table from petab v1 to v2.

    Do all the necessary conversions to the parameter table that can
    be done with the parameter table alone.
    """
    df = parameter_df.copy().reset_index()

    # parameter.estimate: int -> bool
    df[v2.C.ESTIMATE] = df[v1.C.ESTIMATE].apply(
        lambda x: str(bool(int(x))).lower()
    )

    def update_prior(row):
        """Convert prior to v2 format."""
        prior_type = row.get(v1.C.OBJECTIVE_PRIOR_TYPE)
        if pd.isna(prior_type):
            prior_type = v1.C.UNIFORM

        pscale = row.get(v1.C.PARAMETER_SCALE)
        if pd.isna(pscale):
            pscale = v1.C.LIN

        if prior_type not in v1.C.PARAMETER_SCALE_PRIOR_TYPES:
            return prior_type

        new_prior_type = prior_type.removeprefix("parameterScale").lower()
        if pscale != v1.C.LIN:
            new_prior_type = f"{pscale}-{new_prior_type}"

        if new_prior_type == "log10-normal":
            warnings.warn(
                f"Prior distribution `{new_prior_type}' for parameter "
                f"`{row[v1.C.PARAMETER_ID]}' is not supported in PEtab v2. "
                "Using `log-normal` instead.",
                # call to `petab1to2`
                stacklevel=9,
            )
            new_prior_type = v2.C.LOG_NORMAL

        if new_prior_type not in v2.C.PRIOR_DISTRIBUTIONS:
            raise NotImplementedError(
                f"PEtab v2 does not support prior type `{new_prior_type}' "
                f"required for parameter `{row[v1.C.PARAMETER_ID]}'."
            )

        return new_prior_type

    # update parameterScale*-priors
    if v1.C.OBJECTIVE_PRIOR_TYPE in df.columns:
        df[v1.C.OBJECTIVE_PRIOR_TYPE] = df.apply(update_prior, axis=1)

    # rename objectivePrior* to prior*
    df.rename(
        columns={
            v1.C.OBJECTIVE_PRIOR_TYPE: v2.C.PRIOR_DISTRIBUTION,
            v1.C.OBJECTIVE_PRIOR_PARAMETERS: v2.C.PRIOR_PARAMETERS,
        },
        inplace=True,
        errors="ignore",
    )
    # some columns were dropped in PEtab v2
    if v1.C.INITIALIZATION_PRIOR_TYPE in df and (
        df[v1.C.INITIALIZATION_PRIOR_TYPE].notna().any()
    ):
        warnings.warn(
            "Initialisation priors in parameter table are not supported "
            "in PEtab v2.",
            stacklevel=9,
        )
    if not (df[v1.C.PARAMETER_SCALE] == v1.C.LIN).all():
        warnings.warn(
            "Parameter scales are not supported in PEtab v2.",
            stacklevel=9,
        )
    df.drop(
        columns=[
            v1.C.INITIALIZATION_PRIOR_TYPE,
            v1.C.INITIALIZATION_PRIOR_PARAMETERS,
            v1.C.PARAMETER_SCALE,
        ],
        inplace=True,
        errors="ignore",
    )

    # if uniform, we need to explicitly set the parameters
    def update_prior_pars(row):
        prior_type = row.get(v2.C.PRIOR_DISTRIBUTION)
        prior_pars = row.get(v2.C.PRIOR_PARAMETERS)

        if prior_type in (v2.C.UNIFORM, v2.C.LOG_UNIFORM) and pd.isna(
            prior_pars
        ):
            return (
                f"{row[v2.C.LOWER_BOUND]}{v2.C.PARAMETER_SEPARATOR}"
                f"{row[v2.C.UPPER_BOUND]}"
            )

        return prior_pars

    df[v2.C.PRIOR_PARAMETERS] = df.apply(update_prior_pars, axis=1)

    return df