From 305e3b0bc8a91cfe783f1903f57a71a11c79067e Mon Sep 17 00:00:00 2001 From: noemibuehrer Date: Tue, 5 Aug 2025 08:28:37 +0200 Subject: [PATCH 01/20] docs: update documentation for `integrate` and `evidence` commands --- src/lyscripts/compute/evidence.py | 12 ++++++++++-- src/lyscripts/integrate.py | 20 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/lyscripts/compute/evidence.py b/src/lyscripts/compute/evidence.py index 8959d46..53aa885 100644 --- a/src/lyscripts/compute/evidence.py +++ b/src/lyscripts/compute/evidence.py @@ -1,4 +1,7 @@ -"""Given samples drawn during thermodynamic integration, compute the model log evidence.""" +"""Given the samples drawn during thermodynamic integration and their +respective log likelihoods, compute the model log evidence and +the Bayesian Information Criterion. +""" from __future__ import annotations import numpy as np @@ -112,7 +115,12 @@ class EvidenceCLI(BaseCLI): def cli_cmd(self) -> None: - """Start the evidence computation process.""" + """Start the ``evidence`` subcommand. + Given the MCMC samples from thermodynamic integration provided by the ``sampling`` + argument and the corresponding inverse temperature schedule, specified in the + ``schedule`` argument, the model evidence is computed using the functions + ``compute_ti_results`` and ``compute_evidence``. Further the BIC is evaluated. + """ data = self.data.load() diff --git a/src/lyscripts/integrate.py b/src/lyscripts/integrate.py index d54a9d6..926e547 100644 --- a/src/lyscripts/integrate.py +++ b/src/lyscripts/integrate.py @@ -1,5 +1,10 @@ """ Perform thermodynamic integration to evaluate the model evidence. + +Using the functions provided by the `sample` module, this script implements +thermodynamic integration (TI) in order to compute the model evidence. +This is done by sampling the model parameters at different inverse temperatures +following a specified schedule. 
""" from __future__ import annotations @@ -42,7 +47,7 @@ def init_ti_sampler( inv_temp: float, pool: Any ) -> emcee.EnsembleSampler: - """Initialize the ``emcee.EnsembleSampler``with the given ``settings''.""" + """Initialize the ``emcee.EnsembleSampler`` for TI with the given ``settings''.""" nwalkers = ndim * settings.sampling.walkers_per_dim backend = get_hdf5_backend( file_path=settings.sampling.storage_file, @@ -88,7 +93,18 @@ class IntegrateCLI(BaseCLI): def cli_cmd(self) -> None: - """Start the thermodynamic integration process.""" + """Start the ``integrate`` subcommand. + + The model construction and setup is done analogously to the + ``sample`` command. Afterwards, an :py:class:`emcee.EnsembleSampler` + is initialized (see :py:func:`init_sampler`) and :py:func:`run_sampling`, + implemented in the ``sample``module, is executed twice for each TI step: + once for the burn-in phase and once for the actual sampling phase. + Thereby, the log likelihood is scaled by the respective inverse + temperature of that step. All necessary settings for the sampling + are passed by the ``sampling``argument, except for the inverse + temperatures, which are provided by the ``schedule`` argument. 
+ """ # as recommended in https://emcee.readthedocs.io/en/stable/tutorials/parallel/# os.environ["OMP_NUM_THREADS"] = "1" From e2311bdc842345617a54b0e8c811d22e8d7b3d81 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 13 Aug 2025 10:15:06 +0200 Subject: [PATCH 02/20] change: centralize inv temp schedule generation --- src/lyscripts/compute/evidence.py | 81 +++++++++++++------------- src/lyscripts/configs.py | 91 ++++++++++++++++++++++------- src/lyscripts/integrate.py | 95 ++++++++++++++----------------- src/lyscripts/schedule.py | 58 +------------------ 4 files changed, 158 insertions(+), 167 deletions(-) diff --git a/src/lyscripts/compute/evidence.py b/src/lyscripts/compute/evidence.py index 53aa885..1cd51e2 100644 --- a/src/lyscripts/compute/evidence.py +++ b/src/lyscripts/compute/evidence.py @@ -1,18 +1,21 @@ -"""Given the samples drawn during thermodynamic integration and their -respective log likelihoods, compute the model log evidence and -the Bayesian Information Criterion. +"""Compute the model evidence from MCMC samples. + +Given the samples drawn during thermodynamic integration and their respective log +likelihoods, compute the model log evidence and the Bayesian Information Criterion. """ from __future__ import annotations -import numpy as np + import json +from pathlib import Path + +import emcee +import h5py +import numpy as np import pandas as pd from loguru import logger from pydantic import Field - -import h5py -import emcee -from pathlib import Path +from scipy.integrate import trapezoid from lyscripts.cli import assemble_main from lyscripts.configs import ( @@ -21,12 +24,10 @@ SamplingConfig, ScheduleConfig, ) -from lyscripts.schedule import SCHEDULES - -from scipy.integrate import trapezoid RNG = np.random.default_rng() + def comp_bic(log_probs: np.ndarray, num_params: int, num_data: int) -> float: r"""Compute the negative one half of the Bayesian Information Criterion (BIC). 
@@ -65,6 +66,7 @@ def compute_evidence( integrals[i] = trapezoid(y=drawn_accuracy, x=temp_schedule) return np.mean(integrals), np.std(integrals) + def compute_ti_results( settings: EvidenceCLI, temp_schedule: np.ndarray, @@ -86,8 +88,12 @@ def compute_ti_results( ti_log_probs = np.zeros(shape=(num_temps, nsteps * nwalker)) for i, run in enumerate(h5_file["ti"]): - reader = emcee.backends.HDFBackend(settings.sampling.storage_file, name=f"ti/{run}", read_only=True) - ti_log_probs[i] = reader.get_blobs(flat=True)['log_prob'] + reader = emcee.backends.HDFBackend( + settings.sampling.storage_file, + name=f"ti/{run}", + read_only=True, + ) + ti_log_probs[i] = reader.get_blobs(flat=True)["log_prob"] evidence, evidence_std = compute_evidence(temp_schedule, ti_log_probs) metrics["evidence"] = evidence @@ -102,7 +108,7 @@ class EvidenceCLI(BaseCLI): data: DataConfig sampling: SamplingConfig schedule: ScheduleConfig = Field( - description="Configuration for generating inverse temperature schedule." + description="Configuration for generating inverse temperature schedule.", ) plots: Path = Field( default="./plots", @@ -113,35 +119,31 @@ class EvidenceCLI(BaseCLI): description="Path to metrics file.", ) - def cli_cmd(self) -> None: """Start the ``evidence`` subcommand. - Given the MCMC samples from thermodynamic integration provided by the ``sampling`` - argument and the corresponding inverse temperature schedule, specified in the - ``schedule`` argument, the model evidence is computed using the functions - ``compute_ti_results`` and ``compute_evidence``. Further the BIC is evaluated. - """ + Given the MCMC samples from thermodynamic integration provided by the + ``sampling`` argument and the corresponding inverse temperature schedule, + specified in the ``schedule`` argument, the model evidence is computed using + the functions :py:func:`compute_ti_results` and :py:func`compute_evidence`. + Further the BIC is evaluated. 
+ """ data = self.data.load() metrics = {} - # temperature schedule: use direct list or generate - if self.schedule.values is not None: - temp_schedule = np.array(self.schedule.values) - logger.info(f"Using direct temperature values: {temp_schedule}") - else: - func = SCHEDULES[self.schedule.method] - temp_schedule = func(self.schedule.num, self.schedule.power) - logger.info(f"Generated {self.schedule.method} schedule: {temp_schedule}") - - - with h5py.File(self.sampling.storage_file, mode='r') as h5_file: + temp_schedule = self.schedule.get_schedule() + + with h5py.File(self.sampling.storage_file, mode="r") as h5_file: # Get ndim from the HDF5 backend - backend = emcee.backends.HDFBackend(self.sampling.storage_file, read_only=True, name=self.sampling.dataset) + backend = emcee.backends.HDFBackend( + self.sampling.storage_file, + read_only=True, + name=self.sampling.dataset, + ) ndim = backend.shape[1] logger.info(f"Inferred {ndim} parameters from stored samples") - + # if TI has been performed, compute the evidence if "ti" in h5_file: temp_schedule, ti_log_probs = compute_ti_results( @@ -172,10 +174,12 @@ def cli_cmd(self) -> None: ) beta_vs_accuracy.to_csv(self.plots, index=False) logger.info(f"Plotted β vs accuracy at {self.plots}") - + # use blobs, because also for TI, this is the unscaled log-prob - final_log_probs = backend.get_blobs()['log_prob'] - logger.info(f"Opened samples from emcee backend from {self.sampling.storage_file}") + final_log_probs = backend.get_blobs()["log_prob"] + logger.info( + f"Opened samples from emcee backend from {self.sampling.storage_file}", + ) # store metrics in JSON file self.metrics.parent.mkdir(parents=True, exist_ok=True) @@ -191,11 +195,10 @@ def cli_cmd(self) -> None: with open(self.metrics, mode="w", encoding="utf-8") as metrics_file: json.dump(metrics, metrics_file) - + logger.info(f"Wrote out metrics to {self.metrics}") + if __name__ == "__main__": main = assemble_main(settings_cls=EvidenceCLI, prog_name="compute 
evidence") main() - - diff --git a/src/lyscripts/configs.py b/src/lyscripts/configs.py index ed9ffb5..ebae236 100644 --- a/src/lyscripts/configs.py +++ b/src/lyscripts/configs.py @@ -18,7 +18,7 @@ from collections.abc import Callable, Sequence from copy import deepcopy from pathlib import Path -from typing import Annotated, Any, Literal, Optional +from typing import Annotated, Any, Literal import numpy as np import pandas as pd @@ -436,7 +436,7 @@ class SamplingConfig(BaseModel): default=None, description=( "Number of burn-in steps to take. If None, burn-in runs until convergence." - ) + ), ) num_steps: int | None = Field( default=100, @@ -466,28 +466,75 @@ def load(self, thin: int = 1) -> np.ndarray: thin=thin, ) + +def geometric_schedule(num: int, *_a) -> np.ndarray: + """Create a geometric sequence of ``num`` numbers from 0 to 1.""" + log_seq = np.logspace(0.0, 1.0, num) + shifted_seq = log_seq - 1.0 + return shifted_seq / 9.0 + + +def linear_schedule(num: int, *_a) -> np.ndarray: + """Create a linear sequence of ``num`` numbers from 0 to 1. + + Equivalent to the :py:func:`power_schedule` with ``power=1``. + """ + return np.linspace(0.0, 1.0, num) + + +def power_schedule(num: int, power: float, *_a) -> np.ndarray: + """Create a power sequence of ``num`` numbers from 0 to 1. + + This is essentially a :py:func:`linear_schedule` of ``num`` numbers from 0 to 1, + but each number is raised to the power of ``power``. 
+ """ + lin_seq = np.linspace(0.0, 1.0, num) + return lin_seq**power + + +SCHEDULES = { + "geometric": geometric_schedule, + "linear": linear_schedule, + "power": power_schedule, +} + + class ScheduleConfig(BaseModel): - """Configuration for generating a schedule of inverse temperatures.""" + """Configuration for generating a schedule of inverse temperatures.""" + + method: Literal["geometric", "linear", "power"] = Field( + default="power", + description="Method to generate the inverse temperature schedule.", + ) + num: int = Field( + default=32, + description="Number of inverse temperatures in the schedule.", + ) + power: float = Field( + default=4.0, + description="If a power schedule is chosen, use this as power.", + ) + values: list[float] | None = Field( + default=None, + description=( + "List of inverse temperatures to use instead of generating a schedule. " + "If a list is provided, the other parameters are ignored." + ), + ) + + def get_schedule(self) -> np.ndarray: + """Get the inverse temperature schedule as a numpy array.""" + if self.values is not None: + logger.debug("Using provided inverse temperature values.") + schedule = np.array(self.values) + else: + logger.debug(f"Generating inverse temperature schedule with {self.method}.") + func = SCHEDULES[self.method] + schedule = func(self.num, self.power) + + logger.info(f"Generated inverse temperature schedule: {schedule}") + return schedule - method: Literal["geometric", "linear", "power"] = Field( - default="power", - description="Method to generate the inverse temperature schedule.", - ) - num: int = Field( - default=32, - description="Number of inverse temperatures in the schedule.", - ) - power: float = Field( - default=4.0, - description="If a power schedule is chosen, use this as power.", - ) - values: list[float] | None = Field( - default=None, - description=( - "List of inverse temperatures to use instead of generating a schedule. " - "If a list is provided, the other parameters are ignored." 
- ), - ) def map_to_optional_bool(value: Any) -> Any: """Try to convert the options in the `PatternType` to a boolean value.""" diff --git a/src/lyscripts/integrate.py b/src/lyscripts/integrate.py index 926e547..c213116 100644 --- a/src/lyscripts/integrate.py +++ b/src/lyscripts/integrate.py @@ -1,5 +1,4 @@ -""" -Perform thermodynamic integration to evaluate the model evidence. +"""Perform thermodynamic integration to evaluate the model evidence. Using the functions provided by the `sample` module, this script implements thermodynamic integration (TI) in order to compute the model evidence. @@ -11,17 +10,16 @@ import os from typing import Any -from loguru import logger import emcee -import numpy as np import h5py - -from lyscripts.cli import assemble_main -from pydantic import Field - +import numpy as np +from loguru import logger from lydata.utils import ModalityConfig +from pydantic import Field +import lyscripts.sample as sample_module # Import the module to set its global MODEL +from lyscripts.cli import assemble_main from lyscripts.configs import ( BaseCLI, DataConfig, @@ -30,31 +28,28 @@ ModelConfig, SamplingConfig, ScheduleConfig, - construct_model, add_distributions, add_modalities, + construct_model, ) - -import lyscripts.sample as sample_module # Import the module to set its global MODEL -from lyscripts.schedule import SCHEDULES - from lyscripts.utils import get_hdf5_backend + def init_ti_sampler( - settings: IntegrateCLI, - temp_idx: int, - ndim: int, - inv_temp: float, - pool: Any + settings: IntegrateCLI, + temp_idx: int, + ndim: int, + inv_temp: float, + pool: Any, ) -> emcee.EnsembleSampler: """Initialize the ``emcee.EnsembleSampler`` for TI with the given ``settings''.""" nwalkers = ndim * settings.sampling.walkers_per_dim backend = get_hdf5_backend( - file_path=settings.sampling.storage_file, - dataset=f"ti/{temp_idx+1:0>2d}", - nwalkers=nwalkers, - ndim=ndim, - ) + file_path=settings.sampling.storage_file, + dataset=f"ti/{temp_idx + 1:0>2d}", + 
nwalkers=nwalkers, + ndim=ndim, + ) return emcee.EnsembleSampler( nwalkers=nwalkers, ndim=ndim, @@ -67,6 +62,7 @@ def init_ti_sampler( parameter_names=list(MODEL.get_named_params().keys()), ) + class IntegrateCLI(BaseCLI): """Perform thermodynamic integration to compute the model evidence.""" @@ -88,21 +84,20 @@ class IntegrateCLI(BaseCLI): data: DataConfig sampling: SamplingConfig schedule: ScheduleConfig = Field( - description="Configuration for generating inverse temperature schedule." + description="Configuration for generating inverse temperature schedule.", ) - def cli_cmd(self) -> None: """Start the ``integrate`` subcommand. - - The model construction and setup is done analogously to the - ``sample`` command. Afterwards, an :py:class:`emcee.EnsembleSampler` - is initialized (see :py:func:`init_sampler`) and :py:func:`run_sampling`, + + The model construction and setup is done analogously to the + ``sample`` command. Afterwards, an :py:class:`emcee.EnsembleSampler` + is initialized (see :py:func:`init_sampler`) and :py:func:`run_sampling`, implemented in the ``sample``module, is executed twice for each TI step: - once for the burn-in phase and once for the actual sampling phase. - Thereby, the log likelihood is scaled by the respective inverse - temperature of that step. All necessary settings for the sampling - are passed by the ``sampling``argument, except for the inverse + once for the burn-in phase and once for the actual sampling phase. + Thereby, the log likelihood is scaled by the respective inverse + temperature of that step. All necessary settings for the sampling + are passed by the ``sampling``argument, except for the inverse temperatures, which are provided by the ``schedule`` argument. 
""" # as recommended in https://emcee.readthedocs.io/en/stable/tutorials/parallel/# @@ -110,41 +105,34 @@ def cli_cmd(self) -> None: logger.debug(self.model_dump_json(indent=2)) - # ugly, but necessary for pickling + # ugly, but necessary for pickling global MODEL MODEL = construct_model(self.model, self.graph) MODEL = add_distributions(MODEL, self.distributions) MODEL = add_modalities(MODEL, self.modalities) MODEL.load_patient_data(**self.data.get_load_kwargs()) ndim = MODEL.get_num_dims() - + # set MODEL in the sample module's namespace so log_prob_fn can access it sample_module.MODEL = MODEL - # temperature schedule: use direct list or generate - if self.schedule.values is not None: - temp_schedule = np.array(self.schedule.values) - logger.info(f"Using direct temperature values: {temp_schedule}") - else: - func = SCHEDULES[self.schedule.method] - temp_schedule = func(self.schedule.num, self.schedule.power) - logger.info(f"Generated {self.schedule.method} schedule: {temp_schedule}") - + schedule = self.schedule.get_schedule() + # emcee does not support numpy's new random number generator yet. 
- np.random.seed(self.sampling.seed) # noqa: NPY002 + np.random.seed(self.sampling.seed) # noqa: NPY002 with sample_module.get_pool(self.sampling.cores) as pool: - for idx, inv_temp in enumerate(temp_schedule): + for idx, inv_temp in enumerate(schedule): sampler = init_ti_sampler( settings=self, temp_idx=idx, ndim=ndim, inv_temp=inv_temp, - pool=pool + pool=pool, ) sample_module.run_sampling( - description=f"Burn-in phase: TI step {idx+1}/{len(temp_schedule)}", + description=f"Burn-in phase: TI step {idx + 1}/{len(schedule)}", sampler=sampler, num_steps=self.sampling.burnin_steps, check_interval=self.sampling.check_interval, @@ -154,7 +142,7 @@ def cli_cmd(self) -> None: ) sample_module.run_sampling( - description=f"Sampling phase: TI step {idx+1}/{len(temp_schedule)}", + description=f"Sampling phase: TI step {idx + 1}/{len(schedule)}", sampler=sampler, num_steps=self.sampling.num_steps, reset_backend=True, @@ -163,8 +151,13 @@ def cli_cmd(self) -> None: ) # copy last sampling round over to a group in the HDF5 file called "mcmc" with h5py.File(self.sampling.storage_file, mode="r+") as h5_file: - h5_file.copy(f"ti/{len(temp_schedule):0>2d}", h5_file, name=self.sampling.dataset) + h5_file.copy( + f"ti/{len(schedule):0>2d}", + h5_file, + name=self.sampling.dataset, + ) + if __name__ == "__main__": main = assemble_main(settings_cls=IntegrateCLI, prog_name="integrate") - main() \ No newline at end of file + main() diff --git a/src/lyscripts/schedule.py b/src/lyscripts/schedule.py index 16b367f..99fa334 100644 --- a/src/lyscripts/schedule.py +++ b/src/lyscripts/schedule.py @@ -10,72 +10,20 @@ :math:`\beta_i^k` where :math:`k` could e.g. be 5. 
""" -from typing import Literal - -import numpy as np from loguru import logger -from pydantic import Field from lyscripts.cli import assemble_main -from lyscripts.configs import BaseCLI - - -def geometric_schedule(num: int, *_a) -> np.ndarray: - """Create a geometric sequence of ``num`` numbers from 0 to 1.""" - log_seq = np.logspace(0.0, 1.0, num) - shifted_seq = log_seq - 1.0 - return shifted_seq / 9.0 - - -def linear_schedule(num: int, *_a) -> np.ndarray: - """Create a linear sequence of ``num`` numbers from 0 to 1. - - Equivalent to the :py:func:`power_schedule` with ``power=1``. - """ - return np.linspace(0.0, 1.0, num) - +from lyscripts.configs import BaseCLI, ScheduleConfig -def power_schedule(num: int, power: float, *_a) -> np.ndarray: - """Create a power sequence of ``num`` numbers from 0 to 1. - This is essentially a :py:func:`linear_schedule` of ``num`` numbers from 0 to 1, - but each number is raised to the power of ``power``. - """ - lin_seq = np.linspace(0.0, 1.0, num) - return lin_seq**power - - -SCHEDULES = { - "geometric": geometric_schedule, - "linear": linear_schedule, - "power": power_schedule, -} - - -class ScheduleCLI(BaseCLI): +class ScheduleCLI(ScheduleConfig, BaseCLI): """Generate an inverse temperature schedule for thermodynamic integration.""" - method: Literal["geometric", "linear", "power"] = Field( - default="geometric", - description="Choose the method to distribute the inverse temperatures.", - ) - num: int = Field( - default=32, - description="Number of inverse temperatures in the schedule.", - ) - power: float = Field( - default=4, - description="If a power schedule is chosen, use this as power.", - ) - def cli_cmd(self) -> None: """Start the ``schedule`` command.""" logger.debug(self.model_dump_json(indent=2)) - func = SCHEDULES[self.method] - schedule = func(self.num, self.power) - - for inv_temp in schedule: + for inv_temp in self.get_schedule(): # print is necessary to allow piping the output print(inv_temp) # noqa: T201 From 
663bd69c54dfd050cabad5ea3acdd071a389bf4a Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:13:55 +0200 Subject: [PATCH 03/20] change: store selected loglevel globally --- src/lyscripts/cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lyscripts/cli.py b/src/lyscripts/cli.py index 26870c7..c1baae2 100644 --- a/src/lyscripts/cli.py +++ b/src/lyscripts/cli.py @@ -17,6 +17,8 @@ from rich.logging import RichHandler from rich_argparse import ArgumentDefaultsRichHelpFormatter +_current_log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO" + def assemble_main( settings_cls: type[BaseSettings], @@ -80,11 +82,12 @@ def configure_logging( """ logger.enable("lyscripts") logger.enable("lydata") - log_level = somewhat_safely_get_loglevel(argv=argv) + global _current_log_level + _current_log_level = somewhat_safely_get_loglevel(argv=argv) logger.remove() handler = RichHandler(console=console) logger.add( sink=handler, - level=log_level, + level=_current_log_level, format="{message}", ) From ef6f51bf7e7cf920dd5db7c7cf72303d913e60ed Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:54:07 +0200 Subject: [PATCH 04/20] feat: add collector utility/command --- pyproject.toml | 11 +- src/lyscripts/__init__.py | 2 +- src/lyscripts/data/__init__.py | 2 + src/lyscripts/data/collect/__init__.py | 163 ++++++++++++++++++++++++ src/lyscripts/data/collect/collector.js | 150 ++++++++++++++++++++++ src/lyscripts/data/collect/index.html | 26 ++++ 6 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 src/lyscripts/data/collect/__init__.py create mode 100644 src/lyscripts/data/collect/collector.js create mode 100644 src/lyscripts/data/collect/index.html diff --git a/pyproject.toml b/pyproject.toml index 5659c16..69ba6a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ 
"pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0", "numpydantic", "loguru", - "lydata @ git+https://github.com/lycosystem/lydata-package@eb99daf6cd65d93879cfa3552923f25c0a9c29bf", + "lydata @ git+https://github.com/lycosystem/lydata-package@79c5d4996b37a833afca457629d8d854acc05a33", ] dynamic = ["version"] @@ -81,9 +81,18 @@ dev = [ [project.scripts] lyscripts = "lyscripts:main" +[tool.setuptools] +include-package-data = true + [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +"lyscripts" = [ + "src/lyscripts/data/collect/collector.js", + "src/lyscripts/data/collect/index.html", +] + [tool.setuptools_scm] write_to = "src/lyscripts/_version.py" local_scheme = "no-local-version" diff --git a/src/lyscripts/__init__.py b/src/lyscripts/__init__.py index 6727547..e9dbcb1 100644 --- a/src/lyscripts/__init__.py +++ b/src/lyscripts/__init__.py @@ -17,7 +17,7 @@ CliSubCommand, ) -from lyscripts import compute, data, sample, schedule, integrate # noqa: F401 +from lyscripts import compute, data, integrate, sample, schedule # noqa: F401 from lyscripts._version import version from lyscripts.cli import assemble_main, configure_logging from lyscripts.utils import console diff --git a/src/lyscripts/data/__init__.py b/src/lyscripts/data/__init__.py index c9863ec..eb86dbd 100644 --- a/src/lyscripts/data/__init__.py +++ b/src/lyscripts/data/__init__.py @@ -17,6 +17,7 @@ from pydantic_settings import BaseSettings, CliApp, CliSubCommand from lyscripts.data import ( # noqa: F401 + collect, enhance, fetch, generate, @@ -32,6 +33,7 @@ class DataCLI(BaseSettings): """Work with lymphatic progression data through this CLI.""" + collect: CliSubCommand[collect.CollectorCLI] lyproxify: CliSubCommand[lyproxify.LyproxifyCLI] join: CliSubCommand[join.JoinCLI] split: CliSubCommand[split.SplitCLI] diff --git a/src/lyscripts/data/collect/__init__.py b/src/lyscripts/data/collect/__init__.py new file mode 100644 index 0000000..44e1f19 --- /dev/null +++ 
b/src/lyscripts/data/collect/__init__.py @@ -0,0 +1,163 @@ +"""Submodule to collect data interactively using a simple web interface. + +With the simply command + +.. code-block:: bash + + lyscripts data collect + +One can start a very basic web server that serves an interactive UI at +``http://localhost:8000/``. There, one can enter patient, tumor, and lymphatic +involvement data one by one. When completed, the "submit" button will parse, validate, +and convert the data to serve a downloadable CSV file. + +This resulting CSV file is in the correct format to be used in `LyProX`_ and for +inference using our `lymph-model`_ library. + +.. _LyProX: https://lyprox.org +.. _lymph-model: https://lymph-model.readthedocs.io +""" + +import inspect +import io +import logging +from pathlib import Path +from typing import Any + +import lydata +import lydata.validator +import pandas as pd +from fastapi import FastAPI, HTTPException +from fastapi.responses import StreamingResponse +from loguru import logger +from pydantic import Field, RootModel +from starlette.responses import FileResponse, HTMLResponse + +from lyscripts.cli import _current_log_level +from lyscripts.configs import BaseCLI + +app = FastAPI( + title="lyDATA Collector", + description=( + "A simple web interface to collect data for the lyDATA datasets. " + "This is a prototype and not intended for production use." 
+ ), + version=lydata.__version__, +) + +BASE_DIR = Path(__file__).parent +modalities = lydata.schema.get_default_modalities() +RecordModel = lydata.schema.create_full_record_model(modalities, title="Record") +ROOT_MODEL = RootModel[list[RecordModel]] + + +@app.get("/") +def serve_index() -> HTMLResponse: + """Serve the index.html file.""" + with open(BASE_DIR / "index.html") as file: + content = file.read() + return HTMLResponse(content=content) + + +@app.get("/schema") +def serve_schema() -> dict[str, Any]: + """Serve the JSON schema for the patient and tumor records.""" + return ROOT_MODEL.model_json_schema() + + +@app.get("/collector.js") +def serve_collector_js() -> FileResponse: + """Serve the collector.js file.""" + return FileResponse(BASE_DIR / "collector.js") + + +@app.post("/submit") +async def process(data: RootModel) -> StreamingResponse: + """Convert the submitted data to a DataFrame.""" + logger.info(f"Received data: {data.root}") + + if len(data.root) == 0: + logger.warning("No records provided in the data.") + raise HTTPException( + status_code=400, + detail="No records provided in the data.", + ) + + flattened_records = [] + + for record in data.root: + flattened_record = lydata.validator.flatten(record) + logger.debug(f"Flattened record: {flattened_record}") + flattened_records.append(flattened_record) + + df = pd.DataFrame(flattened_records) + df.columns = pd.MultiIndex.from_tuples(flattened_record.keys()) + logger.info(df.patient.core.head()) + + buffer = io.StringIO() + df.to_csv(buffer, index=False) + buffer.seek(0) + logger.success("Data prepared for download") + return StreamingResponse( + buffer, + media_type="text/csv", + headers={"Content-Disposition": "attachment; filename=lydata_records.csv"}, + ) + + +class InterceptHandler(logging.Handler): + """Intercept logging messages and redirect them to Loguru.""" + + def emit(self, record: logging.LogRecord) -> None: + """Intercept the log record and redirect it to Loguru.""" + # Get 
corresponding Loguru level if it exists. + try: + level: str | int = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message. + frame, depth = inspect.currentframe(), 0 + while frame: + filename = frame.f_code.co_filename + is_logging = filename == logging.__file__ + is_frozen = "importlib" in filename and "_bootstrap" in filename + if depth > 0 and not (is_logging or is_frozen): + break + frame = frame.f_back + depth += 1 + + logger.opt(depth=depth, exception=record.exc_info).log( + level, + record.getMessage(), + ) + + +class CollectorCLI(BaseCLI): + """Command-line interface for the lyDATA collector.""" + + hostname: str = Field( + default="localhost", + description="Hostname to run the FastAPI app on.", + ) + port: int = Field( + default=8000, + description="Port to run the FastAPI app on.", + ) + + def cli_cmd(self) -> None: + """Run the FastAPI app.""" + logger.debug(self.model_dump_json(indent=2)) + import uvicorn + + # Intercept standard logging and redirect it to Loguru + logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True) + logger.enable("lydata") + + uvicorn.run( + app, + host=self.hostname, + port=self.port, + log_level=_current_log_level.lower(), + log_config=None, + ) diff --git a/src/lyscripts/data/collect/collector.js b/src/lyscripts/data/collect/collector.js new file mode 100644 index 0000000..d5e4a59 --- /dev/null +++ b/src/lyscripts/data/collect/collector.js @@ -0,0 +1,150 @@ +/** + * Client-side helper functions for collecting user input through JSONEditor, + * validating it against a fetched JSON Schema, submitting the validated data + * to the backend, and presenting a downloadable CSV returned by the server. + * + * NOTE: Functionality is intentionally unchanged; only readability and + * documentation have been improved. + */ + +/** + * Ensure an alert element (used to display validation errors) exists. 
+ * Creates and appends it if missing. + * + * @returns {HTMLDivElement} The existing or newly created alert element. + */ +function ensureAlertExists() { + let alertElement = document.querySelector('.alert'); + if (!alertElement) { + alertElement = document.createElement('div'); + } + alertElement.className = 'alert alert-danger'; + const editorHolder = document.getElementById('editor_holder'); + editorHolder.appendChild(alertElement); + return alertElement; +} + +/** + * Remove an existing validation alert if present. + */ +function ensureAlertRemoved() { + const existingAlert = document.querySelector('.alert'); + if (existingAlert) { + console.log('Clearing existing alert'); + existingAlert.remove(); + } +} + +/** + * Remove an existing download button (if it exists) to avoid duplicates. + */ +function ensureDownloadButtonRemoved() { + const existingButton = document.getElementById('download_link'); + if (existingButton) { + console.log('Clearing existing download button'); + existingButton.remove(); + } +} + +/** + * Create (or replace) a download button for a CSV blob returned by the server. + * + * @param {Blob} blob - The CSV data blob to make downloadable. + */ +function createDownloadButton(blob) { + ensureDownloadButtonRemoved(); + + const url = window.URL.createObjectURL(blob); + const downloadLink = document.createElement('a'); + downloadLink.id = 'download_link'; + downloadLink.href = url; + downloadLink.textContent = 'Download CSV'; + downloadLink.className = 'btn btn-success'; + downloadLink.download = 'lydata_records.csv'; + + document.getElementById('editor_holder').appendChild(downloadLink); + console.log('Download button created:', downloadLink); +} + +/** + * Send validated editor data to the backend for processing. Expects a CSV blob + * in response which is then exposed via a generated download button. + * + * @param {JSONEditor} editor - The JSONEditor instance from which to read data. 
+ */ +async function sendEditorData(editor) { + const data = editor.getValue(); + console.log('Sending data:', data); + + try { + const response = await fetch('/submit', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(data) + }); + + if (!response.ok) { + // Try to parse error details from JSON, fallback to text + let errorMsg = 'Unknown error'; + try { + const err = await response.json(); + errorMsg = err.detail || err.message || errorMsg; + } catch { + errorMsg = await response.text(); + } + throw new Error(errorMsg); + } + + const blob = await response.blob(); + console.log('Received processed data as blob:', blob); + createDownloadButton(blob); + } catch (error) { + ensureDownloadButtonRemoved(); + console.error('Error submitting data:', error); + const alert = ensureAlertExists(); + alert.textContent = 'Error submitting data: ' + error.message; + alert.classList.add('alert-danger'); + } +} + +/** + * Validate the editor content. If there are validation errors they are + * displayed in an alert; otherwise the data is submitted to the backend. + * + * @param {JSONEditor} editor - The JSONEditor instance to validate & submit. 
+ */ +function processEditor(editor) { + const errors = editor.validate(); + + if (errors.length) { + console.error('Validation errors:', errors); + const alert = ensureAlertExists(); + alert.textContent = 'Validation errors: ' + errors.map(e => e.message).join(', '); + } else { + console.log('Data successfully validated'); + ensureAlertRemoved(); + sendEditorData(editor); + } +} + +// Fetch the JSON Schema to initialize the editor +fetch('/schema') + .then(response => response.json()) + .then(schema => { + const element = document.getElementById('editor_holder'); + const options = { + disable_edit_json: true, + theme: 'bootstrap5', + iconlib: 'bootstrap', + object_layout: 'grid', + schema: schema + }; + const editor = new JSONEditor(element, options); + + // Bind the submit button to validation + submission flow + document.getElementById('submit').addEventListener('click', () => { + console.log('Submit button clicked'); + processEditor(editor); + }); + }) + .catch(error => console.error('Error loading schema:', error)); diff --git a/src/lyscripts/data/collect/index.html b/src/lyscripts/data/collect/index.html new file mode 100644 index 0000000..931dacf --- /dev/null +++ b/src/lyscripts/data/collect/index.html @@ -0,0 +1,26 @@ + + + + + + Basic JSON Editor Example + + + + + + + + +
+

LyDATA Collector

+ +
+ + + +
+ + + From 0bc2a64801acecab04a9711151e3cbd6654a1268 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:02:00 +0200 Subject: [PATCH 05/20] docs: improve collect description --- README.md | 6 ++- docs/source/data/collect.rst | 13 ++++++ docs/source/data/init.rst | 1 + docs/source/index.rst | 2 +- src/lyscripts/cli.py | 30 +++++++++++++ src/lyscripts/data/__init__.py | 6 +++ src/lyscripts/data/collect/__init__.py | 61 ++++++++++---------------- 7 files changed, 80 insertions(+), 39 deletions(-) create mode 100644 docs/source/data/collect.rst diff --git a/README.md b/README.md index 9621e28..e64323b 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,11 @@ Simply run lyscripts --help ``` -in your terminal and let the output guide you through the functions of the program. +in your terminal to display the help text for the main command. It will list all subcommands that are available, which you can then also call with `lyscripts --help` to get more information on its use and the available arguments. + +For example, one subcommand is `lyscripts data collect`, which will launch a small web server that allows a user to enter patient records on lymphatic involvement in head and neck cancer one row at a time and construct a standardized CSV file from it. + + You can also refer to the [documentation] for a written-down version of all these help texts and even more context on how and why to use the provided commands. diff --git a/docs/source/data/collect.rst b/docs/source/data/collect.rst new file mode 100644 index 0000000..8c8b08b --- /dev/null +++ b/docs/source/data/collect.rst @@ -0,0 +1,13 @@ +.. currentmodule:: lyscripts.data.collect + +Collect lyDATA Tables Interactively +=================================== + +.. automodule:: lyscripts.data.collect + :members: + :show-inheritance: + +Command Help +------------ + +.. 
program-output:: lyscripts data collect --help diff --git a/docs/source/data/init.rst b/docs/source/data/init.rst index e53ee60..014bb85 100644 --- a/docs/source/data/init.rst +++ b/docs/source/data/init.rst @@ -19,6 +19,7 @@ Submodules .. toctree:: :maxdepth: 1 + collect lyproxify join split diff --git a/docs/source/index.rst b/docs/source/index.rst index 1f36252..647e4e1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,7 +8,7 @@ Introduction .. include:: ../../README.md - :end-line: 30 + :end-before: :parser: myst_parser.sphinx_ diff --git a/src/lyscripts/cli.py b/src/lyscripts/cli.py index c1baae2..b6448d0 100644 --- a/src/lyscripts/cli.py +++ b/src/lyscripts/cli.py @@ -8,6 +8,8 @@ .. _loguru: https://loguru.readthedocs.io/en/stable """ +import inspect +import logging from collections.abc import Callable from typing import Literal @@ -91,3 +93,31 @@ def configure_logging( level=_current_log_level, format="{message}", ) + + +class InterceptHandler(logging.Handler): + """Intercept logging messages and redirect them to Loguru.""" + + def emit(self, record: logging.LogRecord) -> None: + """Intercept the log record and redirect it to Loguru.""" + # Get corresponding Loguru level if it exists. + try: + level: str | int = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where originated the logged message. 
+ frame, depth = inspect.currentframe(), 0 + while frame: + filename = frame.f_code.co_filename + is_logging = filename == logging.__file__ + is_frozen = "importlib" in filename and "_bootstrap" in filename + if depth > 0 and not (is_logging or is_frozen): + break + frame = frame.f_back + depth += 1 + + logger.opt(depth=depth, exception=record.exc_info).log( + level, + record.getMessage(), + ) diff --git a/src/lyscripts/data/__init__.py b/src/lyscripts/data/__init__.py index eb86dbd..29ffe69 100644 --- a/src/lyscripts/data/__init__.py +++ b/src/lyscripts/data/__init__.py @@ -7,6 +7,12 @@ the installed datasets provided by the `lydata`_ package and directly from the associated `GitHub repository`_. +Another cool feature is the built-in mini web application that allows collecting nodal +involvement data interactively and in the same standardized format as we have published +in the past, both on `LyProX`_ and in our `GitHub repository`_. It can be launched by +running `lyscripts data collect` in the terminal. See the docs for the +:py:mod:`lyscripts.data.collect` submodule for more information. + .. _Make: https://www.gnu.org/software/make/ .. _DVC: https://dvc.org .. _LyProX: https://lyprox.org diff --git a/src/lyscripts/data/collect/__init__.py b/src/lyscripts/data/collect/__init__.py index 44e1f19..cd681c8 100644 --- a/src/lyscripts/data/collect/__init__.py +++ b/src/lyscripts/data/collect/__init__.py @@ -1,6 +1,6 @@ """Submodule to collect data interactively using a simple web interface. -With the simply command +With the simple command .. code-block:: bash @@ -11,14 +11,13 @@ involvement data one by one. When completed, the "submit" button will parse, validate, and convert the data to serve a downloadable CSV file. -This resulting CSV file is in the correct format to be used in `LyProX`_ and for +The resulting CSV file is in the correct format to be used in `LyProX`_ and for inference using our `lymph-model`_ library. .. _LyProX: https://lyprox.org .. 
_lymph-model: https://lymph-model.readthedocs.io """ -import inspect import io import logging from pathlib import Path @@ -33,7 +32,7 @@ from pydantic import Field, RootModel from starlette.responses import FileResponse, HTMLResponse -from lyscripts.cli import _current_log_level +from lyscripts.cli import InterceptHandler, _current_log_level from lyscripts.configs import BaseCLI app = FastAPI( @@ -52,8 +51,8 @@ @app.get("/") -def serve_index() -> HTMLResponse: - """Serve the index.html file.""" +def serve_index_html() -> HTMLResponse: + """Serve the ``index.html`` file at the URL's root.""" with open(BASE_DIR / "index.html") as file: content = file.read() return HTMLResponse(content=content) @@ -67,13 +66,29 @@ def serve_schema() -> dict[str, Any]: @app.get("/collector.js") def serve_collector_js() -> FileResponse: - """Serve the collector.js file.""" + """Serve the ``collector.js`` file under ``"http://{host}:{port}/collector.js"``. + + This frontend JavaScript file loads the `JSON-Editor`_ library and initializes it + using the schema returned by the :py:func:`serve_schema` function. + + .. _JSON-Editor: https://github.com/json-editor/json-editor/ + """ return FileResponse(BASE_DIR / "collector.js") @app.post("/submit") async def process(data: RootModel) -> StreamingResponse: - """Convert the submitted data to a DataFrame.""" + """Process the submitted data to a DataFrame. + + `FastAPI`_ will automatically parse the received JSON data into the list of + instances of the pydantic type defined by the + :py:func:`lydata.schema.create_full_record_model` function. + + From this list, we create a pandas DataFrame and return it as a downloadable CSV + file. + + .. 
_FastAPI: https://fastapi.tiangolo.com/ + """ logger.info(f"Received data: {data.root}") if len(data.root) == 0: @@ -105,36 +120,8 @@ async def process(data: RootModel) -> StreamingResponse: ) -class InterceptHandler(logging.Handler): - """Intercept logging messages and redirect them to Loguru.""" - - def emit(self, record: logging.LogRecord) -> None: - """Intercept the log record and redirect it to Loguru.""" - # Get corresponding Loguru level if it exists. - try: - level: str | int = logger.level(record.levelname).name - except ValueError: - level = record.levelno - - # Find caller from where originated the logged message. - frame, depth = inspect.currentframe(), 0 - while frame: - filename = frame.f_code.co_filename - is_logging = filename == logging.__file__ - is_frozen = "importlib" in filename and "_bootstrap" in filename - if depth > 0 and not (is_logging or is_frozen): - break - frame = frame.f_back - depth += 1 - - logger.opt(depth=depth, exception=record.exc_info).log( - level, - record.getMessage(), - ) - - class CollectorCLI(BaseCLI): - """Command-line interface for the lyDATA collector.""" + """Serve a FastAPI web app for collecting involvement patterns as CSV files.""" hostname: str = Field( default="localhost", From 1bb0cbfa515250138ca23f779458d7b70950915a Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 15 Aug 2025 11:33:46 +0200 Subject: [PATCH 06/20] chore: add CITATION.cff --- CITATION.cff | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100755 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100755 index 0000000..70404a2 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,25 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: lyscripts +message: >- + If you use this software, please cite it using the + metadata from this file. 
+type: software +authors: + - given-names: Roman + family-names: Ludwig + orcid: 'https://orcid.org/0000-0001-9434-328X' + affiliation: University Hospital Zurich +repository-code: 'https://github.com/lycosystem/lyscripts' +url: 'https://lyscripts.readthedocs.io' +abstract: >- + Scripts for reproducible research on lymphatic tumor + progression in head and neck cancer. +keywords: + - cancer + - metastasis + - lymphatic system + - head and neck +license: MIT From 73e60688bd7d12b66fbccb4132377b23a06f734f Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:48:29 +0200 Subject: [PATCH 07/20] build: add uvicorn, fastapi to deps --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 69ba6a6..112ecf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,8 @@ dependencies = [ "pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0", "numpydantic", "loguru", + "fastapi", + "uvicorn", "lydata @ git+https://github.com/lycosystem/lydata-package@79c5d4996b37a833afca457629d8d854acc05a33", ] dynamic = ["version"] From fbff10240ab7e7fced48bea4880c13a133b7a252 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:48:42 +0200 Subject: [PATCH 08/20] change: disable properties in collector --- src/lyscripts/data/collect/collector.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lyscripts/data/collect/collector.js b/src/lyscripts/data/collect/collector.js index d5e4a59..3d8d5d9 100644 --- a/src/lyscripts/data/collect/collector.js +++ b/src/lyscripts/data/collect/collector.js @@ -137,6 +137,7 @@ fetch('/schema') theme: 'bootstrap5', iconlib: 'bootstrap', object_layout: 'grid', + disable_properties: true, schema: schema }; const editor = new JSONEditor(element, options); From 4d49a9f6db3e3017826968755267e2d21c608866 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> 
Date: Tue, 26 Aug 2025 11:17:01 +0200 Subject: [PATCH 09/20] fix: use fixed lydata `.enhance()` method --- pyproject.toml | 2 +- src/lyscripts/data/collect/__init__.py | 2 +- src/lyscripts/data/enhance.py | 5 +---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 112ecf2..af6b90d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ dependencies = [ "loguru", "fastapi", "uvicorn", - "lydata @ git+https://github.com/lycosystem/lydata-package@79c5d4996b37a833afca457629d8d854acc05a33", + "lydata @ git+https://github.com/lycosystem/lydata-package@8469a26bea1d275b7ced43fbe3a269c8b30b60f4", ] dynamic = ["version"] diff --git a/src/lyscripts/data/collect/__init__.py b/src/lyscripts/data/collect/__init__.py index cd681c8..425c728 100644 --- a/src/lyscripts/data/collect/__init__.py +++ b/src/lyscripts/data/collect/__init__.py @@ -46,7 +46,7 @@ BASE_DIR = Path(__file__).parent modalities = lydata.schema.get_default_modalities() -RecordModel = lydata.schema.create_full_record_model(modalities, title="Record") +RecordModel = lydata.schema.create_full_record_model(modalities, model_name="Record") ROOT_MODEL = RootModel[list[RecordModel]] diff --git a/src/lyscripts/data/enhance.py b/src/lyscripts/data/enhance.py index a8a795a..87b1e23 100644 --- a/src/lyscripts/data/enhance.py +++ b/src/lyscripts/data/enhance.py @@ -40,12 +40,9 @@ def cli_cmd(self) -> None: logger.debug(self.model_dump_json(indent=2)) data: LyDataFrame = self.input.load() - data[self.method] = data.ly.combine( + data = data.ly.enhance( modalities=self.modalities, method=self.method, - ) - data[self.method] = data.ly.augment( - modality=self.method, subdivisions=self.lnl_subdivisions, ) save_table_to_csv(file_path=self.output_file, table=data) From 7cddc7a9964e5848cfec8952845b3cf38b6510c5 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:06:33 +0200 Subject: [PATCH 10/20] test: update 
tests for new lydata --- tests/compute/prevalences_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/compute/prevalences_test.py b/tests/compute/prevalences_test.py index 471b78d..3fe2151 100644 --- a/tests/compute/prevalences_test.py +++ b/tests/compute/prevalences_test.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from lydata import infer_and_combine_levels, load_datasets +from lydata import load_datasets from lyscripts.compute.prevalences import observe_prevalence from lyscripts.configs import DiagnosisConfig, ScenarioConfig @@ -24,7 +24,7 @@ def scenario_config() -> ScenarioConfig: def data() -> pd.DataFrame: """Load one of the lyDATA datasets.""" data = next(load_datasets(year=2021, institution="usz")) - return infer_and_combine_levels(data) + return data.ly.enhance() def test_observe_prevalence( @@ -37,5 +37,5 @@ def test_observe_prevalence( scenario_config=scenario_config, ) - assert portion.match == 67 + assert portion.match == 66 assert portion.total == 150 From 255af68809ea9af8691d05ccc9ba6e105f77e3be Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 4 Sep 2025 11:28:53 +0200 Subject: [PATCH 11/20] build: require at least lydata 0.4.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index af6b90d..1f095be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ dependencies = [ "loguru", "fastapi", "uvicorn", - "lydata @ git+https://github.com/lycosystem/lydata-package@8469a26bea1d275b7ced43fbe3a269c8b30b60f4", + "lydata >= 0.4.0", ] dynamic = ["version"] From e4f38fa791cc0b981e60e53ab18b84982ba98f46 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 4 Sep 2025 11:34:40 +0200 Subject: [PATCH 12/20] chore: update changelog --- CHANGELOG.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index cfdd3f0..4b0fcda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,44 @@ All notable changes to this project will be documented in this file. +## [1.0.0] - 2025-09-04 + +### Bug Fixes + +- Change `info` to `core` for mid-level lydata columns +- Use bug-fixed lydata `.ly.enhance()` method. + +### Documentation + +- Update documentation for `integrate` and `evidence` commands. +- Improve `data collect` description. + +### Features + +- Add `integrate` command for thermodynamic integration. Thanks [@noemibuehrer]! +- Add command spawning webserver for interactive data collection. + +### Miscellaneous Tasks + +- Add missing links to changelog. +- Add CITATION.cff. + +### Testing + +- Update tests for new lydata. + +### Build + +- Add uvicorn, fastapi to deps. +- Require at least lydata 0.4.0. + +### Change + +- Make compatible with new lyDATA version. +- Centralize inverse temperature schedule generation. +- Store selected log-level globally. +- Disable properties in collector. + ## [1.0.0rc3] - 2025-07-22 ### Documentation @@ -871,6 +909,7 @@ returns `None` instead. Fixes [#11] ## [0.5.3] - 2022-08-22 +[1.0.0]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc3...1.0.0 [1.0.0rc3]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc2...1.0.0rc3 [1.0.0rc2]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc1...1.0.0rc2 [1.0.0rc1]: https://github.com/lycosystem/lyscripts/compare/1.0.0.a7...1.0.0rc1 @@ -938,3 +977,4 @@ returns `None` instead. 
Fixes [#11] [`rich`]: https://rich.readthedocs.io/en/latest/ [`rich_argparse`]: https://github.com/hamdanal/rich_argparse [LyProX]: https://lyprox.org +[@noemibuehrer]: https://github.com/noemibuehrer From 8e3f0324547073422eebe6ef5a0faebe640e0897 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 4 Sep 2025 11:38:38 +0200 Subject: [PATCH 13/20] change: update JSON schema for YAML files --- schemas/ly.json | 103 +++++++++++++++++++++++++++++++++++++--- src/lyscripts/schema.py | 1 + 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/schemas/ly.json b/schemas/ly.json index 6bb3ca1..561127c 100644 --- a/schemas/ly.json +++ b/schemas/ly.json @@ -308,18 +308,46 @@ "type": "string" }, "repo_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], "default": "lycosystem/lydata", "description": "GitHub `repository/owner`.", - "minLength": 1, - "title": "Repo Name", - "type": "string" + "title": "Repo Name" }, "ref": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], "default": "main", "description": "Branch/tag/commit of the repo.", - "minLength": 1, - "title": "Ref", - "type": "string" + "title": "Ref" + }, + "local_dataset_dir": { + "anyOf": [ + { + "format": "directory-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Path to directory containing all the dataset subdirectories. So, e.g. if `path_on_disk` is `~/datasets` and the dataset is `2023-clb-multisite`, then the CSV file is expected to be at `~/datasets/2023-clb-multisite/data.csv`.", + "title": "Local Dataset Dir" } }, "required": [ @@ -501,6 +529,19 @@ "title": "Relative Thresh", "type": "number" }, + "burnin_steps": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of burn-in steps to take. 
If None, burn-in runs until convergence.", + "title": "Burnin Steps" + }, "num_steps": { "anyOf": [ { @@ -623,6 +664,52 @@ ], "title": "ScenarioConfig", "type": "object" + }, + "ScheduleConfig": { + "description": "Configuration for generating a schedule of inverse temperatures.", + "properties": { + "method": { + "default": "power", + "description": "Method to generate the inverse temperature schedule.", + "enum": [ + "geometric", + "linear", + "power" + ], + "title": "Method", + "type": "string" + }, + "num": { + "default": 32, + "description": "Number of inverse temperatures in the schedule.", + "title": "Num", + "type": "integer" + }, + "power": { + "default": 4.0, + "description": "If a power schedule is chosen, use this as power.", + "title": "Power", + "type": "number" + }, + "values": { + "anyOf": [ + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of inverse temperatures to use instead of generating a schedule. 
If a list is provided, the other parameters are ignored.", + "title": "Values" + } + }, + "title": "ScheduleConfig", + "type": "object" } }, "description": "Settings for generating a JSON schema for lyscripts configuration files.", @@ -685,6 +772,10 @@ }, "title": "Scenarios", "type": "array" + }, + "schedule": { + "$ref": "#/$defs/ScheduleConfig", + "default": null } }, "required": [ diff --git a/src/lyscripts/schema.py b/src/lyscripts/schema.py index cab7c68..7470af3 100644 --- a/src/lyscripts/schema.py +++ b/src/lyscripts/schema.py @@ -53,6 +53,7 @@ class SchemaSettings(BaseModel): model: configs.ModelConfig = None sampling: configs.SamplingConfig = None scenarios: list[configs.ScenarioConfig] = [] + schedule: configs.ScheduleConfig = None def main() -> None: From 349507f215a168c5eb4d2d00a129f8ae234a6e07 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:49:08 +0200 Subject: [PATCH 14/20] fix: convert dtypes during joining --- src/lyscripts/data/join.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lyscripts/data/join.py b/src/lyscripts/data/join.py index d7eb62e..a48db9a 100644 --- a/src/lyscripts/data/join.py +++ b/src/lyscripts/data/join.py @@ -60,7 +60,9 @@ def cli_cmd(self) -> None: joined = None for data_config in self.inputs: - data = data_config.load() + # `.convert_dtypes()` ensures that e.g. boolean values are not suddenly + # converted to strings when a dataset with missing values is concatenated. 
+ data = data_config.load().convert_dtypes() if joined is None: joined = data else: From e017b881403763850692059a04a161649e180865 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:55:43 +0200 Subject: [PATCH 15/20] chore: update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b0fcda..76e866e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ All notable changes to this project will be documented in this file. +## [1.0.1] - 2025-09-10 + +### Bug Fixes + +- `.convert_dtypes()` during joining of tables. Not doing this caused pandas to interpret e.g. booleans with missing values as strings. + ## [1.0.0] - 2025-09-04 ### Bug Fixes @@ -909,6 +915,7 @@ returns `None` instead. Fixes [#11] ## [0.5.3] - 2022-08-22 +[1.0.1]: https://github.com/lycosystem/lyscripts/compare/1.0.0...1.0.1 [1.0.0]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc3...1.0.0 [1.0.0rc3]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc2...1.0.0rc3 [1.0.0rc2]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc1...1.0.0rc2 From 8adbf4a4a6b351be7d422c1c031d2d1b123192bc Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 10 Sep 2025 17:00:04 +0200 Subject: [PATCH 16/20] change: use lydata's `cast_dtypes()` --- src/lyscripts/data/join.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lyscripts/data/join.py b/src/lyscripts/data/join.py index a48db9a..8b0547e 100644 --- a/src/lyscripts/data/join.py +++ b/src/lyscripts/data/join.py @@ -3,6 +3,7 @@ from pathlib import Path import pandas as pd +from lydata.validator import cast_dtypes from pydantic import Field from lyscripts.cli import assemble_main @@ -60,9 +61,10 @@ def cli_cmd(self) -> None: joined = None for data_config in self.inputs: - # `.convert_dtypes()` ensures that e.g. 
boolean values are not suddenly + data = data_config.load() + # `cast_dtypes()` ensures that e.g. boolean values are not suddenly # converted to strings when a dataset with missing values is concatenated. - data = data_config.load().convert_dtypes() + data = cast_dtypes(data) if joined is None: joined = data else: From 9848cb5def0ec56218b189ccb6b254879dc4c0cd Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Wed, 10 Sep 2025 17:00:47 +0200 Subject: [PATCH 17/20] chore: update changlog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76e866e..c5e5ef2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file. ### Bug Fixes -- `.convert_dtypes()` during joining of tables. Not doing this caused pandas to interpret e.g. booleans with missing values as strings. +- Convert dtypes during joining using lydata's `cast_dtypes()`. ## [1.0.0] - 2025-09-04 From deccc8972e9d271286ef1a564df8be0eeec2a1e4 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:50:18 +0200 Subject: [PATCH 18/20] chore: change email addresses --- pyproject.toml | 4 +++- src/lyscripts/__init__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1f095be..75401c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,9 @@ build-backend = "setuptools.build_meta" name = "lyscripts" description = "Package to interact with lymphatic progression data and models." 
authors = [ - {name = "Roman Ludwig", email = "roman.ludwig@usz.ch"} + {name = "Roman Ludwig", email = "gygqdstu3@mozmail.com"}, + {name = "Yoel Pérez Haas", email = "yoel.perezhaas@usz.ch"}, + {name = "Noemi Bührer", email = "noemi.buehrer@usz.ch"}, ] readme = "README.md" requires-python = ">=3.10" diff --git a/src/lyscripts/__init__.py b/src/lyscripts/__init__.py index e9dbcb1..32808f2 100644 --- a/src/lyscripts/__init__.py +++ b/src/lyscripts/__init__.py @@ -25,7 +25,7 @@ __version__ = version __description__ = "Package to interact with lymphatic progression data and models." __author__ = "Roman Ludwig" -__email__ = "roman.ludwig@usz.ch" +__email__ = "gygqdstu3@mozmail.com" __uri__ = "https://github.com/lycosystem/lyscripts" # activate copy on write in pandas. From c96615b8f18b6da5f8d8e2c92f6819f04fdea77a Mon Sep 17 00:00:00 2001 From: noemibuehrer Date: Mon, 2 Mar 2026 11:15:43 +0100 Subject: [PATCH 19/20] fix: correct TI evaluation and leave out uncertainty reporting --- src/lyscripts/compute/evidence.py | 25 +++++++++---------------- src/lyscripts/evaluate.py | 25 +++++++++---------------- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/src/lyscripts/compute/evidence.py b/src/lyscripts/compute/evidence.py index 1cd51e2..ffce481 100644 --- a/src/lyscripts/compute/evidence.py +++ b/src/lyscripts/compute/evidence.py @@ -48,23 +48,17 @@ def comp_bic(log_probs: np.ndarray, num_params: int, num_data: int) -> float: def compute_evidence( temp_schedule: np.ndarray, log_probs: np.ndarray, - num: int = 1000, -) -> tuple[float, float]: - """Compute the evidence and its standard deviation. +) -> float: + """Compute the evidence. Given a ``temp_schedule`` of inverse temperatures and corresponding sets of - ``log_probs``, draw ``num`` "paths" of log-probabilities and compute the evidence - for each using trapezoidal integration. - - The evidence is then the mean of those ``num`` integrations, while the error is - their standard deviation. 
+ ``log_probs``, we calculate the mean ``log_prob`` over all samples to approximate + the expectation value under the corresponding power posterior for each step in the + ``temp_schedule``. The evidence is evaluated using trapezoidal integration of the + expectation values over the ``temp_schedule``. """ - integrals = np.zeros(shape=num) - for i in range(num): - rand_idx = RNG.choice(log_probs.shape[1], size=log_probs.shape[0]) - drawn_accuracy = log_probs[np.arange(log_probs.shape[0]), rand_idx].copy() - integrals[i] = trapezoid(y=drawn_accuracy, x=temp_schedule) - return np.mean(integrals), np.std(integrals) + a_mc = np.mean(log_probs, axis=1) + return trapezoid(y=a_mc, x=temp_schedule) def compute_ti_results( @@ -95,9 +89,8 @@ def compute_ti_results( ) ti_log_probs[i] = reader.get_blobs(flat=True)["log_prob"] - evidence, evidence_std = compute_evidence(temp_schedule, ti_log_probs) + evidence = compute_evidence(temp_schedule, ti_log_probs) metrics["evidence"] = evidence - metrics["evidence_std"] = evidence_std return temp_schedule, ti_log_probs diff --git a/src/lyscripts/evaluate.py b/src/lyscripts/evaluate.py index d09c5dd..5f9dbd0 100644 --- a/src/lyscripts/evaluate.py +++ b/src/lyscripts/evaluate.py @@ -90,23 +90,17 @@ def comp_bic(log_probs: np.ndarray, num_params: int, num_data: int) -> float: def compute_evidence( temp_schedule: np.ndarray, log_probs: np.ndarray, - num: int = 1000, -) -> tuple[float, float]: - """Compute the evidence and its standard deviation. +) -> float: + """Compute the evidence. Given a ``temp_schedule`` of inverse temperatures and corresponding sets of - ``log_probs``, draw ``num`` "paths" of log-probabilities and compute the evidence - for each using trapezoidal integration. - - The evidence is then the mean of those ``num`` integrations, while the error is - their standard deviation. 
+ ``log_probs``, we calculate the mean ``log_prob`` over all samples to approximate + the expectation value under the corresponding power posterior for each step in the + ``temp_schedule``. The evidence is evaluated using trapezoidal integration of the + expectation values over the ``temp_schedule``. """ - integrals = np.zeros(shape=num) - for i in range(num): - rand_idx = RNG.choice(log_probs.shape[1], size=log_probs.shape[0]) - drawn_accuracy = log_probs[np.arange(log_probs.shape[0]), rand_idx].copy() - integrals[i] = trapezoid(y=drawn_accuracy, x=temp_schedule) - return np.mean(integrals), np.std(integrals) + a_mc = np.mean(log_probs, axis=1) + return trapezoid(y=a_mc, x=temp_schedule) def compute_ti_results( @@ -134,9 +128,8 @@ def compute_ti_results( reader = emcee.backends.HDFBackend(model, name=f"ti/{run}", read_only=True) ti_log_probs[i] = reader.get_blobs(flat=True) - evidence, evidence_std = compute_evidence(temp_schedule, ti_log_probs) + evidence = compute_evidence(temp_schedule, ti_log_probs) metrics["evidence"] = evidence - metrics["evidence_std"] = evidence_std return temp_schedule, ti_log_probs From d38f77c2c6fbc242b30330e5143ecdfd79f84e5c Mon Sep 17 00:00:00 2001 From: noemibuehrer Date: Wed, 8 Apr 2026 16:13:49 +0200 Subject: [PATCH 20/20] chore: update changelog --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5e5ef2..84abe31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,16 @@ All notable changes to this project will be documented in this file. +## [1.0.2] - 2026-04-08 + +### Bug Fixes + +- Correct TI evaluation and leave out uncertainty reporting. Fixes [#81]. + +### Miscellaneous Tasks + +- Change email addresses. + ## [1.0.1] - 2025-09-10 ### Bug Fixes @@ -915,6 +925,7 @@ returns `None` instead. 
Fixes [#11] ## [0.5.3] - 2022-08-22 +[1.0.2]: https://github.com/lycosystem/lyscripts/compare/1.0.1...1.0.2 [1.0.1]: https://github.com/lycosystem/lyscripts/compare/1.0.0...1.0.1 [1.0.0]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc3...1.0.0 [1.0.0rc3]: https://github.com/lycosystem/lyscripts/compare/1.0.0rc2...1.0.0rc3 @@ -979,6 +990,7 @@ returns `None` instead. Fixes [#11] [#72]: https://github.com/lycosystem/lyscripts/issues/72 [#74]: https://github.com/lycosystem/lyscripts/issues/74 [#75]: https://github.com/lycosystem/lyscripts/issues/75 +[#81]: https://github.com/lycosystem/lyscripts/issues/81 [`emcee`]: https://emcee.readthedocs.io/en/stable/ [`rich`]: https://rich.readthedocs.io/en/latest/