From c23505998c500b2bc0764e7aa9daa27c490a6a95 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Mon, 9 Feb 2026 10:52:00 -0800 Subject: [PATCH 1/4] indentation fixes fixed indentation of docstring in `dataset.Dataset.split_train_other(...)` --- coderdata/dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index 39c2a5ba..695db22c 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -322,7 +322,7 @@ def split_train_other( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> TwoWaySplit: - """ + """ Split the dataset into training and another subset (e.g., testing or validation). Parameters From e239a49ff1e09d563ba7cc11f3cc29092c914231 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Mon, 9 Feb 2026 10:54:05 -0800 Subject: [PATCH 2/4] fixes to dataset.yml added quotation marks to all dataset citations since ":" were interfering with proper parsing in select cases --- coderdata/dataset.yml | 54 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/coderdata/dataset.yml b/coderdata/dataset.yml index 22eb8746..15e91270 100644 --- a/coderdata/dataset.yml +++ b/coderdata/dataset.yml @@ -4,9 +4,9 @@ datasets: beataml: description: Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia tumor data. Data includes drug response, proteomics, and transcriptomics datasets. references: - - citation: Bottomly D, Long N, Schultz AR, et al. Integrative analysis of drug response and clinical outcome in acute myeloid leukemia. Cancer Cell. 2022;40(8):850-864.e9. + - citation: "Bottomly D, Long N, Schultz AR, et al. Integrative analysis of drug response and clinical outcome in acute myeloid leukemia. Cancer Cell. 2022;40(8):850-864.e9." doi: https://doi.org/10.1016/j.ccell.2022.07.002 - - citation: Pino JC, Posso C, Joshi SK, et al. 
Mapping the proteogenomic landscape enables prediction of drug response in acute myeloid leukemia. Cell Rep Med. 2024;5(1):101359. + - citation: "Pino JC, Posso C, Joshi SK, et al. Mapping the proteogenomic landscape enables prediction of drug response in acute myeloid leukemia. Cell Rep Med. 2024;5(1):101359." doi: https://doi.org/10.1016/j.xcrm.2023.101359 modalities: - sample @@ -20,7 +20,7 @@ datasets: bladder: description: Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Data includes transcriptomics, mutations, copy number, and drug response data. references: - - citation: Suk Hyung Lee, Wenhuo Hu, Justin T. Matulay, et al. Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Cell. 2018;173(2):515-528.e17. + - citation: "Suk Hyung Lee, Wenhuo Hu, Justin T. Matulay, et al. Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Cell. 2018;173(2):515-528.e17." doi: https://doi.org/10.1016/j.cell.2018.03.017 modalities: - sample @@ -34,7 +34,7 @@ datasets: ccle: description: Cancer Cell Line Encyclopedia (CCLE). references: - - citation: Barretina J, Caponigro G, Stransky N, et al. The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. Nature. 2012;483(7391):603-607. + - citation: "Barretina J, Caponigro G, Stransky N, et al. The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. Nature. 2012;483(7391):603-607." doi: https://doi.org/10.1038/nature11003 modalities: - sample @@ -49,7 +49,7 @@ datasets: cptac: description: The Clinical Proteomic Tumor Analysis Consortium (CPTAC) project is a collaborative network funded by the National Cancer Institute (NCI) focused on improving our understanding of cancer biology through the integration of transcriptomic, proteomic, and genomic data. references: - - citation: Lindgren CM, Adams DW, Kimball B, et al. 
Simplified and Unified Access to Cancer Proteogenomic Data. J Proteome Res. 2021;20(4):1902-1910. + - citation: "Lindgren CM, Adams DW, Kimball B, et al. Simplified and Unified Access to Cancer Proteogenomic Data. J Proteome Res. 2021;20(4):1902-1910." doi: https://doi.org/10.1021/acs.jproteome.0c00919 modalities: - sample @@ -61,11 +61,11 @@ datasets: ctrpv2: description: Cancer Therapeutics Response Portal version 2 (CTRPv2) references: - - citation: Rees MG, Seashore-Ludlow B, Cheah JH, et al. Correlating chemical sensitivity and basal gene expression reveals mechanism of action. Nat Chem Biol. 2016;12(2):109-116. + - citation: "Rees MG, Seashore-Ludlow B, Cheah JH, et al. Correlating chemical sensitivity and basal gene expression reveals mechanism of action. Nat Chem Biol. 2016;12(2):109-116." doi: https://doi.org/10.1038/nchembio.1986 - - citation: Seashore-Ludlow B, Rees MG, Cheah JH, et al. Harnessing Connectivity in a Large-Scale Small-Molecule Sensitivity Dataset. Cancer Discov. 2015;5(11):1210-1223. + - citation: "Seashore-Ludlow B, Rees MG, Cheah JH, et al. Harnessing Connectivity in a Large-Scale Small-Molecule Sensitivity Dataset. Cancer Discov. 2015;5(11):1210-1223." doi: https://doi.org/10.1158/2159-8290.CD-15-0235 - - citation: Basu A, Bodycombe NE, Cheah JH, et al. An interactive resource to identify cancer genetic and lineage dependencies targeted by small molecules. Cell. 2013;154(5):1151-1161. + - citation: "Basu A, Bodycombe NE, Cheah JH, et al. An interactive resource to identify cancer genetic and lineage dependencies targeted by small molecules. Cell. 2013;154(5):1151-1161." doi: https://doi.org/10.1016/j.cell.2013.08.003 modalities: - sample @@ -79,9 +79,9 @@ datasets: fimm: description: Institute for Molecular Medicine Finland (FIMM) dataset. references: - - citation: Mpindi JP, Yadav B, Östling P, et al. Consistency in drug response profiling. Nature. 2016;540(7631):E5-E6. + - citation: "Mpindi JP, Yadav B, Östling P, et al. 
Consistency in drug response profiling. Nature. 2016;540(7631):E5-E6." doi: https://doi.org/10.1038/nature20171 - - citation: Pemovska T, Kontro M, Yadav B, et al. Individualized systems medicine strategy to tailor treatments for patients with chemorefractory acute myeloid leukemia. Cancer Discov. 2013;3(12):1416-1429. + - citation: "Pemovska T, Kontro M, Yadav B, et al. Individualized systems medicine strategy to tailor treatments for patients with chemorefractory acute myeloid leukemia. Cancer Discov. 2013;3(12):1416-1429." doi: https://doi.org/10.1158/2159-8290.CD-13-0350 modalities: - sample @@ -101,7 +101,7 @@ datasets: mpnst: description: Malignant Peripheral Nerve Sheath Tumor is a rare, aggressive sarcoma that affects peripheral nerves throughout the body. references: - - citation: Dehner C, Moon CI, Zhang X, et al. Chromosome 8 gain is associated with high-grade transformation in MPNST. JCI Insight. 2021;6(6):e146351. + - citation: "Dehner C, Moon CI, Zhang X, et al. Chromosome 8 gain is associated with high-grade transformation in MPNST. JCI Insight. 2021;6(6):e146351." doi: https://doi.org/10.1172/jci.insight.146351 modalities: - sample @@ -116,7 +116,7 @@ datasets: nci60: description: National Cancer Institute 60. references: - - citation: Shoemaker RH. The NCI60 human tumour cell line anticancer drug screen. Nat Rev Cancer. 2006;6(10):813-823. + - citation: "Shoemaker RH. The NCI60 human tumour cell line anticancer drug screen. Nat Rev Cancer. 2006;6(10):813-823." doi: https://doi.org/10.1038/nrc1951 modalities: - sample @@ -130,7 +130,7 @@ datasets: pancreas: description: Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Data includes transcriptomics, mutations, copy number, and drug response data. references: - - citation: Tiriac H, Belleau P, Engle DD, et al. Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Cancer Discov. 2018;8(9):1112-1129. 
+ - citation: "Tiriac H, Belleau P, Engle DD, et al. Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Cancer Discov. 2018;8(9):1112-1129." doi: https://doi.org/10.1158/2159-8290.CD-18-0349 modalities: - sample @@ -144,9 +144,9 @@ datasets: prism: description: Profiling Relative Inhibition Simultaneously in Mixtures. references: - - citation: Corsello SM, Nagari RT, Spangler RD, et al. Discovering the anti-cancer potential of non-oncology drugs by systematic viability profiling. Nat Cancer. 2020;1(2):235-248. + - citation: "Corsello SM, Nagari RT, Spangler RD, et al. Discovering the anti-cancer potential of non-oncology drugs by systematic viability profiling. Nat Cancer. 2020;1(2):235-248." doi: https://doi.org/10.1038/s43018-019-0018-6 - - citation: Yu C, Mannan AM, Yvone GM, et al. High-throughput identification of genotype-specific cancer vulnerabilities in mixtures of barcoded tumor cell lines. Nat Biotechnol. 2016;34(4):419-423. + - citation: "Yu C, Mannan AM, Yvone GM, et al. High-throughput identification of genotype-specific cancer vulnerabilities in mixtures of barcoded tumor cell lines. Nat Biotechnol. 2016;34(4):419-423." doi: https://doi.org/10.1038/nbt.3460 modalities: - sample @@ -158,7 +158,7 @@ datasets: sarcoma: description: The landscape of drug sensitivity and resistance in sarcoma. Data includes transcriptomics, mutations, and drug response data. references: - - citation: Al Shihabi A, Tebon PJ, Nguyen HTL, et al. The landscape of drug sensitivity and resistance in sarcoma. Cell Stem Cell. 2024;31(10):1524-1542.e4. + - citation: "Al Shihabi A, Tebon PJ, Nguyen HTL, et al. The landscape of drug sensitivity and resistance in sarcoma. Cell Stem Cell. 2024;31(10):1524-1542.e4." doi: https://doi.org/10.1016/j.stem.2024.08.010 modalities: - sample @@ -171,7 +171,7 @@ datasets: colorectal: description: Living organoid biobank of colorectal cancer patients. 
references: - - citation: van de Wetering M, Francies HE, Francis JM, et al. Prospective derivation of a living organoid biobank of colorectal cancer patients. Cell. 2015;161(4):933-945. + - citation: "van de Wetering M, Francies HE, Francis JM, et al. Prospective derivation of a living organoid biobank of colorectal cancer patients. Cell. 2015;161(4):933-945." doi: https://doi.org/10.1016/j.cell.2015.03.053 modalities: - sample @@ -184,7 +184,7 @@ datasets: liver: description: Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. references: - - citation: Ji S, Feng L, Fu Z, et al. Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. Sci Transl Med. 2023;15(706):eadg3358. + - citation: "Ji S, Feng L, Fu Z, et al. Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. Sci Transl Med. 2023;15(706):eadg3358." doi: https://doi.org/10.1126/scitranslmed.adg3358 modalities: - sample @@ -197,7 +197,7 @@ datasets: novartis: description: Patient-derived tumor xenografts for drug response prediction. references: - - citation: Gao H, Korn JM, Ferretti S, et al. High-throughput screening using patient-derived tumor xenografts to predict clinical trial drug response. Nat Med. 2015;21(11):1318–1325. + - citation: "Gao H, Korn JM, Ferretti S, et al. High-throughput screening using patient-derived tumor xenografts to predict clinical trial drug response. Nat Med. 2015;21(11):1318–1325." doi: https://doi.org/10.1038/nm.3954 modalities: - sample @@ -210,9 +210,9 @@ datasets: gcsi: description: The Genentech Cell Line Screening Initiative (gCSI) references: - - citation: Haverty PM, Lin E, Tan J, et al. Reproducible pharmacogenomic profiling of cancer cell line panels. Nature. 2016;533(7603):333–337. + - citation: "Haverty PM, Lin E, Tan J, et al. Reproducible pharmacogenomic profiling of cancer cell line panels. Nature. 2016;533(7603):333–337." 
doi: https://doi.org/10.1038/nature17987 - - citation: Klijn C, Durinck S, Stawiski EW, et al. A comprehensive transcriptional portrait of human cancer cell lines. Nat Biotechnol. 2015;33(3):306–312. + - citation: "Klijn C, Durinck S, Stawiski EW, et al. A comprehensive transcriptional portrait of human cancer cell lines. Nat Biotechnol. 2015;33(3):306–312." doi: https://doi.org/10.1038/nbt.3080 modalities: - sample @@ -226,11 +226,11 @@ datasets: gdscv1: description: Genomics of Drug Sensitivity in Cancer version 1 (GDSCv1) references: - - citation: Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575. + - citation: "Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575." doi: https://doi.org/10.1038/nature11005 - - citation: Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754. + - citation: "Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754." doi: https://doi.org/10.1016/j.cell.2016.06.017 - - citation: Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961. + - citation: "Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961." doi: https://doi.org/10.1093/nar/gks1111 modalities: - sample @@ -244,11 +244,11 @@ datasets: gdscv2: description: Genomics of Drug Sensitivity in Cancer version 2 (GDSCv2) references: - - citation: Garnett MJ, Edelman EJ, Heidorn SJ, et al. 
Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575. + - citation: "Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575." doi: https://doi.org/10.1038/nature11005 - - citation: Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754. + - citation: "Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754." doi: https://doi.org/10.1016/j.cell.2016.06.017 - - citation: Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961. + - citation: "Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961." doi: https://doi.org/10.1093/nar/gks1111 modalities: - sample From 1a03b357b6ac4587037a37e975efabea6cca888d Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Mon, 9 Feb 2026 11:40:00 -0800 Subject: [PATCH 3/4] fix to mRECIST conversion in prepare_data_for_improve.py conversion of mRECIST values needed for some PDX datasets. The formatting function of coderdata returns a DF with the mRECIST column having dtype 'str'. Previously 'floats' were assigned to the values, causing a TypeError. Note that the datasets still misspell mRECIST as mRESCIST. 
--- scripts/prepare_data_for_improve.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index 553aecfe..50d3c65a 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -203,10 +203,10 @@ def process_datasets(args): ], ) # conversion logic from mRECIST -> auc - experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = 0.1 - experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = 0.2 - experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = 0.5 - experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = 1.0 + experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = "0.1" + experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = "0.2" + experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = "0.5" + experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = "1.0" experiment.rename(columns={'mRESCIST': 'auc'}, inplace=True) experiments.append(experiment) From d52ba7feb907fd2b2d5e1206ae3d03e4f6556a6e Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Wed, 11 Feb 2026 10:33:50 -0800 Subject: [PATCH 4/4] cast study name to lower for the improve framework to work properly study identifiers in the split file names and the responses have to be of the same case. 
This fix is implemented such that this is guaranteed --- scripts/prepare_data_for_improve.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index 50d3c65a..573b5c92 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -248,7 +248,9 @@ def process_datasets(args): response_data['improve_sample_id'] = "SAMPLE-ID-" + response_data['improve_sample_id'].astype(int).astype(str) # exporting the drug response data to 'y_data/response.tsv' outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv") - response_data.to_csv( + response_out = deepcopy(response_data) + response_out['study'] = response_out['study'].str.lower() + response_out.to_csv( path_or_buf=outfile_path, index=False, sep='\t',