diff --git a/coderdata/dataset.yml b/coderdata/dataset.yml index 22eb8746..15e91270 100644 --- a/coderdata/dataset.yml +++ b/coderdata/dataset.yml @@ -4,9 +4,9 @@ datasets: beataml: description: Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia tumor data. Data includes drug response, proteomics, and transcriptomics datasets. references: - - citation: Bottomly D, Long N, Schultz AR, et al. Integrative analysis of drug response and clinical outcome in acute myeloid leukemia. Cancer Cell. 2022;40(8):850-864.e9. + - citation: "Bottomly D, Long N, Schultz AR, et al. Integrative analysis of drug response and clinical outcome in acute myeloid leukemia. Cancer Cell. 2022;40(8):850-864.e9." doi: https://doi.org/10.1016/j.ccell.2022.07.002 - - citation: Pino JC, Posso C, Joshi SK, et al. Mapping the proteogenomic landscape enables prediction of drug response in acute myeloid leukemia. Cell Rep Med. 2024;5(1):101359. + - citation: "Pino JC, Posso C, Joshi SK, et al. Mapping the proteogenomic landscape enables prediction of drug response in acute myeloid leukemia. Cell Rep Med. 2024;5(1):101359." doi: https://doi.org/10.1016/j.xcrm.2023.101359 modalities: - sample @@ -20,7 +20,7 @@ datasets: bladder: description: Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Data includes transcriptomics, mutations, copy number, and drug response data. references: - - citation: Suk Hyung Lee, Wenhuo Hu, Justin T. Matulay, et al. Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Cell. 2018;173(2):515-528.e17. + - citation: "Suk Hyung Lee, Wenhuo Hu, Justin T. Matulay, et al. Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer. Cell. 2018;173(2):515-528.e17." doi: https://doi.org/10.1016/j.cell.2018.03.017 modalities: - sample @@ -34,7 +34,7 @@ datasets: ccle: description: Cancer Cell Line Encyclopedia (CCLE). references: - - citation: Barretina J, Caponigro G, Stransky N, et al. The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. Nature. 2012;483(7391):603-607. + - citation: "Barretina J, Caponigro G, Stransky N, et al. The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. Nature. 2012;483(7391):603-607." doi: https://doi.org/10.1038/nature11003 modalities: - sample @@ -49,7 +49,7 @@ datasets: cptac: description: The Clinical Proteomic Tumor Analysis Consortium (CPTAC) project is a collaborative network funded by the National Cancer Institute (NCI) focused on improving our understanding of cancer biology through the integration of transcriptomic, proteomic, and genomic data. references: - - citation: Lindgren CM, Adams DW, Kimball B, et al. Simplified and Unified Access to Cancer Proteogenomic Data. J Proteome Res. 2021;20(4):1902-1910. + - citation: "Lindgren CM, Adams DW, Kimball B, et al. Simplified and Unified Access to Cancer Proteogenomic Data. J Proteome Res. 2021;20(4):1902-1910." doi: https://doi.org/10.1021/acs.jproteome.0c00919 modalities: - sample @@ -61,11 +61,11 @@ datasets: ctrpv2: description: Cancer Therapeutics Response Portal version 2 (CTRPv2) references: - - citation: Rees MG, Seashore-Ludlow B, Cheah JH, et al. Correlating chemical sensitivity and basal gene expression reveals mechanism of action. Nat Chem Biol. 2016;12(2):109-116. + - citation: "Rees MG, Seashore-Ludlow B, Cheah JH, et al. Correlating chemical sensitivity and basal gene expression reveals mechanism of action. Nat Chem Biol. 2016;12(2):109-116." doi: https://doi.org/10.1038/nchembio.1986 - - citation: Seashore-Ludlow B, Rees MG, Cheah JH, et al. Harnessing Connectivity in a Large-Scale Small-Molecule Sensitivity Dataset. Cancer Discov. 2015;5(11):1210-1223. + - citation: "Seashore-Ludlow B, Rees MG, Cheah JH, et al. Harnessing Connectivity in a Large-Scale Small-Molecule Sensitivity Dataset. Cancer Discov. 2015;5(11):1210-1223." doi: https://doi.org/10.1158/2159-8290.CD-15-0235 - - citation: Basu A, Bodycombe NE, Cheah JH, et al. An interactive resource to identify cancer genetic and lineage dependencies targeted by small molecules. Cell. 2013;154(5):1151-1161. + - citation: "Basu A, Bodycombe NE, Cheah JH, et al. An interactive resource to identify cancer genetic and lineage dependencies targeted by small molecules. Cell. 2013;154(5):1151-1161." doi: https://doi.org/10.1016/j.cell.2013.08.003 modalities: - sample @@ -79,9 +79,9 @@ datasets: fimm: description: Institute for Molecular Medicine Finland (FIMM) dataset. references: - - citation: Mpindi JP, Yadav B, Östling P, et al. Consistency in drug response profiling. Nature. 2016;540(7631):E5-E6. + - citation: "Mpindi JP, Yadav B, Östling P, et al. Consistency in drug response profiling. Nature. 2016;540(7631):E5-E6." doi: https://doi.org/10.1038/nature20171 - - citation: Pemovska T, Kontro M, Yadav B, et al. Individualized systems medicine strategy to tailor treatments for patients with chemorefractory acute myeloid leukemia. Cancer Discov. 2013;3(12):1416-1429. + - citation: "Pemovska T, Kontro M, Yadav B, et al. Individualized systems medicine strategy to tailor treatments for patients with chemorefractory acute myeloid leukemia. Cancer Discov. 2013;3(12):1416-1429." doi: https://doi.org/10.1158/2159-8290.CD-13-0350 modalities: - sample @@ -101,7 +101,7 @@ datasets: mpnst: description: Malignant Peripheral Nerve Sheath Tumor is a rare, aggressive sarcoma that affects peripheral nerves throughout the body. references: - - citation: Dehner C, Moon CI, Zhang X, et al. Chromosome 8 gain is associated with high-grade transformation in MPNST. JCI Insight. 2021;6(6):e146351. + - citation: "Dehner C, Moon CI, Zhang X, et al. Chromosome 8 gain is associated with high-grade transformation in MPNST. JCI Insight. 2021;6(6):e146351." doi: https://doi.org/10.1172/jci.insight.146351 modalities: - sample @@ -116,7 +116,7 @@ datasets: nci60: description: National Cancer Institute 60. references: - - citation: Shoemaker RH. The NCI60 human tumour cell line anticancer drug screen. Nat Rev Cancer. 2006;6(10):813-823. + - citation: "Shoemaker RH. The NCI60 human tumour cell line anticancer drug screen. Nat Rev Cancer. 2006;6(10):813-823." doi: https://doi.org/10.1038/nrc1951 modalities: - sample @@ -130,7 +130,7 @@ datasets: pancreas: description: Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Data includes transcriptomics, mutations, copy number, and drug response data. references: - - citation: Tiriac H, Belleau P, Engle DD, et al. Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Cancer Discov. 2018;8(9):1112-1129. + - citation: "Tiriac H, Belleau P, Engle DD, et al. Organoid Profiling Identifies Common Responders to Chemotherapy in Pancreatic Cancer. Cancer Discov. 2018;8(9):1112-1129." doi: https://doi.org/10.1158/2159-8290.CD-18-0349 modalities: - sample @@ -144,9 +144,9 @@ datasets: prism: description: Profiling Relative Inhibition Simultaneously in Mixtures. references: - - citation: Corsello SM, Nagari RT, Spangler RD, et al. Discovering the anti-cancer potential of non-oncology drugs by systematic viability profiling. Nat Cancer. 2020;1(2):235-248. + - citation: "Corsello SM, Nagari RT, Spangler RD, et al. Discovering the anti-cancer potential of non-oncology drugs by systematic viability profiling. Nat Cancer. 2020;1(2):235-248." doi: https://doi.org/10.1038/s43018-019-0018-6 - - citation: Yu C, Mannan AM, Yvone GM, et al. High-throughput identification of genotype-specific cancer vulnerabilities in mixtures of barcoded tumor cell lines. Nat Biotechnol. 2016;34(4):419-423. + - citation: "Yu C, Mannan AM, Yvone GM, et al. High-throughput identification of genotype-specific cancer vulnerabilities in mixtures of barcoded tumor cell lines. Nat Biotechnol. 2016;34(4):419-423." doi: https://doi.org/10.1038/nbt.3460 modalities: - sample @@ -158,7 +158,7 @@ datasets: sarcoma: description: The landscape of drug sensitivity and resistance in sarcoma. Data includes transcriptomics, mutations, and drug response data. references: - - citation: Al Shihabi A, Tebon PJ, Nguyen HTL, et al. The landscape of drug sensitivity and resistance in sarcoma. Cell Stem Cell. 2024;31(10):1524-1542.e4. + - citation: "Al Shihabi A, Tebon PJ, Nguyen HTL, et al. The landscape of drug sensitivity and resistance in sarcoma. Cell Stem Cell. 2024;31(10):1524-1542.e4." doi: https://doi.org/10.1016/j.stem.2024.08.010 modalities: - sample @@ -171,7 +171,7 @@ datasets: colorectal: description: Living organoid biobank of colorectal cancer patients. references: - - citation: van de Wetering M, Francies HE, Francis JM, et al. Prospective derivation of a living organoid biobank of colorectal cancer patients. Cell. 2015;161(4):933-945. + - citation: "van de Wetering M, Francies HE, Francis JM, et al. Prospective derivation of a living organoid biobank of colorectal cancer patients. Cell. 2015;161(4):933-945." doi: https://doi.org/10.1016/j.cell.2015.03.053 modalities: - sample @@ -184,7 +184,7 @@ datasets: liver: description: Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. references: - - citation: Ji S, Feng L, Fu Z, et al. Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. Sci Transl Med. 2023;15(706):eadg3358. + - citation: "Ji S, Feng L, Fu Z, et al. Pharmaco-proteogenomic characterization of liver cancer organoids for precision oncology. Sci Transl Med. 2023;15(706):eadg3358." doi: https://doi.org/10.1126/scitranslmed.adg3358 modalities: - sample @@ -197,7 +197,7 @@ datasets: novartis: description: Patient-derived tumor xenografts for drug response prediction. references: - - citation: Gao H, Korn JM, Ferretti S, et al. High-throughput screening using patient-derived tumor xenografts to predict clinical trial drug response. Nat Med. 2015;21(11):1318–1325. + - citation: "Gao H, Korn JM, Ferretti S, et al. High-throughput screening using patient-derived tumor xenografts to predict clinical trial drug response. Nat Med. 2015;21(11):1318–1325." doi: https://doi.org/10.1038/nm.3954 modalities: - sample @@ -210,9 +210,9 @@ datasets: gcsi: description: The Genentech Cell Line Screening Initiative (gCSI) references: - - citation: Haverty PM, Lin E, Tan J, et al. Reproducible pharmacogenomic profiling of cancer cell line panels. Nature. 2016;533(7603):333–337. + - citation: "Haverty PM, Lin E, Tan J, et al. Reproducible pharmacogenomic profiling of cancer cell line panels. Nature. 2016;533(7603):333–337." doi: https://doi.org/10.1038/nature17987 - - citation: Klijn C, Durinck S, Stawiski EW, et al. A comprehensive transcriptional portrait of human cancer cell lines. Nat Biotechnol. 2015;33(3):306–312. + - citation: "Klijn C, Durinck S, Stawiski EW, et al. A comprehensive transcriptional portrait of human cancer cell lines. Nat Biotechnol. 2015;33(3):306–312." doi: https://doi.org/10.1038/nbt.3080 modalities: - sample @@ -226,11 +226,11 @@ datasets: gdscv1: description: Genomics of Drug Sensitivity in Cancer version 1 (GDSCv1) references: - - citation: Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575. + - citation: "Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575." doi: https://doi.org/10.1038/nature11005 - - citation: Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754. + - citation: "Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754." doi: https://doi.org/10.1016/j.cell.2016.06.017 - - citation: Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961. + - citation: "Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961." doi: https://doi.org/10.1093/nar/gks1111 modalities: - sample @@ -244,11 +244,11 @@ datasets: gdscv2: description: Genomics of Drug Sensitivity in Cancer version 2 (GDSCv2) references: - - citation: Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575. + - citation: "Garnett MJ, Edelman EJ, Heidorn SJ, et al. Systematic identification of genomic markers of drug sensitivity in cancer cells. Nature. 2012;483(7391):570–575." doi: https://doi.org/10.1038/nature11005 - - citation: Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754. + - citation: "Iorio F, Knijnenburg TA, Vis DJ, et al. A Landscape of Pharmacogenomic Interactions in Cancer. Cell. 2016;166(3):740–754." doi: https://doi.org/10.1016/j.cell.2016.06.017 - - citation: Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961. + - citation: "Yang W, Soares J, Greninger P, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Res. 2013;41(Database issue):D955–D961." doi: https://doi.org/10.1093/nar/gks1111 modalities: - sample diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index 39c2a5ba..695db22c 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -322,7 +322,7 @@ def split_train_other( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> TwoWaySplit: - """ + """ Split the dataset into training and another subset (e.g., testing or validation). Parameters diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index 553aecfe..573b5c92 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -203,10 +203,10 @@ def process_datasets(args): ], ) # conversion logic from mRECIST -> auc - experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = 0.1 - experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = 0.2 - experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = 0.5 - experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = 1.0 + experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = "0.1" + experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = "0.2" + experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = "0.5" + experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = "1.0" experiment.rename(columns={'mRESCIST': 'auc'}, inplace=True) experiments.append(experiment) @@ -248,7 +248,9 @@ def process_datasets(args): response_data['improve_sample_id'] = "SAMPLE-ID-" + response_data['improve_sample_id'].astype(int).astype(str) # exporting the drug response data to 'y_data/response.tsv' outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv") - response_data.to_csv( + response_out = deepcopy(response_data) + response_out['study'] = response_out['study'].str.lower() + response_out.to_csv( path_or_buf=outfile_path, index=False, sep='\t',