From 6d31edb3f7ece0e16a41a182d5315c90463dcb06 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Tue, 3 Jan 2023 16:47:09 -0500 Subject: [PATCH 1/2] Remove broken datasets multilexsum Multilexsum is broken, remove it from dataset_info.jsonl --- utils/dataset_info.jsonl | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/dataset_info.jsonl b/utils/dataset_info.jsonl index 7f343484..6a4288ea 100644 --- a/utils/dataset_info.jsonl +++ b/utils/dataset_info.jsonl @@ -446,7 +446,6 @@ {"multihumes---es": {"dataset_name": "multihumes", "dataset_class_name": "MultiHumESDataset", "sub_dataset": "es", "splits": {"train": 1335, "validation": 189, "test": 380}, "languages": ["es"], "task_categories": [["conditional-generation"]], "tasks": ["summarization"]}} {"multihumes---fr": {"dataset_name": "multihumes", "dataset_class_name": "MultiHumESDataset", "sub_dataset": "fr", "splits": {"train": 3016, "validation": 431, "test": 865}, "languages": ["fr"], "task_categories": [["conditional-generation"]], "tasks": ["summarization"]}} {"multihumes---multilingual": {"dataset_name": "multihumes", "dataset_class_name": "MultiHumESDataset", "sub_dataset": "multilingual", "splits": {"train": 24909, "validation": 3546, "test": 7112}, "languages": ["multilingual"], "task_categories": [["conditional-generation"]], "tasks": ["summarization"]}} -{"multilexsum---long": {"dataset_name": "multilexsum", "dataset_class_name": "MultiLexSumDataset", "sub_dataset": "long", "splits": {"train": 3177, "validation": 454, "test": 908}, "languages": ["en"], "features": {"texts": {"dtype": "string", "feature_level": "sample_level", "raw_feature": true, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "is_pre_computed": false, "bucket_info": null, "_type": "Value"}, "summary": {"dtype": "string", "feature_level": "sample_level", "raw_feature": true, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "is_pre_computed": false, "bucket_info": null, "_type": "Value"}}, "task_categories": [["summarization"]], "tasks": ["multi-doc-summarization"]}} {"multinews---preprocessed-multi": {"dataset_name": "multinews", "dataset_class_name": "MultiNewsDataset", "sub_dataset": "preprocessed-multi", "splits": {"train": 44972, "validation": 5622, "test": 5622}, "languages": [], "task_categories": [["summarization"]], "tasks": ["multi-doc-summarization"]}} {"multinews---preprocessed-single": {"dataset_name": "multinews", "dataset_class_name": "MultiNewsDataset", "sub_dataset": "preprocessed-single", "splits": {"train": 44972, "validation": 5622, "test": 5622}, "languages": [], "task_categories": [["conditional-generation"]], "tasks": ["summarization"]}} {"multinews---raw-cleaned-multi": {"dataset_name": "multinews", "dataset_class_name": "MultiNewsDataset", "sub_dataset": "raw-cleaned-multi", "splits": {"train": 44972, "validation": 5622, "test": 5622}, "languages": [], "task_categories": [["summarization"]], "tasks": ["multi-doc-summarization"]}} From b72c0b5ebbf15ef5048649e95b9487f7cf588ebf Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Wed, 4 Jan 2023 11:03:23 -0500 Subject: [PATCH 2/2] Update dataset_info.jsonl --- utils/dataset_info.jsonl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/utils/dataset_info.jsonl b/utils/dataset_info.jsonl index 6a4288ea..f92fdcc1 100644 --- a/utils/dataset_info.jsonl +++ b/utils/dataset_info.jsonl @@ -197,10 +197,6 @@ {"fig_qa---large": {"dataset_name": "fig_qa", "dataset_class_name": "FigQA", "sub_dataset": "large", "splits": {"train": 8016, "validation": 1094, "test": 1146}, "languages": ["en"], "task_categories": [["qa"]], "tasks": ["qa-multiple-choice"]}} {"fig_qa---medium": {"dataset_name": "fig_qa", "dataset_class_name": "FigQA", "sub_dataset": "medium", "splits": {"train": 1458, "validation": 1094, "test": 1146}, "languages": ["en"], "task_categories": [["qa"]], "tasks": ["qa-multiple-choice"]}} {"fig_qa---small": {"dataset_name": "fig_qa", "dataset_class_name": "FigQA", "sub_dataset": "small", "splits": {"train": 200, "validation": 1094, "test": 1146}, "languages": ["en"], "task_categories": [["qa"]], "tasks": ["qa-multiple-choice"]}} -{"financial_phrasebank---sentences_50agree": {"dataset_name": "financial_phrasebank", "dataset_class_name": "FinancialPhrasebank", "sub_dataset": "sentences_50agree", "splits": {"train": 4846}, "languages": ["en"], "task_categories": [["text-classification"]], "tasks": ["sentiment-classification"]}} -{"financial_phrasebank---sentences_66agree": {"dataset_name": "financial_phrasebank", "dataset_class_name": "FinancialPhrasebank", "sub_dataset": "sentences_66agree", "splits": {"train": 4217}, "languages": ["en"], "task_categories": [["text-classification"]], "tasks": ["sentiment-classification"]}} -{"financial_phrasebank---sentences_75agree": {"dataset_name": "financial_phrasebank", "dataset_class_name": "FinancialPhrasebank", "sub_dataset": "sentences_75agree", "splits": {"train": 3453}, "languages": ["en"], "task_categories": [["text-classification"]], "tasks": ["sentiment-classification"]}} -{"financial_phrasebank---sentences_allagree": {"dataset_name": "financial_phrasebank", "dataset_class_name": "FinancialPhrasebank", "sub_dataset": "sentences_allagree", "splits": {"train": 2264}, "languages": ["en"], "task_categories": [["text-classification"]], "tasks": ["sentiment-classification"]}} {"finre---relation_extraction": {"dataset_name": "finre", "dataset_class_name": "FinRE", "sub_dataset": "relation_extraction", "splits": {"train": 13486, "validation": 1489, "test": 3727}, "languages": ["zh"], "task_categories": [["ROOT"]], "tasks": ["span-relation-prediction"]}} {"fudan_nlp---conll2003": {"dataset_name": "fudan_nlp", "dataset_class_name": "FudanNlp", "sub_dataset": "conll2003", "splits": {"train": 14041, "test": 3453, "validation": 3250}, "languages": [], "features": {"tokens": {"feature": {"dtype": "string", "feature_level": "sample_level", "raw_feature": true, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "is_pre_computed": false, "bucket_info": null, "_type": "Value"}, "feature_level": "sample_level", "raw_feature": true, "length": -1, "id": null, "is_bucket": false, "require_training_set": false, "_type": "Sequence"}, "tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "feature_level": "sample_level", "bucket_info": null, "raw_feature": true, "_type": "ClassLabel"}, "feature_level": "sample_level", "raw_feature": true, "length": -1, "id": null, "is_bucket": false, "require_training_set": false, "_type": "Sequence"}}, "task_categories": [["sequence-labeling"]], "tasks": ["named-entity-recognition"]}} {"fudan_nlp---movie_review": {"dataset_name": "fudan_nlp", "dataset_class_name": "FudanNlp", "sub_dataset": "movie_review", "splits": {"train": 8596, "validation": 1000, "test": 1066}, "languages": [], "features": {"text": {"dtype": "string", "feature_level": "sample_level", "raw_feature": true, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "is_pre_computed": false, "bucket_info": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["positive", "negative"], "names_file": null, "description": null, "id": null, "is_bucket": false, "require_training_set": false, "feature_level": "sample_level", "bucket_info": null, "raw_feature": true, "_type": "ClassLabel"}}, "task_categories": [["text-classification"]], "tasks": ["sentiment-classification"]}}