Commit 03b72e3

Merge pull request #11 from codefuse-ai/dev_cbx
update data code
2 parents: 7c6221a + dfe2356
File tree

3 files changed: 120 additions, 0 deletions

.gitignore

1 addition & 0 deletions

@@ -1,6 +1,7 @@
 **/__pycache__
 .DS_Store
 data/
+!src/data/
 .pyc
 __pycache__
 start_job.py
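The existing data/ pattern ignores any directory named data, which would also cover the new src/data/ package; the added !src/data/ rule re-includes that directory so the two modules added below stay tracked.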

src/data/data_load.py

51 additions & 0 deletions (new file)

import json
import jsonlines
import os
import pandas as pd

from loguru import logger

from src.hparams.evaluate_args import EvaluateArguments
from src.data.data_preprocess import preprocess


def load_all_dataset(eval_args: EvaluateArguments):
    '''
    Load all eval dataset
    '''
    # get fp for eval dataset
    dataset_name_list = eval_args.eval_dataset_list
    eval_dataset_fp_conf_path = eval_args.eval_dataset_fp_conf_path

    with open(eval_dataset_fp_conf_path, 'r') as f:
        dataset_fn_dict = json.load(f)

    data_dir = eval_args.data_path

    logger.info(dataset_name_list)
    if len(dataset_name_list) == 1 and dataset_name_list[0] == 'all':
        dataset_name_list = dataset_fn_dict.keys()
    dataset_fp_list = [data_dir + os.path.sep + eval_args.eval_language + os.path.sep + eval_args.eval_dataset_type + os.path.sep + dataset_fn_dict[i] for i in dataset_name_list]

    logger.info('Start load and preprocess dataset')
    all_dataset = {}
    for dataset_name in dataset_name_list:
        dataset_fp = data_dir + os.path.sep + eval_args.eval_language + os.path.sep + eval_args.eval_dataset_type + os.path.sep + dataset_fn_dict[dataset_name]
        df = pd.read_csv(dataset_fp)

        # Read dev data if doing few-shot test
        df_dev = None
        if eval_args.k_shot > 0:
            dev_dataset_fp = data_dir + os.path.sep + eval_args.eval_language + os.path.sep + 'dev' + os.path.sep + dataset_fn_dict[dataset_name]
            df_dev = pd.read_csv(dev_dataset_fp)

        all_dataset[dataset_name] = preprocess(df, eval_args, df_dev=df_dev)
        logger.info('Load success, dataset_name={}, dataset_file_path={}, dataset question count={}'.format(dataset_name,
                                                                                                            dataset_fp,
                                                                                                            len(all_dataset[dataset_name])))
    return all_dataset

if __name__ == '__main__':
    a = os.path.split(os.path.realpath(__file__))[0]
    b = os.path.abspath(os.path.dirname(a)+os.path.sep+"../data")
    logger.debug(b)
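A minimal usage sketch of the loader above. The attribute names are the ones read by load_all_dataset; the config path and field values are hypothetical, and EvaluateArguments is assumed to accept these fields as constructor keyword arguments:

from src.hparams.evaluate_args import EvaluateArguments
from src.data.data_load import load_all_dataset

# Hypothetical config file: a JSON object mapping dataset names to CSV file
# names, e.g. {"dataset_a": "dataset_a.csv"}.
eval_args = EvaluateArguments(
    eval_dataset_list=['all'],                         # 'all' expands to every key in the config
    eval_dataset_fp_conf_path='conf/dataset_fp.json',  # hypothetical path
    data_path='data',
    eval_language='zh',
    eval_dataset_type='test',
    k_shot=0,                                          # > 0 also loads the matching 'dev' CSV for few-shot prefixes
)

all_dataset = load_all_dataset(eval_args)              # {dataset_name: [question records]}
for name, samples in all_dataset.items():
    print(name, len(samples))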

src/data/data_preprocess.py

68 additions & 0 deletions (new file)

import pandas as pd
from loguru import logger


def preprocess(df: pd.DataFrame, eval_args, df_dev: pd.DataFrame = None):
    '''
    Preprocess df into a list of question records, optionally building a k-shot prefix from df_dev
    '''
    # Prompt (zh): "The following are single-choice questions in the DevOps domain.
    # Please choose the correct answer and output only the option."
    question_prompt = '''以下是关于开发运维领域的单项选择题,请选出其中的正确答案。请直接输出选项。\n'''

    if eval_args.k_shot > 0 and df_dev is not None:
        # normalize capitalized column names to lowercase
        df_dev.rename(columns={
            'Question': 'question',
            'Answer': 'answer'
        }, inplace=True)

        prefix = ''

        # Build the k-shot prefix from the dev split; '问题:' = 'Question:', '答案:' = 'Answer:'
        for idx in range(eval_args.k_shot):
            question = df_dev['question'].iloc[idx]
            prefix = prefix + question_prompt + '问题:' + question + '\n'

            for option in ['A', 'B', 'C', 'D']:
                if df_dev[option].iloc[idx]:
                    prefix += '{}. {}\n'.format(option, df_dev[option].iloc[idx])
            prefix += '答案:{}\n'.format(df_dev['answer'].iloc[idx].strip().upper())
        prefix = prefix + question_prompt
        res = preprocess_question(df, prefix)
    else:
        res = preprocess_question(df, question_prompt)

    return res

def preprocess_question(df: pd.DataFrame, prefix: str = ''):
    '''
    Convert each row of df into a question record dict and return the list
    '''
    res = []

    # normalize capitalized column names to lowercase
    df.rename(columns={
        'Question': 'question',
        'Answer': 'answer'
    }, inplace=True)

    for idx in range(df.shape[0]):
        to_append = {
            'question': df['question'].iloc[idx],
            'options': [],
            'answer': df['answer'].iloc[idx].strip().upper()
        }
        question = df['question'].iloc[idx]

        query = prefix + '''问题:{question}\n'''.format(question=question)

        for option in ['A', 'B', 'C', 'D']:
            if df[option].iloc[idx]:
                to_append['options'].append(option)
                to_append[option] = df[option].iloc[idx]
                to_add = '{}. {}\n'.format(option, df[option].iloc[idx])
                query += to_add

        # End the query with '答案:' ('Answer:') so the model completes the option letter
        to_add = '答案:'
        query += to_add
        to_append['query'] = query
        res.append(to_append)
    return res
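For reference, a small self-contained sketch of the CSV schema this preprocessing expects and the record it produces. The row is a made-up toy example; preprocess_question is called directly here, whereas load_all_dataset goes through preprocess:

import pandas as pd
from src.data.data_preprocess import preprocess_question

# One toy row with the columns the code reads: Question, the A-D option texts, Answer.
df = pd.DataFrame({
    'Question': ['Which HTTP status code means "Not Found"?'],
    'A': ['200'],
    'B': ['301'],
    'C': ['404'],
    'D': ['500'],
    'Answer': ['c'],
})

records = preprocess_question(df, prefix='以下是关于开发运维领域的单项选择题,请选出其中的正确答案。请直接输出选项。\n')
record = records[0]
print(record['answer'])    # 'C' -- stripped and uppercased
print(record['options'])   # ['A', 'B', 'C', 'D']
print(record['query'])     # prefix + '问题:...' + 'A. 200' ... + trailing '答案:' awaiting the model's choice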
