diff --git a/scripts/run_fc_example.sh b/scripts/run_fc_example.sh new file mode 100644 index 0000000..db60a90 --- /dev/null +++ b/scripts/run_fc_example.sh @@ -0,0 +1,4 @@ +#!/bin/bash + + +python src/qwen_eval_main.py diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py new file mode 100644 index 0000000..88e34e4 --- /dev/null +++ b/src/datasets/__init__.py @@ -0,0 +1,9 @@ +from .base_dataset import ToolDataset +from .toolfill_dataset import ToolFillDataset +from .toolparser_dataset import ToolParserDataset +from .toolsummary_dataset import ToolSummaryDataset +from .funccall_dataset import FuncCallDataset + +__all__ = [ + "ToolFillDataset", "ToolDataset", "ToolParserDataset", "ToolSummaryDataset", "FuncCallDataset" +] \ No newline at end of file diff --git a/src/datasets/base_dataset.py b/src/datasets/base_dataset.py new file mode 100644 index 0000000..daca352 --- /dev/null +++ b/src/datasets/base_dataset.py @@ -0,0 +1,40 @@ +from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file +from src.utils.json_utils import read_json_file, save_to_json_file + + + +class ToolDataset: + def __init__(self, dataset_name, tool_task, filepath): + self.dataset_name = dataset_name + self.tool_task = tool_task + self.filepath = filepath + self.datas = self.load_data() + + def load_data(self, ) -> list: + if self.filepath: + return self.load_data_from_local(self.filepath) + elif self.dataset_name and self.tool_task: + return self.load_data_from_hf(self.tool_task) + return [] + + def load_data_from_local(self, filepath): + '''''' + pass + + def load_data_from_hf(self, tool_task): + pass + + def __iter__(self): + self.current_index = 0 + return self + + def __next__(self): + if self.current_index < len(self.datas): + current_item = self.datas[self.current_index] + self.current_index += 1 + return current_item + else: + raise StopIteration + + def __len__(self): + return len(self.datas) diff --git a/src/datasets/funccall_dataset.py b/src/datasets/funccall_dataset.py new file mode 100644 index 0000000..475b966 --- /dev/null +++ b/src/datasets/funccall_dataset.py @@ -0,0 +1,41 @@ +from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file +from src.utils.json_utils import read_json_file, save_to_json_file +from .base_dataset import ToolDataset + +import os + + + + +class FuncCallDataset(ToolDataset): + def __init__(self, dataset_name, tool_task, filepath): + self.dataset_name = dataset_name + self.tool_task = tool_task + self.filepath = filepath + self.datas = self.load_data() + + def load_data(self, ) -> list: + if self.filepath: + return self.load_data_from_local(self.filepath) + elif self.dataset_name and self.tool_task: + return self.load_data_from_hf(self.tool_task) + return [] + + def load_data_from_local(self, filepath): + def _load_from_file(filename): + if "jsonl" in filename: + return read_jsonl_file(filename) + elif "json" in filename: + return read_json_file(filename) + + datas = [] + if os.path.isdir(filepath): + for filename in os.listdir(filepath): + datas.extend(_load_from_file(os.path.join(filepath, filename))) + else: + datas = _load_from_file(filepath) + + return datas + + def load_data_from_hf(self, tool_task): + pass \ No newline at end of file diff --git a/src/datasets/toolfill_dataset.py b/src/datasets/toolfill_dataset.py new file mode 100644 index 0000000..b6ad5bc --- /dev/null +++ b/src/datasets/toolfill_dataset.py @@ -0,0 +1,30 @@ +from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file +from src.utils.json_utils import read_json_file, 
save_to_json_file +from .base_dataset import ToolDataset + + + + +class ToolFillDataset(ToolDataset): + def __init__(self, dataset_name, tool_task, filepath): + self.dataset_name = dataset_name + self.tool_task = tool_task + self.filepath = filepath + self.datas = self.load_data() + + def load_data(self, ) -> list: + if self.filepath: + return self.load_data_from_local(self.filepath) + elif self.dataset_name and self.tool_task: + return self.load_data_from_hf(self.tool_task) + return [] + + def load_data_from_local(self, filepath): + if "jsonl" in filepath: + return read_jsonl_file(filepath) + elif "json" in filepath: + return read_json_file(filepath) + return [] + + def load_data_from_hf(self, tool_task): + pass \ No newline at end of file diff --git a/src/datasets/toolparser_dataset.py b/src/datasets/toolparser_dataset.py new file mode 100644 index 0000000..0cb71ff --- /dev/null +++ b/src/datasets/toolparser_dataset.py @@ -0,0 +1,30 @@ +from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file +from src.utils.json_utils import read_json_file, save_to_json_file +from .base_dataset import ToolDataset + + + + +class ToolParserDataset(ToolDataset): + def __init__(self, dataset_name, tool_task, filepath): + self.dataset_name = dataset_name + self.tool_task = tool_task + self.filepath = filepath + self.datas = self.load_data() + + def load_data(self, ) -> list: + if self.filepath: + return self.load_data_from_local(self.filepath) + elif self.dataset_name and self.tool_task: + return self.load_data_from_hf(self.tool_task) + return [] + + def load_data_from_local(self, filepath): + if "jsonl" in filepath: + return read_jsonl_file(filepath) + elif "json" in filepath: + return read_json_file(filepath) + return [] + + def load_data_from_hf(self, tool_task): + pass \ No newline at end of file diff --git a/src/datasets/toolsummary_dataset.py b/src/datasets/toolsummary_dataset.py new file mode 100644 index 0000000..697f15a --- /dev/null +++ b/src/datasets/toolsummary_dataset.py @@ -0,0 +1,28 @@ +from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file +from src.utils.json_utils import read_json_file, save_to_json_file +from .base_dataset import ToolDataset + + +class ToolSummaryDataset(ToolDataset): + def __init__(self, dataset_name, tool_task, filepath): + self.dataset_name = dataset_name + self.tool_task = tool_task + self.filepath = filepath + self.datas = self.load_data() + + def load_data(self, ) -> list: + if self.filepath: + return self.load_data_from_local(self.filepath) + elif self.dataset_name and self.tool_task: + return self.load_data_from_hf(self.tool_task) + return [] + + def load_data_from_local(self, filepath): + if "jsonl" in filepath: + return read_jsonl_file(filepath) + elif "json" in filepath: + return read_json_file(filepath) + return [] + + def load_data_from_hf(self, tool_task): + pass \ No newline at end of file diff --git a/src/evals/__init__.py b/src/evals/__init__.py new file mode 100644 index 0000000..ae79a28 --- /dev/null +++ b/src/evals/__init__.py @@ -0,0 +1,10 @@ +from .base_evalution import ToolEvalution +from .toolfill_evalution import ToolFillEvalution +from .toolparser_evalution import ToolParserEvalution +from .toolsummary_evalution import ToolSummaryEvalution +from .func_call_evalution import FuncCallEvalution + + +__all__ = [ + "ToolEvalution", "ToolFillEvalution", "ToolParserEvalution", "ToolSummaryEvalution", "FuncCallEvalution" +] \ No newline at end of file diff --git a/src/evals/base_evalution.py b/src/evals/base_evalution.py new 
file mode 100644 index 0000000..8bc0bf9 --- /dev/null +++ b/src/evals/base_evalution.py @@ -0,0 +1,43 @@ +from src.models.base_model import ToolModel +from src.models.generate_configs import GenerateConfigs +from src.datasets import ToolFillDataset + + + +class ToolEvalution: + def __init__( + self, + model: ToolModel, + dataset: ToolFillDataset, + base_prompt: str = '', + generate_configs: GenerateConfigs = None, + ): + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! not {model}") + + def calc(self): + '''开始计算结果''' + self.predicts = [] + for idx, data in enumerate(self.dataset): + # if idx >= 5: break + prompt = self.base_prompt.format(**data) + answer = data["api_param"] + predict = self.generate(prompt, self.generate_configs) + self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) + + metric = self.eval_metric(self.predicts) + return metric + + def generate(self, prompt, generate_configs): + '''返回结果''' + return self.model.generate(prompt, generate_configs) + + def eval_metric(self, datas): + '''calc custom metric''' + pass + diff --git a/src/evals/func_call_evalution.py b/src/evals/func_call_evalution.py new file mode 100644 index 0000000..b9161f6 --- /dev/null +++ b/src/evals/func_call_evalution.py @@ -0,0 +1,304 @@ +from src.models.base_model import ToolModel +from src.models.generate_configs import GenerateConfigs +from src.datasets import FuncCallDataset +from src.utils.jsonl_utils import read_jsonl_file +from .base_evalution import ToolEvalution + +from collections import Counter +import jieba, re, json, os +import numpy as np +from loguru import logger + + +def remove_punctuation(text): + pattern = r'[^\w\s]' + return re.sub(pattern, '', text) + + +def cmp_arguments(args_str1, args_str2): + rtn_flag = False + try: + args_dict1 = json.loads(args_str1) + args_dict2 = json.loads(args_str2) + # 比较两个字典是否一致 + if args_dict1 == args_dict2: + rtn_flag = True + except Exception as e: + print("json.loads error: ", e) + return rtn_flag + return rtn_flag + + +class FuncCallEvalution(ToolEvalution): + def __init__( + self, + model: ToolModel, + dataset: FuncCallDataset, + base_prompt: str = '', + template: str = 'default', + generate_configs: GenerateConfigs = None, + ): + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.template = template + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! 
not {model}") + + def calc(self): + '''开始计算结果''' + self.predicts = [] + func_call_train_datas = self.create_prompts(self.dataset) + + for idx, data in enumerate(func_call_train_datas): + print(f"总共 {len(func_call_train_datas)} 条prompt,当前运行到第 {idx} 条prompt", end="\r") + prompt = data["instruction"] + history = data["history"] + answer = data["output"] + functions = data["functions"] + predict = self.generate(prompt, self.template, self.generate_configs, history) + + if "arguments" in answer: + answer = {"content": answer["content"], "function_call": {"name": answer["name"], "arguments": answer["arguments"]}} + + if "#function" in predict: + try: + predict_param = json.loads(predict.split("#function")[-1]) + if "arguments" in predict_param: + predict_param = { + "content": predict_param["content"], + "function_call": {"name": predict_param["name"], "arguments": predict_param["arguments"]} + } + predict = {**predict_param, **{"role": "assistant"}} + except Exception as e: + logger.error("content: {content}") + predict = {**{"content": predict_param}, **{"role": "assistant"}} + else: + predict = { + "role": "assistant", + "content": predict + } + + self.predicts.append({ + "prompt": prompt, "history": history, + "predict": predict, "answer": answer, + "functions": functions + }) + + metric = self.eval_metric(self.predicts) + return metric + + def calc_from_predicts(self, file_path): + if os.path.exists(file_path): + self.predicts = read_jsonl_file(file_path) + metric = self.eval_metric(self.predicts) + return metric + else: + return self.calc() + + def create_prompts(self, func_call_datas): + system_content = '''CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。 + 你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。''' + function_format = '''You are ToolGPT, you have access to the following APIs:\n{tools}''' + + func_call_train_datas = [] + history_error_cnt = 0 + funccall_error_cnt = 0 + + for data in func_call_datas: + tools = data["functions"] + chatrounds = data["chatrounds"] + + function_content = "" + if len(tools) > 0: + function_content = function_format.format(tools=json.dumps(tools, ensure_ascii=False, sort_keys=True)) + + history = [] + for i in chatrounds: + if i["role"]=="system": + continue + + if i["role"]=="user": + history.append(("user", i["content"])) + + if i["role"] == "assistant": + if "function_call" in i: + if not isinstance(i["function_call"], dict): + funccall_error_cnt+=1 + continue + content = "#function" + json.dumps({**{"content": i["content"]}, **i["function_call"]}, ensure_ascii=False) + else: + content = i["content"] + history.append(("assistant", content)) + + + if i["role"] == "function": + content = json.dumps({**{"content": i["content"]}, **{"name": i["name"]}}, ensure_ascii=False) + history.append(("user", content)) + + + history = [i[1] for i in history] + history[0] = "\n".join([system_content,function_content, history[0]]) + + for his_idx in range(0, len(history), 2): + output = history[his_idx+1] + + if "#function" in output: + output = output.split("#function")[-1] + + try: + output = json.loads(output) + except: + output = {"content": output} + + + func_call_train_datas.append( + { + "instruction": history[his_idx], + "input": "", + "output": output, + "history": [history[:his_idx+2][i:i+2] for i in range(0, len(history[:his_idx]), 2)], + "functions": tools + }, + ) + return func_call_train_datas + + def generate(self, prompt, 
template, generate_configs, history=None): + '''返回结果''' + return self.model.generate(prompt, template, generate_configs, history) + + def eval_metric(self, datas): + '''''' + # function call 回复测试总数 + self.function_call_sum = 0 + # function call 回复正确数 + self.function_call_correct = 0 + # function call 回复失败数 + self.function_call_fail = 0 + # function call 回复失败中,本应该调用工具但是模型没有调用, 无工具识别识别错误数 + self.function_call_fail_functioncall = 0 + # function call 回复失败数中,因为函数名不对导致的失败数 + self.function_call_fail_name = 0 + # function call 回复失败数中,因为参数不对导致的失败数 + self.function_call_fail_param = 0 + # function call 回复失败中 函数名幻觉的失败数 + self.function_call_fail_name_illusion = 0 + + # assistant ans 回复相关度列表 + self.assistant_ans_relevancy_list = [] + + for data in datas: + ass_predict = data["predict"] + ass_truth = data["answer"] + functions = data["functions"] + history = data["history"] + # 将user 和 function 的部分组合 + content_msg = "" + for user_msg, assistant_msg in history: + content_msg += user_msg + + # if "#function" in ass_truth: + if "function_call" in ass_truth: + self.calc_func_params(ass_predict, ass_truth, functions) + else: + self.calc_relevancy(ass_predict, ass_truth, content_msg) + + self.print_result() + return { + "function_call_correct_rate": self.function_call_correct_rate, + "function_call_fail_rate": self.function_call_fail_rate, + "function_call_fail_functioncall_rate": self.function_call_fail_functioncall_rate, + "function_call_fail_name_rate": self.function_call_fail_name_rate, + "function_call_fail_param_rate": self.function_call_fail_param_rate, + "function_call_fail_name_illusion_rate": self.function_call_fail_name_illusion_rate + } + + def calc_func_params(self, ass_predict, ass_truth, functions): + self.function_call_sum += 1 + + function_names = [i["name"] for i in functions] + # ass_predict_param = json.loads(ass_predict.split("#function")[-1]) + # ass_truth_param = json.loads(ass_truth.split("#function")[-1]) + + if "function_call" not in ass_predict: + self.function_call_fail += 1 + self.function_call_fail_functioncall += 1 + elif ass_predict["function_call"]["name"] not in function_names: + # 模型幻觉 + self.function_call_fail += 1 + self.function_call_fail_name += 1 + self.function_call_fail_name_illusion += 1 + else: + function_call_name_label = False + function_call_args_label = False + if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]: + function_call_name_label = True + if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): + function_call_args_label = True + else: + self.function_call_fail_param += 1 + else: + self.function_call_fail_name += 1 + # # 是否可能存在名字错误参数正确的情况? 
+ # if self.cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): + # function_call_args_label = True + # else: + # self.function_call_fail_param += 1 + + if function_call_name_label and function_call_args_label: + self.function_call_correct += 1 + else: + self.function_call_fail += 1 + + def calc_relevancy(self, ass_predict, ass_truth, content_msg): + if "function_call" in ass_predict: + self.assistant_ans_relevancy_list.append(0) + return + + content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg))) + ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"]))) + ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"]))) + relative_counter = content_msg_counter & ass_truth_counter + len_relative = sum(relative_counter.values()) + predict_relative = ass_predict_counter & relative_counter + + if len_relative == 0: + # 要是标准答案和问题相关词都无 直接给1 + self.assistant_ans_relevancy_list.append(1) + else: + # 交集与相关词的占比 + self.assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative) + + def print_result(self, ): + # 打印指标结果 + print("=============统计数据=========================") + print(f"function_call_sum: {self.function_call_sum}") + print(f"function_call_correct: {self.function_call_correct}") + print(f"function_call_fail: {self.function_call_fail}") + print(f"function_call_fail_name: {self.function_call_fail_name}") + print(f"function_call_fail_param: {self.function_call_fail_param}") + print(f"function_call_fail_name_illusion: {self.function_call_fail_name_illusion}") + print(f"assistant_ans_sum: {len(self.assistant_ans_relevancy_list)}") + print(f"assistant_ans_relevancy: {np.mean(self.assistant_ans_relevancy_list)}") + print("=============实验结果=========================") + self.function_call_correct_rate = self.function_call_correct/self.function_call_sum + self.function_call_fail_rate = self.function_call_fail/self.function_call_sum + self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_sum + self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_sum + self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_sum + self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_sum + + # self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_fail if self.function_call_fail else 0 + # self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_fail if self.function_call_fail else 0 + # self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_fail if self.function_call_fail else 0 + # self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_fail if self.function_call_fail else 0 + print(f"工具识别正确率fccr: {self.function_call_correct_rate}") + print(f"工具识别失败率fcfr: {self.function_call_fail_rate}") + print(f"工具调用识别失败占比fcffr: {self.function_call_fail_functioncall_rate}") + print(f"工具名识别失败占比fcfnr: {self.function_call_fail_name_rate}") + print(f"工具参数识别失败占比fcfpr: {self.function_call_fail_param_rate}") + print(f"工具幻觉识别失败占比fcfnir: {self.function_call_fail_name_illusion_rate}") + print(f"助手回复答案相关度aar: {np.mean(self.assistant_ans_relevancy_list)}") + print("==============================================") diff --git a/src/evals/toolfill_evalution.py b/src/evals/toolfill_evalution.py new file mode 100644 index 
0000000..e855076 --- /dev/null +++ b/src/evals/toolfill_evalution.py @@ -0,0 +1,67 @@ +from src.models.base_model import ToolModel +from src.models.generate_configs import GenerateConfigs +from src.datasets import ToolFillDataset +from .base_evalution import ToolEvalution + + + +class ToolFillEvalution(ToolEvalution): + def __init__( + self, + model: ToolModel, + dataset: ToolFillDataset, + base_prompt: str = '', + template: str = 'default', + generate_configs: GenerateConfigs = None, + ): + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.template = template + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! not {model}") + + def calc(self): + '''开始计算结果''' + self.predicts = [] + for idx, data in enumerate(self.dataset): + prompt = self.base_prompt.format(**data) + answer = data["api_param"] + predict = self.generate(prompt, self.template, self.generate_configs) + self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) + + metric = self.eval_metric(self.predicts) + return metric + + def generate(self, prompt, template, generate_configs): + '''返回结果''' + return self.model.generate(prompt, template, generate_configs) + + def eval_metric(self, datas): + '''''' + self.right_predicts = [] + self.wrong_predicts = [] + self.error_predicts = [] + for data in datas: + prompt, predict, answer = data["prompt"], data["predict"], data["answer"] + + try: + predict_json = predict if isinstance(predict, dict) else eval(predict) + answer_json = answer if isinstance(answer, dict) else eval(answer) + if predict_json == answer_json: + # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer)) + self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) + else: + self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) + except: + self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) + # + print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts)) + + metric = { + "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + } + return metric diff --git a/src/evals/toolparser_evalution.py b/src/evals/toolparser_evalution.py new file mode 100644 index 0000000..b7b6e80 --- /dev/null +++ b/src/evals/toolparser_evalution.py @@ -0,0 +1,82 @@ +from src.models.base_model import ToolModel +from src.models.generate_configs import GenerateConfigs +from src.datasets import ToolParserDataset +from .base_evalution import ToolEvalution +from .utils import rec_search_key + + +class ToolParserEvalution(ToolEvalution): + def __init__( + self, + model: ToolModel, + dataset: ToolParserDataset, + base_prompt: str = '', + template: str = 'default', + generate_configs: GenerateConfigs = None, + ): + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.template = template + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! 
not {model}") + + def calc(self): + '''开始计算结果''' + self.predicts = [] + for idx, data in enumerate(self.dataset): + # if idx >= 5: break + prompt = self.base_prompt.format(**data) + response = data["response"] + answer = data["selected_keys"] + predict = self.generate(prompt, self.template, self.generate_configs) + self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + + metric = self.eval_metric(self.predicts) + return metric + + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.template = template + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! not {model}") + + def generate(self, prompt, template, generate_configs): + '''返回结果''' + return self.model.generate(prompt, template, generate_configs) + + def eval_metric(self, datas): + '''''' + self.right_predicts = [] + self.wrong_predicts = [] + self.error_predicts = [] + for data in datas: + prompt, predict, answer, response = data["prompt"], data["predict"], data["answer"], data["response"] + selected_keys = rec_search_key(response, "", [], predict) + try: + predict_json = selected_keys if isinstance(selected_keys, list) else eval(selected_keys) + answer_json = answer if isinstance(answer, list) else eval(answer) + + predict_json = set(predict_json) if isinstance(predict_json, list) else predict_json + answer_json = set(answer_json) if isinstance(answer_json, list) else answer_json + + if predict_json == answer_json: + # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer)) + self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + else: + self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + except: + self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + # + print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts)) + + metric = { + "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + } + return metric diff --git a/src/evals/toolsummary_evalution.py b/src/evals/toolsummary_evalution.py new file mode 100644 index 0000000..f665106 --- /dev/null +++ b/src/evals/toolsummary_evalution.py @@ -0,0 +1,73 @@ +from src.models.base_model import ToolModel +from src.models.generate_configs import GenerateConfigs +from src.datasets import ToolSummaryDataset +from .base_evalution import ToolEvalution +from .utils import rec_search_key + + +class ToolSummaryEvalution(ToolEvalution): + def __init__( + self, + model: ToolModel, + dataset: ToolSummaryDataset, + base_prompt: str = '', + template: str = 'default', + generate_configs: GenerateConfigs = None, + ): + self.model = model + self.dataset = dataset + self.base_prompt = base_prompt + self.template = template + self.generate_configs = generate_configs + + if not isinstance(model, ToolModel): + raise BaseException(f"must be ToolModel Class! 
not {model}") + + def calc(self): + '''开始计算结果''' + self.predicts = [] + for idx, data in enumerate(self.dataset): + # if idx >= 5: break + prompt = self.base_prompt.format(**data) + response = data["response"] + answer = data["selected_keys"] + predict = self.generate(prompt, self.template, self.generate_configs) + self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + + metric = self.eval_metric(self.predicts) + return metric + + def generate(self, prompt, template, generate_configs): + '''返回结果''' + return self.model.generate(prompt, template, generate_configs) + + def eval_metric(self, datas): + '''''' + self.right_predicts = [] + self.wrong_predicts = [] + self.error_predicts = [] + for data in datas: + prompt, predict, answer, response = data["prompt"], data["predict"], data["answer"], data["response"] + selected_keys = rec_search_key(response, "", [], predict) + try: + predict_json = selected_keys if isinstance(selected_keys, list) else eval(selected_keys) + answer_json = answer if isinstance(answer, list) else eval(answer) + + predict_json = set(predict_json) if isinstance(predict_json, list) else predict_json + answer_json = set(answer_json) if isinstance(answer_json, list) else answer_json + + if predict_json == answer_json: + # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer)) + self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + else: + self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) + except Exception as e: + self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response, "error_message": e}) + # + print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts)) + + metric = { + "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), + } + return metric diff --git a/src/evals/utils.py b/src/evals/utils.py new file mode 100644 index 0000000..f24b448 --- /dev/null +++ b/src/evals/utils.py @@ -0,0 +1,21 @@ + +def rec_search_key(res, k="", skeys: list=[], s=""): + '''递归进行分析是否存在key被获取''' + if isinstance(res, dict): + for new_k, v in res.items(): + try: + skeys = rec_search_key(v, ".".join([str(k), str(new_k)]) if k else new_k, skeys, s) + except Exception as e: + print(res, k, new_k) + raise e + elif isinstance(res, list): + for i in res: + skeys = rec_search_key(i, k + ".list", skeys, s) + else: + if str(res) in str(s): + skeys.append(k[:-5] if k[-5:] == ".list" else k) + return list(set(skeys)) + return list(set(skeys)) + + + \ No newline at end of file diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..3b4a005 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1,9 @@ +from .base_model import ToolModel +from .qwen_model import QwenModel +from .internlm_model import InternlmModel +from .openai_model import OpenaiModel +from .baichuan_model import BaiChuanModel + +__all__ = [ + "ToolModel", "QwenModel", "InternlmModel", "OpenaiModel", "BaiChuanModel" +] \ No newline at end of file diff --git a/src/models/baichuan_model.py b/src/models/baichuan_model.py new file mode 100644 index 0000000..e9c3b11 --- /dev/null +++ b/src/models/baichuan_model.py @@ -0,0 +1,91 @@ +# from vllm import LLM, SamplingParams +# from 
vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel + + +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig +from peft import PeftModel, PeftConfig + +from .generate_configs import GenerateConfigs +from .base_model import ToolModel + + +from loguru import logger + + + +class BaiChuanModel(ToolModel): + def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + self.model_path = model_path + self.peft_path = peft_path + self.template = template + self.trust_remote_code = trust_remote_code + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.generation_config = GenerationConfig.from_pretrained(model_path) + self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) + + def generate( + self, prompts: str, + template: str = None, + generate_configs: GenerateConfigs =None, + history: list = None, + ) -> list: + '''产出对应结果''' + template = self.template if template is None else template + + params = self.generate_params(generate_configs) + + if template == "default": + inputs = self.tokenizer(prompts, return_tensors="pt") + inputs["input_ids"] = inputs["input_ids"].cuda() + + inputs.update(params) + output = self.model.generate(**inputs) + predict = self.tokenizer.decode(output[0].tolist())[len(prompts):] + predict = predict.replace("<|endoftext|>", "").replace("", "") + return predict + elif template != "default": + messages = [{"role": "user" if idx==0 else "assistant", "content": ii} for i in history for idx, ii in enumerate(i)] + messages.append({"role": "user", "content": prompts}) + output = self.model.chat(self.tokenizer, messages=messages, generation_config=self.generation_config) + return output + + def generate_params( + self, generate_configs: GenerateConfigs, + ): + '''generate param''' + kargs = generate_configs.dict() + params = { + "max_new_tokens": kargs.get("max_new_tokens", 128), + "top_k": kargs.get("top_k", 50), + "top_p": kargs.get("top_p", 0.95), + "temperature": kargs.get("temperature", 1.0), + } + self.generation_config.max_new_tokens = kargs.get("max_new_tokens", 128) + self.generation_config.top_k = kargs.get("top_k", 50) + self.generation_config.top_p = kargs.get("top_p", 0.95) + self.generation_config.temperature = kargs.get("temperature", 1.0) + + # params = { + # "n": 1, + # "max_tokens": kargs.get("max_new_tokens", 128), + # "best_of": kargs.get("beam_bums", 1), + # "top_k": kargs.get("top_k", 50), + # "top_p": kargs.get("top_p", 0.95), + # "temperature": kargs.get("temperature", 1.0), + # "length_penalty": kargs.get("length_penalty", 1.0), + # "presence_penalty": kargs.get("presence_penalty", 1.0), + # "stop": kargs.get("stop_words", ["<|endoftext|>"]), + # } + return params + + def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + '''加载模型''' + print(f"self.model_path: {self.model_path}") + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) + self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval().half() + if peft_path: + print(f"peft_path: {peft_path}") + self.model = PeftModel.from_pretrained(self.model, peft_path) + + # self.model = LLM(model=model_path, 
trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) \ No newline at end of file diff --git a/src/models/base_model.py b/src/models/base_model.py new file mode 100644 index 0000000..95f4018 --- /dev/null +++ b/src/models/base_model.py @@ -0,0 +1,37 @@ +# from vllm import LLM, SamplingParams +# from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel + + +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer +from peft import PeftModel, PeftConfig + +from .generate_configs import GenerateConfigs + + + +class ToolModel: + def __init__(self, model_path: str, template: str, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + self.model_path = model_path + self.trust_remote_code = trust_remote_code + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.load_model(self.model_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) + + def generate(self, prompts: str, template: str = None, generate_configs: GenerateConfigs = None) -> list: + '''产出对应结果''' + pass + + def generate_params( + self, generate_configs: GenerateConfigs, + ): + '''generate param''' + kargs = generate_configs.dict() + return kargs + + def load_model(self, model_path, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + '''加载模型''' + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) + self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() + + # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) \ No newline at end of file diff --git a/src/models/generate_configs.py b/src/models/generate_configs.py new file mode 100644 index 0000000..2bf1e63 --- /dev/null +++ b/src/models/generate_configs.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel + + +class GenerateConfigs(BaseModel): + max_new_tokens: int = 128 + beam_bums: int = 1 + top_k: int = 50 + top_p: float = 0.95 + temperature: float = 1.0 + length_penalty: float = 1.0 + presence_penalty: float = 1.0 + stop_words: list = [] + template: str = "default" \ No newline at end of file diff --git a/src/models/internlm_model.py b/src/models/internlm_model.py new file mode 100644 index 0000000..9cab94d --- /dev/null +++ b/src/models/internlm_model.py @@ -0,0 +1,88 @@ +# from vllm import LLM, SamplingParams +# from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel + + +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer +from peft import PeftModel, PeftConfig + +from .generate_configs import GenerateConfigs +from .base_model import ToolModel + + + +class InternlmModel(ToolModel): + def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + self.model_path = model_path + self.peft_path = peft_path + self.template = template + self.trust_remote_code = trust_remote_code + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, 
self.gpu_memory_utilization) + + def generate( + self, prompts: str, + template: str = None, + generate_configs: GenerateConfigs =None, + ) -> list: + '''产出对应结果''' + + template = self.template if template is None else template + + params = self.generate_params(generate_configs) + + if template == "default": + inputs = self.tokenizer(prompts, return_tensors="pt") + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + inputs.update(params) + output = self.model.generate(**inputs) + predict = self.tokenizer.decode(output[0].tolist()) + predict = predict.split("\n")[-1] + predict = predict.replace("<|endoftext|>", "").replace("", "") + return predict + elif template != "default": + output, _ = self.model.chat(self.tokenizer, prompts, history=None, **params) + return output + # params = self.generate_params(generate_configs) + # sampling_params = SamplingParams(**params) + # prompts = [prompts] if isinstance(prompts, str) else prompts + # outputs = self.model.generate(prompts, sampling_params) + # return [i.outputs[0].text for i in outputs] + + def generate_params( + self, generate_configs: GenerateConfigs, + ): + '''generate param''' + kargs = generate_configs.dict() + params = { + "max_new_tokens": kargs.get("max_new_tokens", 128), + "top_k": kargs.get("top_k", 50), + "top_p": kargs.get("top_p", 0.95), + "temperature": kargs.get("temperature", 1.0), + } + + # params = { + # "n": 1, + # "max_tokens": kargs.get("max_new_tokens", 128), + # "best_of": kargs.get("beam_bums", 1), + # "top_k": kargs.get("top_k", 50), + # "top_p": kargs.get("top_p", 0.95), + # "temperature": kargs.get("temperature", 1.0), + # "length_penalty": kargs.get("length_penalty", 1.0), + # "presence_penalty": kargs.get("presence_penalty", 1.0), + # "stop": kargs.get("stop_words", ["<|endoftext|>"]), + # } + return params + + def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + '''加载模型''' + print(model_path, peft_path, trust_remote_code) + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code) + self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() + if peft_path: + self.model = PeftModel.from_pretrained(self.model, peft_path) + + # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) \ No newline at end of file diff --git a/src/models/openai_model.py b/src/models/openai_model.py new file mode 100644 index 0000000..2a1b99a --- /dev/null +++ b/src/models/openai_model.py @@ -0,0 +1,51 @@ +# from vllm import LLM, SamplingParams +# from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel + + +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer +from peft import PeftModel, PeftConfig + +from .generate_configs import GenerateConfigs +from .base_model import ToolModel + +import openai, os + + + +class OpenaiModel(ToolModel): + def __init__(self, model_path: str, template: str, system_prompt): + self.model_path = model_path + self.template = template + self.system_prompt = system_prompt + + def generate( + self, prompts: str, template: str = None, + generate_configs: GenerateConfigs =None, + ) -> list: + '''产出对应结果''' + template = self.template if template is None else template + + params = 
self.generate_params(generate_configs) + + messages = [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompts}] + try: + result = openai.ChatCompletion.create(api_base=os.environ["OPENAI_API_BASE"], api_key=os.environ["OPENAI_API_KEY"], model=self.model_path, messages=messages, **params) + # print("prompt_tokens: {}, completion_tokens: {}".format(result["usage"]["prompt_tokens"], result["usage"]["completion_tokens"])) + return result["choices"][0]["message"]["content"] + except Exception as e: + result = str(e) + + def generate_params( + self, generate_configs: GenerateConfigs, + ): + '''generate param''' + kargs = generate_configs.dict() + params = { + "max_new_tokens": kargs.get("max_new_tokens", 128), + "top_k": kargs.get("top_k", 50), + "top_p": kargs.get("top_p", 0.95), + "temperature": kargs.get("temperature", 1.0), + } + return params + diff --git a/src/models/qwen_model.py b/src/models/qwen_model.py new file mode 100644 index 0000000..ae5b6ef --- /dev/null +++ b/src/models/qwen_model.py @@ -0,0 +1,85 @@ +# from vllm import LLM, SamplingParams +# from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel + + +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoTokenizer +from peft import PeftModel, PeftConfig + +from .generate_configs import GenerateConfigs +from .base_model import ToolModel + + + +class QwenModel(ToolModel): + def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + self.model_path = model_path + self.peft_path = peft_path + self.template = template + self.trust_remote_code = trust_remote_code + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) + + def generate( + self, prompts: str, + template: str = None, + generate_configs: GenerateConfigs =None, + history: list = None, + ) -> list: + '''产出对应结果''' + template = self.template if template is None else template + + params = self.generate_params(generate_configs) + + if template == "default": + inputs = self.tokenizer(prompts, return_tensors="pt") + inputs["input_ids"] = inputs["input_ids"].cuda() + + inputs.update(params) + output = self.model.generate(**inputs) + predict = self.tokenizer.decode(output[0].tolist())[len(prompts):] + predict = predict.replace("<|endoftext|>", "").replace("", "") + return predict + elif template != "default": + output, _ = self.model.chat(self.tokenizer, prompts, history=history, **params) + return output + # params = self.generate_params(generate_configs) + # sampling_params = SamplingParams(**params) + # prompts = [prompts] if isinstance(prompts, str) else prompts + # outputs = self.model.generate(prompts, sampling_params) + # return [i.outputs[0].text for i in outputs] + + def generate_params( + self, generate_configs: GenerateConfigs, + ): + '''generate param''' + kargs = generate_configs.dict() + params = { + "max_new_tokens": kargs.get("max_new_tokens", 128), + "top_k": kargs.get("top_k", 50), + "top_p": kargs.get("top_p", 0.95), + "temperature": kargs.get("temperature", 1.0), + } + + # params = { + # "n": 1, + # "max_tokens": kargs.get("max_new_tokens", 128), + # "best_of": kargs.get("beam_bums", 1), + # "top_k": kargs.get("top_k", 50), + # "top_p": kargs.get("top_p", 0.95), + # "temperature": 
kargs.get("temperature", 1.0), + # "length_penalty": kargs.get("length_penalty", 1.0), + # "presence_penalty": kargs.get("presence_penalty", 1.0), + # "stop": kargs.get("stop_words", ["<|endoftext|>"]), + # } + return params + + def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): + '''加载模型''' + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) + self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() + if peft_path: + self.model = PeftModel.from_pretrained(self.model, peft_path) + + # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) \ No newline at end of file diff --git a/src/models/test.py b/src/models/test.py new file mode 100644 index 0000000..9887dba --- /dev/null +++ b/src/models/test.py @@ -0,0 +1,104 @@ +import os +import re +import json +import random +import torch +import transformers +from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, TextStreamer +end_token_id = 2 +checkpoint = "/mnt/user/230854/output/vbase-llama-16k-hf/transformers" +print(checkpoint) +print("Loading model") +model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto").half().eval() +tokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint) +print("Loading finish") +streamer = TextStreamer(tokenizer, skip_prompt=True) +torch.manual_seed(random.randint(0, 100000)) +temperature = 0.2 +top_p = 0.95 +top_k = 40 +repetition_penalty = 1.1 +output_len = 2048 +role_start = "[START]" +role_end = "[END]" + + +def change2chatml(fc_dict): + chatrounds_list = [] + if fc_dict["chatrounds"][0]["role"] == "system": + role = "system" + content = fc_dict["chatrounds"][0]["content"] + chatrounds_list.append({"role":role, "content":content}) + else: + role = "system" + content = "CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。\n你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。" + chatrounds_list.append({"role":role, "content":content}) + + if fc_dict.get("functions",[]): + role = "funcionapis" + content = "You are ToolGPT, you have access to the following APIs:" + content += json.dumps(fc_dict["functions"], ensure_ascii=False, sort_keys=True) + chatrounds_list.append({"role":role, "content":content}) + + for chat_dict in fc_dict["chatrounds"]: + if chat_dict["role"] == "user": + role = "human" + content = chat_dict["content"] + chatrounds_list.append({"role":role, "content":content}) + elif chat_dict["role"] == "assistant": + role = "bot" + if "function_call" in chat_dict: + function_call_dict = {} + function_call_dict["content"] = chat_dict["content"] + function_call_dict["name"] = chat_dict["function_call"]["name"] + function_call_dict["arguments"] = chat_dict["function_call"]["arguments"] + content = "#function"+json.dumps(function_call_dict, ensure_ascii=False) + else: + content = chat_dict["content"] + chatrounds_list.append({"role":role, "content":content}) + elif chat_dict["role"] == "function": + role = "function" + function_call_rst = {} + function_call_rst["name"] = chat_dict["name"] + function_call_rst["content"] = chat_dict["content"] + content = json.dumps(function_call_rst, ensure_ascii=False) + chatrounds_list.append({"role":role, "content":content}) + 
return chatrounds_list + + +def get_chatrounds_ids(chatrounds_list): + input_ids = [] + for chatround in chatrounds_list: + input_ids += tokenizer.encode(role_start + chatround["role"]+ role_end) + tokenizer.encode(chatround["content"], add_special_tokens=False) + [tokenizer.eos_token_id] + input_ids += tokenizer.encode(role_start + "bot" + role_end) + return input_ids + +class GetAssistantAns(): + # 按照自己推理需求自己修改代码 + + def __init__(self): + pass + + def gen_answer(self, chat_dict): + chatrounds_list = change2chatml(chat_dict) + input_ids = get_chatrounds_ids(chatrounds_list) + output_ids = model.generate(torch.tensor([input_ids]).to(model.device), max_new_tokens=output_len, num_beams=1, num_return_sequences=1, do_sample=True, temperature=temperature, top_p=top_p, eos_token_id=end_token_id, top_k=top_k, streamer=None, repetition_penalty=repetition_penalty, pad_token_id=10000)[0] + res = tokenizer.decode(output_ids[len(input_ids):-1]) + save_dict = {"role": "assistant"} + if res.startswith("#function"): + try: + res_dict = json.loads(re.sub("^#function", "", res)) + save_dict["content"] = res_dict["content"] + save_dict["function_call"] = {} + save_dict["function_call"]["name"] = res_dict["name"] + save_dict["function_call"]["arguments"] = res_dict["arguments"] + except Exception as e: + print(e) + save_dict = {"role": "assistant"} + save_dict["content"] = res + else: + save_dict["content"] = res + + print(save_dict) + + return save_dict diff --git a/src/opensource_functioncall_evalution.py b/src/opensource_functioncall_evalution.py new file mode 100644 index 0000000..2376abe --- /dev/null +++ b/src/opensource_functioncall_evalution.py @@ -0,0 +1,221 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +############################################ +# File: opensource_functioncall_evalution.py +# create by youmi +# Time: 2023-11-23 13:10 +############################################ + + +import os +import sys +import random +import time +import shutil +import json +import jieba +import re +import copy +import numpy as np +from tqdm import tqdm +from collections import Counter +from concurrent.futures import ThreadPoolExecutor, as_completed +from getAssistantAns import GetAssistantAns + + +test_ans_file_list = [ + "fcdata_zh_test.jsonl" + ] + +# 多进程评测加速 +GPU_NUM = 1 + +# function call 回复测试总数 +function_call_sum = 0 +# function call 回复正确数 +function_call_correct = 0 +# function call 回复失败数 +function_call_fail = 0 +# function call 回复失败中,本应该调用工具但是模型没有调用, 无工具识别识别错误数 +function_call_fail_functioncall = 0 +# function call 回复失败数中,因为函数名不对导致的失败数, 这部分包括模型幻觉出错 +function_call_fail_name = 0 +# function call 回复失败数中,工具名对了,但是参数不对导致的失败数 +function_call_fail_param = 0 +# function call 回复失败中 函数名幻觉的失败数 +function_call_fail_name_illusion = 0 + +# assistant ans 回复相关度列表 +assistant_ans_relevancy_list = [] + +# 推理结果 +test_result_lines = [] + +get_assistant_ans = GetAssistantAns(gpu_num=GPU_NUM) + +def remove_punctuation(text): + pattern = r'[^\w\s]' + return re.sub(pattern, '', text) + + +def cmp_arguments(args_str1, args_str2): + rtn_flag = False + try: + args_dict1 = json.loads(args_str1) + args_dict2 = json.loads(args_str2) + # 比较两个字典是否一致 + if args_dict1 == args_dict2: + rtn_flag = True + except Exception as e: + print("json.loads error: ", e) + return rtn_flag + return rtn_flag + + +# 计算两个答案的相关度 +# 要是预测回复的是functioncall类型的,相似为0 +# 要是预测回复的包含了所有要点,相似度为1 +# 相似度保存在assistant_ans_relevancy_list中 +def calc_relevancy(ass_predict, ass_truth, chatrounds): + global assistant_ans_relevancy_list + if "function_call" in ass_predict: + 
assistant_ans_relevancy_list.append(0) + return + # 将user 和 function 的部分组合 + content_msg = "" + for chatround in chatrounds["chatrounds"]: + if chatround["role"] == "user": + content_msg += chatround["content"] + elif chatround["role"] == "function": + content_msg += chatround["content"] + content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg))) + ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"]))) + ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"]))) + relative_counter = content_msg_counter & ass_truth_counter + len_relative = sum(relative_counter.values()) + predict_relative = ass_predict_counter & relative_counter + + if len_relative == 0: + # 要是标准答案和问题相关词都无 直接给1 + assistant_ans_relevancy_list.append(1) + else: + # 交集与相关词的占比 + assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative) + + + + +def calc_llm_index(ass_predict, ass_truth, chatrounds): + global function_call_sum, function_call_correct, function_call_fail, function_call_fail_functioncall, function_call_fail_name, function_call_fail_name_illusion, function_call_fail_param + + chatrounds_functionname_list = [] + for function_dict in chatrounds.get("functions", []): + chatrounds_functionname_list.append(function_dict["name"]) + + if "function_call" in ass_truth: + function_call_sum += 1 + if "function_call" not in ass_predict: + function_call_fail += 1 + function_call_fail_functioncall += 1 + elif ass_predict["function_call"]["name"] not in chatrounds_functionname_list: + # 模型幻觉 + function_call_fail += 1 + function_call_fail_name += 1 + function_call_fail_name_illusion += 1 + else: + function_call_name_label = False + function_call_args_label = False + if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]: + function_call_name_label = True + if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): + function_call_args_label = True + else: + function_call_fail_param += 1 + else: + function_call_fail_name += 1 + + if function_call_name_label and function_call_args_label: + function_call_correct += 1 + else: + function_call_fail += 1 + else: + calc_relevancy(ass_predict, ass_truth, chatrounds) + + +def print_result(): + # 打印指标结果 + print("=============统计数据=========================") + print(f"function_call_sum: {function_call_sum}") + print(f"function_call_correct: {function_call_correct}") + print(f"function_call_fail: {function_call_fail}") + print(f"function_call_fail_functioncall: {function_call_fail_functioncall}") + print(f"function_call_fail_name: {function_call_fail_name}") + print(f"function_call_fail_param: {function_call_fail_param}") + print(f"function_call_fail_name_illusion: {function_call_fail_name_illusion}") + print(f"assistant_ans_sum: {len(assistant_ans_relevancy_list)}") + print(f"assistant_ans_relevancy: {np.mean(assistant_ans_relevancy_list)}") + print("=============实验结果=========================") + function_call_correct_rate = function_call_correct/function_call_sum + function_call_fail_rate = function_call_fail/function_call_sum + function_call_fail_functioncall_rate = function_call_fail_functioncall/function_call_fail if function_call_fail else 0 + function_call_fail_name_rate = function_call_fail_name/function_call_fail if function_call_fail else 0 + function_call_fail_param_rate = function_call_fail_param/function_call_fail if function_call_fail else 0 + function_call_fail_name_illusion_rate = 
function_call_fail_name_illusion/function_call_fail if function_call_fail else 0 + print(f"工具识别正确率fccr: {function_call_correct_rate}") + print(f"工具识别失败率fcfr: {function_call_fail_rate}") + print(f"工具调用识别失败占比fcffr: {function_call_fail_functioncall_rate}") + print(f"工具名识别失败占比fcfnr: {function_call_fail_name_rate}") + print(f"工具参数识别失败占比fcfpr: {function_call_fail_param_rate}") + print(f"工具幻觉识别失败占比fcfnir: {function_call_fail_name_illusion_rate}") + print(f"助手回复答案相关度aar: {np.mean(assistant_ans_relevancy_list)}") + print("==============================================") + # 保存数据 + with open("test_result_data.jsonl","w") as fw: + for line in test_result_lines: + print(line, file=fw) + + +def test_process(test_lines, gpu_index): + global test_result_lines + for line in tqdm(test_lines, desc="Process%02d"%(gpu_index)): + chat_dict = json.loads(line) + test_dict = {} + test_dict["functions"] = chat_dict["functions"] + test_dict["chatrounds"] = [] + for chatround in chat_dict["chatrounds"]: + if chatround["role"] == "assistant": + ass_predict = get_assistant_ans.gen_answer(test_dict, gpu_index=gpu_index) + save_dict = copy.deepcopy(test_dict) + save_dict["chatrounds"].append(ass_predict) + test_result_lines.append(json.dumps(save_dict, ensure_ascii=False)) + calc_llm_index(ass_predict, chatround, test_dict) + test_dict["chatrounds"].append(chatround) + + +def main(): + pool = ThreadPoolExecutor(max_workers=GPU_NUM) + + test_lines = [] + for test_ans_file in test_ans_file_list: + print(test_ans_file) + with open(test_ans_file, "r") as f: + lines = f.readlines() + test_lines += lines + + batch_num = len(test_lines)//GPU_NUM + int(len(test_lines)%GPU_NUM>0) + + obj_list = [] + for idx in range(GPU_NUM): + batch_test_lines = test_lines[idx*batch_num:(idx+1)*batch_num] + obj = pool.submit(test_process, batch_test_lines, gpu_index=idx) + obj_list.append(obj) + + for future in as_completed(obj_list): + # 暂时留在这里,但是其实没有返回数据 + data = future.result() + + print_result() + +if __name__ == "__main__": + main() diff --git a/src/prompts/__init__.py b/src/prompts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/prompts/base_prompts_config.py b/src/prompts/base_prompts_config.py new file mode 100644 index 0000000..c5071b8 --- /dev/null +++ b/src/prompts/base_prompts_config.py @@ -0,0 +1,21 @@ + +TOOL_FILL_BASE_PROMPT = '''你现在是一位参数填充助手,帮助我从历史问题问答中抽取出指定API入参结构所需要的参数信息 +HISTORY_QUESTION: {query} +API_SCHEMA: {api_schema} +返回json结构的API调用参数: +''' + + +TOOL_PARSER_BASE_PROMPT = '''你现在是一位API调用解析,帮助我生成可解析API_RESPONSE来回答用户问题的代码 +HISTORY_QUESTION: {query} +API_SCHEMA: {api_schema} +API_RESPONSE: {response} +返回解析response的代码: +''' + +TOOL_SUMMARY_BASE_PROMPT = '''你现在是一位API调用总结助手,帮助我从API的RESPONSE中获取到特定的信息,来回答用户问题 +HISTORY_QUESTION: {query} +API_SCHEMA: {api_schema} +API_RESPONSE: {response} +返回回答结果: +''' diff --git a/src/qwen_eval_main.py b/src/qwen_eval_main.py new file mode 100644 index 0000000..77c7ab9 --- /dev/null +++ b/src/qwen_eval_main.py @@ -0,0 +1,59 @@ +import os, sys + +from src.datasets import ToolFillDataset, ToolParserDataset, ToolSummaryDataset, FuncCallDataset +from src.evals import ToolFillEvalution, ToolParserEvalution, ToolSummaryEvalution, FuncCallEvalution +from src.models import QwenModel, ToolModel, InternlmModel +from src.models.generate_configs import GenerateConfigs +from src.prompts.base_prompts_config import TOOL_FILL_BASE_PROMPT, TOOL_PARSER_BASE_PROMPT, TOOL_SUMMARY_BASE_PROMPT + +import warnings +import re + +# 定义要过滤的警告消息内容 +filtered_content = "for open-end generation" +# 
Filter out warning messages containing this content +warnings.filterwarnings("ignore", message=re.escape(filtered_content)) +from src.utils.jsonl_utils import save_to_jsonl_file + + +model_infos = [ + {"model_name": "", "template": "chatml", "model_path": "", + "peft_path": "", "model_class": QwenModel}] + +datainfos = [ + {"dataset_path": "~/fcdata_luban_zh_test.jsonl", "dataset_name": "fcdata_luban_zh", "tool_task": "func_call"}, + {"dataset_path": "~/test_datas/fcdata_zh_test_v1.jsonl", "dataset_name": "fcdata_zh", "tool_task": "func_call"}, +] + +save_path = "" + + +for model_info in model_infos: + print(f"******** model_name: {model_info['model_name']} *****") + model_path = model_info["model_path"] + peft_path = model_info["peft_path"] + template = model_info["template"] + + tool_model = model_info["model_class"](model_path, peft_path, template, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25) + + for datainfo in datainfos: + + print(f"******** dataset_name: {datainfo['dataset_name']} *****") + + dataset_name = datainfo["dataset_name"] + tool_task = datainfo["tool_task"] + dataset_path = datainfo["dataset_path"] + funccall_dataset = FuncCallDataset(dataset_name, tool_task, dataset_path) + + generate_configs = GenerateConfigs(max_new_tokens=256, temperature=0.2, stop_words=["<|endoftext|>"]) + + funccall_evalution = FuncCallEvalution( + model=tool_model, + dataset=funccall_dataset, + base_prompt=TOOL_FILL_BASE_PROMPT, + template=model_info["template"], + generate_configs=generate_configs, + ) + metric = funccall_evalution.calc() + + # save predict results to local + save_to_jsonl_file(funccall_evalution.predicts, f"{save_path}/{model_info['model_name']}/{datainfo['dataset_name']}/result.jsonl") \ No newline at end of file diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py new file mode 100644 index 0000000..9219c05 --- /dev/null +++ b/src/utils/json_utils.py @@ -0,0 +1,31 @@ +import json, re, os + + +def flatten_json(nested_json, parent_key='', sep='_'): + """Flatten a nested JSON object""" + items = [] + for key, value in nested_json.items(): + new_key = f"{parent_key}{sep}{key}" if parent_key else key + if isinstance(value, dict): + items.extend(flatten_json(value, new_key, sep=sep).items()) + elif isinstance(value, list): + value_c = sorted(value) + for i, v in enumerate(value_c): + new_item = flatten_json(v, f"{new_key}{sep}{i}", sep=sep) + items.extend(new_item.items()) + else: + items.append((new_key, value)) + return dict(items) + + +def read_json_file(filename): + with open(filename, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_to_json_file(data, filename, encoding="utf-8"): + dir_name = os.path.dirname(filename) + if not os.path.exists(dir_name): os.makedirs(dir_name) + + with open(filename, "w", encoding=encoding) as f: + json.dump(data, f, indent=2, ensure_ascii=False) \ No newline at end of file diff --git a/src/utils/jsonl_utils.py b/src/utils/jsonl_utils.py new file mode 100644 index 0000000..b9ede96 --- /dev/null +++ b/src/utils/jsonl_utils.py @@ -0,0 +1,19 @@ +import re, json, os, copy, traceback + + +def read_jsonl_file(filename): + data = [] + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data.append(json.loads(line)) + return data + + +def save_to_jsonl_file(data, filename): + dir_name = os.path.dirname(filename) + if not os.path.exists(dir_name): os.makedirs(dir_name) + + with open(filename, "w", encoding="utf-8") as f: + for item in data: + f.write(json.dumps(item, ensure_ascii=False) + "\n") +
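
Note: src/opensource_functioncall_evalution.py imports GetAssistantAns from a local getAssistantAns module, instantiates it as GetAssistantAns(gpu_num=GPU_NUM), and calls gen_answer(test_dict, gpu_index=...). The reference implementation added in src/models/test.py exposes neither of those parameters, so the glue module has to be supplied separately. Below is a minimal, hypothetical sketch of a getAssistantAns.py that satisfies only the interface the evaluation script relies on; the placeholder logic (call the first declared tool with empty arguments, otherwise echo the last user turn) is an assumption and should be replaced with real model inference, for example by wrapping the generation code from src/models/test.py.

# getAssistantAns.py -- hypothetical stub of the interface expected by
# opensource_functioncall_evalution.py; replace gen_answer with real inference.
import json


class GetAssistantAns:
    def __init__(self, gpu_num: int = 1):
        # The evaluation script passes gpu_num; a real implementation would
        # load one model replica per GPU here.
        self.gpu_num = gpu_num

    def gen_answer(self, chat_dict: dict, gpu_index: int = 0) -> dict:
        # chat_dict has the same shape as one line of fcdata_zh_test.jsonl:
        # {"functions": [...], "chatrounds": [{"role": ..., "content": ...}, ...]}
        # The return value must follow the {"role": "assistant", "content": ...,
        # "function_call": {"name": ..., "arguments": ...}} convention so that
        # calc_llm_index / calc_relevancy can score it.
        if chat_dict.get("functions"):
            # Placeholder: pretend to call the first declared tool with empty arguments.
            return {
                "role": "assistant",
                "content": "",
                "function_call": {
                    "name": chat_dict["functions"][0]["name"],
                    "arguments": json.dumps({}, ensure_ascii=False),
                },
            }
        # Placeholder: echo the most recent user turn when no tools are declared.
        last_user = next(
            (r["content"] for r in reversed(chat_dict["chatrounds"]) if r["role"] == "user"),
            "",
        )
        return {"role": "assistant", "content": last_user}

A production version would shard requests across the gpu_num devices but keep the same return structure, since calc_llm_index and calc_relevancy key off the "function_call" and "content" fields of the returned assistant turn.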