diff --git a/.github/scripts/pr_oc_score_assert.py b/.github/scripts/pr_oc_score_assert.py new file mode 100644 index 000000000..5c4bb85b0 --- /dev/null +++ b/.github/scripts/pr_oc_score_assert.py @@ -0,0 +1,77 @@ +import csv +import os + +import pytest + +output_path = 'regression_result' +model = 'internlm-chat-7b-hf' +dataset = 'siqa' + + +@pytest.fixture() +def result_scores(): + file = find_csv_files(output_path) + if file is None: + return None + return read_csv_file(file) + + +@pytest.mark.usefixtures('result_scores') +class TestChatScore: + """Test cases for chat model.""" + + def test_model_dataset_score(self, result_scores): + result_score = result_scores.get(model).get(dataset) + assert_score(result_score, 73.59) + + +def assert_score(score, baseline): + if score is None or score == '-': + assert False, 'value is none' + if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97): + print(score + ' between ' + str(baseline * 0.97) + ' and ' + + str(baseline * 1.03)) + assert True + else: + assert False, score + ' not between ' + str( + baseline * 0.97) + ' and ' + str(baseline * 1.03) + + +def find_csv_files(directory): + csv_files = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.csv'): + csv_files.append(os.path.join(root, file)) + if len(csv_files) > 1: + raise 'have more than 1 result file, please check the result manually' + if len(csv_files) == 0: + return None + return csv_files[0] + + +def read_csv_file(file_path): + with open(file_path, 'r') as csvfile: + reader = csv.DictReader(csvfile) + filtered_data = [] + + for row in reader: + filtered_row = { + k: v + for k, v in row.items() + if k not in ['version', 'metric', 'mode'] + } + filtered_data.append(filtered_row) + + result = {} + for data in filtered_data: + dataset = data.get('dataset') + for key in data.keys(): + if key == 'dataset': + continue + else: + if key in result.keys(): + result.get(key)[dataset] = data.get(key) + else: + result[key] = {dataset: data.get(key)} + return result diff --git a/.gitignore b/.gitignore index f64196b45..9d1d16607 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ configs/eval_debug*.py configs/viz_*.py data work_dirs +outputs models/* configs/internal/ # Byte-compiled / optimized / DLL files @@ -121,6 +122,7 @@ turbomind/ *.txt *.jpg *.json +*.jsonl *.csv *.npy *.c diff --git a/configs/datasets/MathBench/mathbench_2024_gen_de9ff9.py b/configs/datasets/MathBench/deprecated_mathbench_2024_gen_de9ff9.py similarity index 100% rename from configs/datasets/MathBench/mathbench_2024_gen_de9ff9.py rename to configs/datasets/MathBench/deprecated_mathbench_2024_gen_de9ff9.py diff --git a/configs/datasets/MathBench/mathbench_2024_gen_649349.py b/configs/datasets/MathBench/mathbench_2024_gen_649349.py index a73ce1d6e..b1e41e3ce 100644 --- a/configs/datasets/MathBench/mathbench_2024_gen_649349.py +++ b/configs/datasets/MathBench/mathbench_2024_gen_649349.py @@ -60,7 +60,7 @@ 'high': ['single_choice_cn', 'single_choice_en'], 'middle': ['single_choice_cn', 'single_choice_en'], 'primary': ['cloze_cn', 'cloze_en'], - 'calculate': ['cloze_en'], + 'arithmetic': ['cloze_en'], # Theory part 'college_knowledge': ['single_choice_cn','single_choice_en'], 'high_knowledge': ['single_choice_cn','single_choice_en'], @@ -102,7 +102,7 @@ dict( abbr="mathbench-" + _split + '-' + _name, type=MathBenchDataset, - path=f"./data/mathbench_v1_ori/{_split}", + path=f"./data/mathbench_v1/{_split}", name=_name, with_circular=with_circular_eval, 
reader_cfg=dict( diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_649349.py b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_649349.py new file mode 100644 index 000000000..9b036be17 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_649349.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +single_choice_prompts = { + "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n", + "single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n", + "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n", + "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n", +} + +cloze_prompts = { + "cloze_cn": [ + "Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6", + "Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5", + "Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39", + "Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8", + "Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9", + "Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29", + "Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33", + "Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8", + "Q: {question}\nA: {answer}", + ], + "cloze_en": [ + "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.", + "Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.", + "Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. 
So in total they still have 74 - 35 = 39 chocolates. The answer is 39.", + "Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.", + "Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.", + "Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.", + "Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.", + "Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.", + "Q: {question}\nA: {answer}", + ], +} + +mathbench_sets = { + # Practice Part + "college": ["single_choice_cn", "single_choice_en"], + "high": ["single_choice_cn", "single_choice_en"], + "middle": ["single_choice_cn", "single_choice_en"], + "primary": ["cloze_cn", "cloze_en"], + "calculate": ["cloze_en"], + # Theory part + "college_knowledge": ["single_choice_cn", "single_choice_en"], + "high_knowledge": ["single_choice_cn", "single_choice_en"], + "middle_knowledge": ["single_choice_cn", "single_choice_en"], + "primary_knowledge": ["single_choice_cn", "single_choice_en"], +} + +# Generate reasoning path or not, only for single choice +with_reasoning = False + +# Use circular evaluation or not +with_circular_eval = False + +mathbench_datasets = [] + +for _split in list(mathbench_sets.keys()): + for _name in mathbench_sets[_split]: + mathbench_reader_cfg = dict( + input_columns=["question"], + output_column="answer", + ) + + if "single_choice" in _name: + if with_reasoning: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + "_with_reasoning"]), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template={answer: f"{single_choice_prompts[_name]}{answer}" for answer in ['A', 'B', 'C', 'D']}), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + + if "single_choice" in _name: + pred_postprocessor = dict(type=first_option_postprocess, options="ABCD") + else: + pred_postprocessor = dict(type=mathbench_postprocess, 
name=_name) + if "single_choice" in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + mathbench_eval_cfg = dict( + evaluator=evaluator, + pred_postprocessor=pred_postprocessor, + ) + + mathbench_datasets.append( + dict( + abbr="mathbench-" + _split + "-" + _name, + type=MathBenchDataset, + path=f"./data/mathbench_v1_ori/{_split}", + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py index a4ae1207f..942566fac 100644 --- a/configs/datasets/MathBench/mathbench_gen.py +++ b/configs/datasets/MathBench/mathbench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mathbench_2024_gen_de9ff9 import mathbench_datasets # noqa: F401, F403 + from .mathbench_2024_gen_649349 import mathbench_datasets # noqa: F401, F403 diff --git a/configs/datasets/apps/apps_mini_gen_c7893a.py b/configs/datasets/apps/apps_mini_gen_c7893a.py index dea71c95d..e9b90ca89 100644 --- a/configs/datasets/apps/apps_mini_gen_c7893a.py +++ b/configs/datasets/apps/apps_mini_gen_c7893a.py @@ -19,7 +19,7 @@ dict( type=APPS_miniDataset, abbr="apps_mini", - path="codeparrot_mini/apps", + path="./data/apps_mini", num_repeats=1, reader_cfg=APPS_reader_cfg, infer_cfg=APPS_infer_cfg, diff --git a/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py b/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py new file mode 100644 index 000000000..c2075289a --- /dev/null +++ b/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py @@ -0,0 +1,102 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import CEvalDataset + +ceval_subject_mapping = { + 'computer_network': ['Computer Network', '计算机网络', 'STEM'], + 'operating_system': ['Operating System', '操作系统', 'STEM'], + 'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'], + 'college_programming': ['College Programming', '大学编程', 'STEM'], + 'college_physics': ['College Physics', '大学物理', 'STEM'], + 'college_chemistry': ['College Chemistry', '大学化学', 'STEM'], + 'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'], + 'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'], + 'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'], + 'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'], + 'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'], + 'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'], + 'high_school_physics': ['High School Physics', '高中物理', 'STEM'], + 'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'], + 'high_school_biology': ['High School Biology', '高中生物', 'STEM'], + 'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'], + 'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'], + 'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'], + 'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'], + 'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'], + 'college_economics': ['College Economics', '大学经济学', 'Social Science'], + 'business_administration': ['Business Administration', '工商管理', 
'Social Science'], + 'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'], + 'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'], + 'education_science': ['Education Science', '教育学', 'Social Science'], + 'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'], + 'high_school_politics': ['High School Politics', '高中政治', 'Social Science'], + 'high_school_geography': ['High School Geography', '高中地理', 'Social Science'], + 'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'], + 'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'], + 'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'], + 'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'], + 'logic': ['Logic', '逻辑学', 'Humanities'], + 'law': ['Law', '法学', 'Humanities'], + 'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'], + 'art_studies': ['Art Studies', '艺术学', 'Humanities'], + 'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'], + 'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'], + 'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'], + 'high_school_history': ['High School History', '高中历史', 'Humanities'], + 'middle_school_history': ['Middle School History', '初中历史', 'Humanities'], + 'civil_servant': ['Civil Servant', '公务员', 'Other'], + 'sports_science': ['Sports Science', '体育学', 'Other'], + 'plant_protection': ['Plant Protection', '植物保护', 'Other'], + 'basic_medicine': ['Basic Medicine', '基础医学', 'Other'], + 'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'], + 'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'], + 'accountant': ['Accountant', '注册会计师', 'Other'], + 'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'], + 'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'], + 'tax_accountant': ['Tax Accountant', '税务师', 'Other'], + 'physician': ['Physician', '医师资格', 'Other'], +} +ceval_all_sets = list(ceval_subject_mapping.keys()) + +ceval_datasets = [] +for _split in ["val", "test"]: + for _name in ceval_all_sets: + ceval_reader_cfg = dict( + input_columns=["question", "A", "B", "C", "D"], + output_column="answer", + train_split="dev", + test_split=_split, + ) + + _ch_name = ceval_subject_mapping[_name][1] + + hint = f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。" + question_and_options = "{question}\nA. {A}\nB. {B}\nC. {C}\nD. 
{D}" + ceval_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={answer: f"{question_and_options}\n答案: {answer}\n" for answer in ["A", "B", "C", "D"]}, + ), + prompt_template=dict( + type=PromptTemplate, + template={answer: f"{hint}\n{question_and_options}\n答案: {answer}" for answer in ["A", "B", "C", "D"]}, + ice_token="", + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=PPLInferencer), + ) + + ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + + ceval_datasets.append( + dict( + type=CEvalDataset, + path="./data/ceval_internal/formal_ceval", + name=_name, + abbr="ceval-" + _name if _split == "val" else "ceval-test-" + _name, + reader_cfg=ceval_reader_cfg, + infer_cfg=ceval_infer_cfg, + eval_cfg=ceval_eval_cfg, + )) diff --git a/configs/datasets/collections/chat_core.py b/configs/datasets/collections/chat_core.py index 2b64f5b62..f87d9bfd5 100644 --- a/configs/datasets/collections/chat_core.py +++ b/configs/datasets/collections/chat_core.py @@ -12,7 +12,7 @@ from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from ..math.math_evaluatorv2_gen_265cce import math_datasets + from ..math.math_evaluatorv2_gen_cecb31 import math_datasets from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets diff --git a/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py new file mode 100644 index 000000000..5dad31d42 --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator + +gsm8k_reader_cfg = dict(input_columns=["question"], output_column="answer") + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="{question}\nPlease reason step by step, and put your final answer within \\boxed{}."), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=Gsm8kEvaluator), + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), +) + +gsm8k_datasets = [ + dict( + abbr="gsm8k", + type=GSM8KDataset, + path="./data/gsm8k", + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/configs/datasets/math/math_evaluatorv2_gen_265cce.py b/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py similarity index 100% rename from configs/datasets/math/math_evaluatorv2_gen_265cce.py rename to configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py diff --git a/configs/datasets/math/math_0shot_gen_393424.py b/configs/datasets/math/math_0shot_gen_393424.py new file mode 100644 index 000000000..29db75928 --- /dev/null +++ b/configs/datasets/math/math_0shot_gen_393424.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from 
opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=["problem"], output_column="solution") + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="{problem}\nPlease reason step by step, and put your final answer within \\boxed{}."), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version="v2"), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr="math", + path="./data/math/math.json", + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/configs/datasets/math/math_evaluatorv2_gen_cecb31.py b/configs/datasets/math/math_evaluatorv2_gen_cecb31.py new file mode 100644 index 000000000..d603bac18 --- /dev/null +++ b/configs/datasets/math/math_evaluatorv2_gen_cecb31.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"), + dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"), + dict(role="BOT", prompt="We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"), + dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. 
\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"), + dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py index 59464dbc7..e31d174e5 100644 --- a/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -91,7 +91,7 @@ creation_prompt = """ 请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 评分要求(重要性依次递减): -1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 +1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答 3. 好的回答必须具有创造性的词语和表达丰富度 @@ -99,7 +99,7 @@ {question} """ + base_prompt -sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt} +sub_map = {"language": language_prompt, "knowledge": knowledge_prompt, "reason_v2": reason_prompt, "math_v2": math_prompt, "creationv2_zh": creation_prompt} for _name, _prompt in sub_map.items(): subjective_infer_cfg = dict( diff --git a/configs/datasets/subjective/compassarena/compassarena_compare_creationv3.py b/configs/datasets/subjective/compassarena/compassarena_compare_creationv3.py new file mode 100644 index 000000000..9518ac2bc --- /dev/null +++ b/configs/datasets/subjective/compassarena/compassarena_compare_creationv3.py @@ -0,0 +1,145 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassArenaDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'ref'], + output_column='judge', + ) + +data_path ="data/subjective/compass_arena" + +subjective_datasets = [] + +base_prompt = """ + +[回答1开始] +{prediction} +[回答1结束] + +[回答2开始] +{prediction2} +[回答2结束] + +根据评分要求,在以下 3 个选项中做出选择: +A. 回答1更好 +B. 回答2更好 +C. 回答1、2平局 +并提供你的解释原因。 + +如果你认为回答1更好,你的输出应形如: +选择:A +原因:blahblah blahblah\n + +如果你认为回答2更好,你的输出应形如: +选择:B +原因:blahblah blahblah\n + +如果你认为回答1、2打成平手,你的输出应形如: +选择:C +原因:blahblah blahblah\n +""" + +knowledge_prompt = """ +请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 更好的回答能与参考答案吻合或表明参考答案的意思。 +2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。 +3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + + +language_prompt = """ +请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。 +2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等 +3. 
在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + + +math_prompt = """ +请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 更好的回答的答案能和参考答案一致。 +2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。 +3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + +reason_prompt = math_prompt + +creation_prompt = """ +请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 +2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答 +3. 好的回答必须具有创造性的词语和表达丰富度 + +[用户问题] +{question} +""" + base_prompt + +sub_map = {"creationv3": creation_prompt} + +for _name, _prompt in sub_map.items(): + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt="{question}" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + infer_order='double', + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = _prompt + ), + ]), + ), + ), + pred_role="BOT", + ) + + subjective_datasets.append( + dict( + abbr=f"{_name}", + type=CompassArenaDataset, + path=data_path, + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/taco/taco_gen_c7893a.py b/configs/datasets/taco/taco_gen_c7893a.py index edb9d62cb..300a03273 100644 --- a/configs/datasets/taco/taco_gen_c7893a.py +++ b/configs/datasets/taco/taco_gen_c7893a.py @@ -19,7 +19,7 @@ dict( type=TACODataset, abbr="TACO", - path='BAAI/TACO', + path='./data/BAAI-TACO', num_repeats = 1, reader_cfg=TACO_reader_cfg, infer_cfg=TACO_infer_cfg, diff --git a/configs/eval_internlm2_chat_keyset.py b/configs/eval_internlm2_chat_keyset.py index ec3ca84e7..27e95a4cf 100644 --- a/configs/eval_internlm2_chat_keyset.py +++ b/configs/eval_internlm2_chat_keyset.py @@ -6,7 +6,7 @@ from .datasets.agieval.agieval_gen_64afd3 import agieval_datasets from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.math.math_evaluatorv2_gen_265cce import math_datasets + from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets diff --git a/configs/models/mistral/hf_mistral_7b_v0_2.py b/configs/models/mistral/hf_mistral_7b_v0_2.py new file mode 100644 index 000000000..02a0a0886 --- /dev/null +++ b/configs/models/mistral/hf_mistral_7b_v0_2.py @@ -0,0 +1,23 @@ +from opencompass.models import HuggingFaceCausalLM + + +models = [ + dict( + abbr='mistral-7b-v0.2-hf', + type=HuggingFaceCausalLM, + path='alpindale/Mistral-7B-v0.2-hf', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/nanbeige/hf_nanbeige2_8b_chat.py b/configs/models/nanbeige/hf_nanbeige2_8b_chat.py new file mode 100644 index 000000000..a399a5d49 --- /dev/null +++ b/configs/models/nanbeige/hf_nanbeige2_8b_chat.py @@ -0,0 +1,36 @@ +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + 
begin="<|im_start|>system\n你是一个名为\"南北阁\"的人工智能助手,正在与人类用户进行交谈。你的目标是以最有帮助和最逻辑的方式回答问题,同时确保内容的安全性。你的回答中不应包含任何有害、政治化、宗教化、不道德、种族主义、非法的内容。请确保你的回答不带有社会偏见,符合社会主义价值观。如果遇到的问题无意义或事实上不连贯,请不要回答错误的内容,而是解释问题为何无效或不连贯。如果你不知道问题的答案,也请勿提供错误的信息。<|im_end|>\n", + round=[ + dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), + ], +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='nanbeige2-8b-chat-hf', + path="Nanbeige/Nanbeige2-8B-Chat", + tokenizer_path='Nanbeige/Nanbeige2-8B-Chat', + model_kwargs=dict( + device_map='auto', + torch_dtype='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='right', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + meta_template=_meta_template, + batch_padding=False, + max_out_len=100, + max_seq_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + end_str='<|im_end|>', + ) +] diff --git a/configs/models/others/hf_dbrx_instruct.py b/configs/models/others/hf_dbrx_instruct.py new file mode 100644 index 000000000..263129e20 --- /dev/null +++ b/configs/models/others/hf_dbrx_instruct.py @@ -0,0 +1,35 @@ + +from opencompass.models import HuggingFaceCausalLM + + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), + ], +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='dbrx-instruct-hf', + path="databricks/dbrx-instruct", + model_kwargs=dict( + trust_remote_code=True, + device_map='auto', + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + use_fast=False, + trust_remote_code=True, + ), + max_out_len=100, + max_seq_len=2048, + batch_size=8, + meta_template=_meta_template, + run_cfg=dict(num_gpus=8, num_procs=1), + end_str='<|im_end|>', + batch_padding=True, + ) +] diff --git a/configs/summarizers/compass_knowledge.py b/configs/summarizers/compass_knowledge.py index dd46e8d86..336ed37c7 100644 --- a/configs/summarizers/compass_knowledge.py +++ b/configs/summarizers/compass_knowledge.py @@ -12,7 +12,6 @@ {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, ] -'compassbench_v1_knowledge-mixed-cloze_en' summarizer = dict( dataset_abbrs=[ 'knowledge_perf_4_and_cloze', diff --git a/configs/summarizers/compassbench_v1_objective.py b/configs/summarizers/compassbench_v1_objective.py new file mode 100644 index 000000000..cf1a68b53 --- /dev/null +++ b/configs/summarizers/compassbench_v1_objective.py @@ -0,0 +1,227 @@ + +from mmengine.config import read_base + +with read_base(): + from .groups.cibench import cibench_summary_groups + from .groups.plugineval import plugineval_summary_groups + + +compassbench_v1_language_names = [ + # ['information_retrieval_en', 'score'], + # ['information_retrieval_zh', 'score'], + ['intention_recognition_en_circular', 'acc_origin'], + ['intention_recognition_en_circular', 'perf_circular'], + ['intention_recognition_zh_circular', 'acc_origin'], + ['intention_recognition_zh_circular', 'perf_circular'], + ['sentiment_analysis_en_circular', 'acc_origin'], + ['sentiment_analysis_en_circular', 'perf_circular'], + ['sentiment_analysis_zh_circular', 'acc_origin'], + ['sentiment_analysis_zh_circular', 'perf_circular'], + ['translation', 'score'], + ['content_critic_en_circular', 'acc_origin'], + ['content_critic_en_circular', 
'perf_circular'], + ['content_critic_zh_circular', 'acc_origin'], + ['content_critic_zh_circular', 'perf_circular'], + ['content_summarization_en', 'rouge1'], + ['content_summarization_zh', 'rouge1'], + ['traditional_cultural_understanding_zh_circular', 'acc_origin'], + ['traditional_cultural_understanding_zh_circular', 'perf_circular'], + ['chinese_semantic_understanding_zh_circular', 'acc_origin'], + ['chinese_semantic_understanding_zh_circular', 'perf_circular'], +] + +compassbench_v1_language_summary_groups = [ + {'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']}, + {'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']}, + {'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']}, + + {'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']}, + {'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']}, + {'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']}, +] + +# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen` +compassbench_v1_knowledge_names = [ + 'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', + 'compassbench_v1_knowledge-humanity-single_choice_cn_circular', + 'compassbench_v1_knowledge-natural_science-single_choice_cn_circular', + 'compassbench_v1_knowledge-social_science-single_choice_cn_circular', +] + +compassbench_v1_knowledge_summary_groups = [ + {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names}, + {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, + {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, +] + +compassbench_v1_reason_summary_groups = [ + {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']}, + {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']}, + {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']}, + {'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']}, + {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']}, + + {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']}, + {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']}, + {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 
'reasonbench_en_circular']}, +] + +compassbench_v1_math_summary_groups = [ + {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, +] + + +code_passk_summary_groups = [ + # rename + {'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]}, + {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]}, + {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]}, + {'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]}, + {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]}, + {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]}, + {'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]}, + {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]}, + {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]}, + {'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]}, + {'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]}, + {'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]}, + {'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]}, + {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]}, + {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]}, + {'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]}, + {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]}, + {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]}, + # real add + {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']}, + {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']}, + {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']}, + {'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']}, +] + +agent_summary_groups = [ + # 
dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']), + # dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']), + + dict(name='cibench_template', subsets=['cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim']), + dict(name='cibench_template_cn', subsets=['cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim']), + + dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']), + dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']), + dict(name='agent', subsets=['agent_cn', 'agent_en']), +] + +other_summary_groups = [ + { + "name": "average_cn", + "subsets": [ + ["language_zh_perf_4_and_non_mcq", "naive_average"], + ["knowledge_cn", "perf_4"], + ["reasonbench_cn_circular", "perf_circular"], + ["math_perf_4_and_fill_in_blank_cn", "naive_average"], + ["code_cn", "naive_average"], + ["agent_cn", "naive_average"], + ], + }, + { + "name": "average_en", + "subsets": [ + ["language_en_perf_4_and_non_mcq", "naive_average"], + ["compassbench_v1_knowledge-mixed-cloze_en", "score"], + ["reasonbench_en_circular", "perf_circular"], + ["math_perf_4_and_fill_in_blank_en", "naive_average"], + ["code_en", "naive_average"], + ["agent_en", "naive_average"], + ], + }, + { + "name": "average", + "subsets": [ + ["language_perf_4_and_non_mcq", "naive_average"], + ["knowledge_perf_4_and_cloze", "naive_average"], + ["reasonbench", "perf_circular"], + ["math_perf_4_and_fill_in_blank", "naive_average"], + ["code", "naive_average"], + ["agent", "naive_average"], + ], + }, +] + + + + +summarizer = dict( + dataset_abbrs=[ + ['average', 'naive_average'], + ['average_cn', 'naive_average'], + ['average_en', 'naive_average'], + '', + '', + '', + + ['language_perf_4_and_non_mcq', 'naive_average'], + ['language_zh_perf_4_and_non_mcq', 'naive_average'], + ['language_en_perf_4_and_non_mcq', 'naive_average'], + ['intention_recognition_zh_circular', 'perf_circular'], + ['intention_recognition_en_circular', 'perf_circular'], + ['sentiment_analysis_zh_circular', 'perf_circular'], + ['sentiment_analysis_en_circular', 'perf_circular'], + ['translation', 'score'], + ['content_critic_zh_circular', 'perf_circular'], + ['content_critic_en_circular', 'perf_circular'], + ['content_summarization_zh', 'rouge1'], + ['content_summarization_en', 'rouge1'], + ['traditional_cultural_understanding_zh_circular', 'perf_circular'], + ['chinese_semantic_understanding_zh_circular', 'perf_circular'], + + ['knowledge_perf_4_and_cloze', 'naive_average'], + ['knowledge_cn', 'perf_4'], + ['compassbench_v1_knowledge-mixed-cloze_en', 'score'], + ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'], + ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'], + ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'], + ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'], + + ['reasonbench', 'perf_circular'], + ['reasonbench_cn_circular', 'perf_circular'], + ['reasonbench_en_circular', 'perf_circular'], + ['reasonbench_cn_commonsense_circular', 'perf_circular'], + ['reasonbench_cn_abductive_circular', 'perf_circular'], + 
['reasonbench_cn_deductive_circular', 'perf_circular'], + ['reasonbench_cn_inductive_circular', 'perf_circular'], + ['reasonbench_en_commonsense_circular', 'perf_circular'], + ['reasonbench_en_abductive_circular', 'perf_circular'], + ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], + ['reasonbench_en_inductive_circular', 'perf_circular'], + + ['math_perf_4_and_fill_in_blank', 'naive_average'], + ['math_perf_4_and_fill_in_blank_cn', 'naive_average'], + ['math_perf_4_and_fill_in_blank_en', 'naive_average'], + ['compassbench_v1_math-high-single_choice_cn', 'perf_4'], + ['compassbench_v1_math-high-single_choice_en', 'perf_4'], + ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], + ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], + ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], + ['compassbench_v1_math-primary-cloze_en', 'accuracy'], + + ['code', 'naive_average'], + ['code_cn', 'naive_average'], + ['code_en', 'naive_average'], + ['humaneval_cn_pass@1(greedy)', 'naive_average'], + ['humaneval_plus_pass@1(greedy)', 'naive_average'], + ['mbpp_cn_pass@1(greedy)', 'naive_average'], + ['sanitized_mbpp_pass@1(greedy)', 'naive_average'], + ['humanevalx', 'naive_average'], + + ['agent', 'naive_average'], + ['agent_cn', 'naive_average'], + ['agent_en', 'naive_average'], + ['cibench_template_cn', 'naive_average'], + ['cibench_template', 'naive_average'], + ['plugin_eval-mus-p10_one_review_zh', 'naive_average'], + ['plugin_eval-mus-p10_one_review', 'naive_average'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []), +) diff --git a/configs/summarizers/groups/lcbench.py b/configs/summarizers/groups/lcbench.py new file mode 100644 index 000000000..bfdc17e27 --- /dev/null +++ b/configs/summarizers/groups/lcbench.py @@ -0,0 +1,3 @@ +lcbench_summary_groups = [ + {'name': 'lcbench', 'subsets': ['lcbench_en', 'lcbench_cn']}, +] diff --git a/configs/summarizers/groups/mathbench_v1.py b/configs/summarizers/groups/mathbench_v1.py new file mode 100644 index 000000000..08b241ea4 --- /dev/null +++ b/configs/summarizers/groups/mathbench_v1.py @@ -0,0 +1,13 @@ +mathbench_v1_summary_groups = [ + {'name': 'mathbench-college_application', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-college-single_choice_en']}, + {'name': 'mathbench-high_application', 'subsets': ['mathbench-high-single_choice_cn', 'mathbench-high-single_choice_en']}, + {'name': 'mathbench-middle_application', 'subsets': ['mathbench-middle-single_choice_cn', 'mathbench-middle-single_choice_en']}, + {'name': 'mathbench-primary_application', 'subsets': ['mathbench-primary-cloze_cn', 'mathbench-primary-cloze_en', 'mathbench-calculate-cloze_en'], 'weights': {'mathbench-primary-cloze_cn': 1, 'mathbench-primary-cloze_en': 1, 'mathbench-calculate-cloze_en': 2}}, + {'name': 'mathbench-college_knowledge', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-college_knowledge-single_choice_en']}, + {'name': 'mathbench-high_knowledge', 'subsets': ['mathbench-high_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_en']}, + {'name': 'mathbench-middle_knowledge', 'subsets': ['mathbench-middle_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_en']}, + {'name': 'mathbench-primary_knowledge', 'subsets': ['mathbench-primary_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_en']}, + {'name': 'mathbench_application', 'subsets': ['mathbench-college_application', 
'mathbench-high_application', 'mathbench-middle_application', 'mathbench-primary_application']}, + {'name': 'mathbench_knowledge', 'subsets': ['mathbench-college_knowledge', 'mathbench-high_knowledge', 'mathbench-middle_knowledge', 'mathbench-primary_knowledge']}, + {'name': 'mathbench', 'subsets': ['mathbench_application', 'mathbench_knowledge']}, +] diff --git a/configs/summarizers/groups/plugineval.py b/configs/summarizers/groups/plugineval.py index c94146962..994ee1ee8 100644 --- a/configs/summarizers/groups/plugineval.py +++ b/configs/summarizers/groups/plugineval.py @@ -71,6 +71,40 @@ ['plugin_eval-review_str_v1', 'review_quality'], ] }, + + # special treatment for first 10% data points + { + 'name': 'plugin_eval-p10-instruct_v1', + 'metric': 'format_metric', + 'subsets': [ + ['plugin_eval-p10-instruct_v1', 'string_format_metric'], + ['plugin_eval-p10-instruct_v1', 'json_format_metric'], + ] + }, + { + 'name': 'plugin_eval-p10-instruct_v1', + 'metric': 'args_em_metric', + 'subsets': [ + ['plugin_eval-p10-instruct_v1', 'string_args_em_metric'], + ['plugin_eval-p10-instruct_v1', 'json_args_em_metric'], + ] + }, + { + 'name': 'plugin_eval-p10', + 'subsets': [ + ['plugin_eval-p10-instruct_v1', 'format_metric'], + ['plugin_eval-p10-instruct_v1', 'args_em_metric'], + ['plugin_eval-p10-plan_str_v1', 'f1_score'], + ['plugin_eval-p10-plan_json_v1', 'f1_score'], + ['plugin_eval-p10-reason_str_v2', 'thought'], + ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'], + ['plugin_eval-p10-retrieve_str_v2', 'name'], + ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'], + ['plugin_eval-p10-understand_str_v2', 'args'], + ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'], + ['plugin_eval-p10-review_str_v6', 'review_quality'], + ] + }, ] plugineval_summary_groups = [] diff --git a/configs/summarizers/mathbench_v1.py b/configs/summarizers/mathbench_v1.py index 2a70ea1f6..1fe4c081d 100644 --- a/configs/summarizers/mathbench_v1.py +++ b/configs/summarizers/mathbench_v1.py @@ -1,6 +1,6 @@ summarizer = dict( dataset_abbrs=[ - '######## MathBench Accuracy ########', # category + '######## MathBench Application Accuracy ########', # category ['mathbench-college-single_choice_cn', 'acc_1'], ['mathbench-college-single_choice_en', 'acc_1'], ['mathbench-high-single_choice_cn', 'acc_1'], @@ -9,15 +9,15 @@ ['mathbench-middle-single_choice_en', 'acc_1'], ['mathbench-primary-cloze_cn', 'accuracy'], ['mathbench-primary-cloze_en', 'accuracy'], - ['mathbench-calculate-cloze_en', 'accuracy'], - '######## MathBench CircularEval ########', # category + ['mathbench-arithmetic-cloze_en', 'accuracy'], + '######## MathBench Application CircularEval ########', # category ['mathbench-college-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4'], ['mathbench-high-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4'], ['mathbench-middle-single_choice_cn', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4'], - '######## MathBench Knowledge ########', # category + '######## MathBench Knowledge CircularEval ########', # category ['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college_knowledge-single_choice_en', 'perf_4'], ['mathbench-high_knowledge-single_choice_cn', 'perf_4'], @@ -26,6 +26,15 @@ ['mathbench-middle_knowledge-single_choice_en', 'perf_4'], ['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary_knowledge-single_choice_en', 'perf_4'], + '######## MathBench Knowledge Accuracy 
########', # category + ['mathbench-college_knowledge-single_choice_cn', 'acc_1'], + ['mathbench-college_knowledge-single_choice_en', 'acc_1'], + ['mathbench-high_knowledge-single_choice_cn', 'acc_1'], + ['mathbench-high_knowledge-single_choice_en', 'acc_1'], + ['mathbench-middle_knowledge-single_choice_cn', 'acc_1'], + ['mathbench-middle_knowledge-single_choice_en', 'acc_1'], + ['mathbench-primary_knowledge-single_choice_cn', 'acc_1'], + ['mathbench-primary_knowledge-single_choice_en', 'acc_1'], ], summary_groups=sum( [v for k, v in locals().items() if k.endswith("_summary_groups")], []) diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py new file mode 100644 index 000000000..0c07906f5 --- /dev/null +++ b/opencompass/cli/main.py @@ -0,0 +1,364 @@ +import argparse +import getpass +import os +import os.path as osp +from datetime import datetime + +from mmengine.config import Config, DictAction + +from opencompass.partitioners import MultimodalNaivePartitioner +from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg +from opencompass.runners import SlurmRunner +from opencompass.summarizers import DefaultSummarizer +from opencompass.utils import LarkReporter, get_logger +from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg, + fill_infer_cfg, get_config_from_arg) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Run an evaluation task') + parser.add_argument('config', nargs='?', help='Train config file path') + + # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner + # if "infer" or "eval" not specified + launch_method = parser.add_mutually_exclusive_group() + launch_method.add_argument('--slurm', + action='store_true', + default=False, + help='Whether to force tasks to run with srun. ' + 'If True, `--partition(-p)` must be set. ' + 'Defaults to False') + launch_method.add_argument('--dlc', + action='store_true', + default=False, + help='Whether to force tasks to run on dlc. If ' + 'True, `--aliyun-cfg` must be set. Defaults' + ' to False') + # multi-modal support + parser.add_argument('--mm-eval', + help='Whether or not enable multimodal evaluation', + action='store_true', + default=False) + # Add shortcut parameters (models, datasets and summarizer) + parser.add_argument('--models', nargs='+', help='', default=None) + parser.add_argument('--datasets', nargs='+', help='', default=None) + parser.add_argument('--summarizer', help='', default=None) + # add general args + parser.add_argument('--debug', + help='Debug mode, in which scheduler will run tasks ' + 'in the single process, and output will not be ' + 'redirected to files', + action='store_true', + default=False) + parser.add_argument('--dry-run', + help='Dry run mode, in which the scheduler will not ' + 'actually run the tasks, but only print the commands ' + 'to run', + action='store_true', + default=False) + parser.add_argument('-m', + '--mode', + help='Running mode. You can choose "infer" if you ' + 'only want the inference results, or "eval" if you ' + 'already have the results and want to evaluate them, ' + 'or "viz" if you want to visualize the results.', + choices=['all', 'infer', 'eval', 'viz'], + default='all', + type=str) + parser.add_argument('-r', + '--reuse', + nargs='?', + type=str, + const='latest', + help='Reuse previous outputs & results, and run any ' + 'missing jobs presented in the config. If its ' + 'argument is not specified, the latest results in ' + 'the work_dir will be reused. 
The argument should ' + 'also be a specific timestamp, e.g. 20230516_144254') + parser.add_argument('-w', + '--work-dir', + help='Work path, all the outputs will be ' + 'saved in this path, including the slurm logs, ' + 'the evaluation results, the summary results, etc.' + 'If not specified, the work_dir will be set to ' + './outputs/default.', + default=None, + type=str) + parser.add_argument( + '--config-dir', + default='configs', + help='Use the custom config directory instead of config/ to ' + 'search the configs for datasets, models and summarizers', + type=str) + parser.add_argument('-l', + '--lark', + help='Report the running status to lark bot', + action='store_true', + default=False) + parser.add_argument('--max-partition-size', + help='The maximum size of an infer task. Only ' + 'effective when "infer" is missing from the config.', + type=int, + default=40000), + parser.add_argument( + '--gen-task-coef', + help='The dataset cost measurement coefficient for generation tasks, ' + 'Only effective when "infer" is missing from the config.', + type=int, + default=20) + parser.add_argument('--max-num-workers', + help='Max number of workers to run in parallel. ' + 'Will be overrideen by the "max_num_workers" argument ' + 'in the config.', + type=int, + default=32) + parser.add_argument('--max-workers-per-gpu', + help='Max task to run in parallel on one GPU. ' + 'It will only be used in the local runner.', + type=int, + default=1) + parser.add_argument( + '--retry', + help='Number of retries if the job failed when using slurm or dlc. ' + 'Will be overrideen by the "retry" argument in the config.', + type=int, + default=2) + parser.add_argument( + '--dump-eval-details', + help='Whether to dump the evaluation details, including the ' + 'correctness of each sample, bpb, etc.', + action='store_true', + ) + # set srun args + slurm_parser = parser.add_argument_group('slurm_args') + parse_slurm_args(slurm_parser) + # set dlc args + dlc_parser = parser.add_argument_group('dlc_args') + parse_dlc_args(dlc_parser) + # set hf args + hf_parser = parser.add_argument_group('hf_args') + parse_hf_args(hf_parser) + # set custom dataset args + custom_dataset_parser = parser.add_argument_group('custom_dataset_args') + parse_custom_dataset_args(custom_dataset_parser) + args = parser.parse_args() + if args.slurm: + assert args.partition is not None, ( + '--partition(-p) must be set if you want to use slurm') + if args.dlc: + assert os.path.exists(args.aliyun_cfg), ( + 'When launching tasks using dlc, it needs to be configured ' + 'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"' + ' to specify a new path.') + return args + + +def parse_slurm_args(slurm_parser): + """These args are all for slurm launch.""" + slurm_parser.add_argument('-p', + '--partition', + help='Slurm partition name', + default=None, + type=str) + slurm_parser.add_argument('-q', + '--quotatype', + help='Slurm quota type', + default=None, + type=str) + slurm_parser.add_argument('--qos', + help='Slurm quality of service', + default=None, + type=str) + + +def parse_dlc_args(dlc_parser): + """These args are all for dlc launch.""" + dlc_parser.add_argument('--aliyun-cfg', + help='The config path for aliyun config', + default='~/.aliyun.cfg', + type=str) + + +def parse_hf_args(hf_parser): + """These args are all for the quick construction of HuggingFace models.""" + hf_parser.add_argument('--hf-path', type=str) + hf_parser.add_argument('--peft-path', type=str) + hf_parser.add_argument('--tokenizer-path', type=str) + 
hf_parser.add_argument('--model-kwargs', + nargs='+', + action=DictAction, + default={}) + hf_parser.add_argument('--tokenizer-kwargs', + nargs='+', + action=DictAction, + default={}) + hf_parser.add_argument('--max-out-len', type=int) + hf_parser.add_argument('--max-seq-len', type=int) + hf_parser.add_argument('--no-batch-padding', + action='store_true', + default=False) + hf_parser.add_argument('--batch-size', type=int) + hf_parser.add_argument('--num-gpus', type=int) + hf_parser.add_argument('--pad-token-id', type=int) + + +def parse_custom_dataset_args(custom_dataset_parser): + """These args are all for the quick construction of custom datasets.""" + custom_dataset_parser.add_argument('--custom-dataset-path', type=str) + custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str) + custom_dataset_parser.add_argument('--custom-dataset-data-type', + type=str, + choices=['mcq', 'qa']) + custom_dataset_parser.add_argument('--custom-dataset-infer-method', + type=str, + choices=['gen', 'ppl']) + + +def main(): + args = parse_args() + if args.dry_run: + args.debug = True + # initialize logger + logger = get_logger(log_level='DEBUG' if args.debug else 'INFO') + + cfg = get_config_from_arg(args) + if args.work_dir is not None: + cfg['work_dir'] = args.work_dir + else: + cfg.setdefault('work_dir', './outputs/default/') + + # cfg_time_str defaults to the current time + cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') + if args.reuse: + if args.reuse == 'latest': + if not os.path.exists(cfg.work_dir) or not os.listdir( + cfg.work_dir): + logger.warning('No previous results to reuse!') + else: + dirs = os.listdir(cfg.work_dir) + dir_time_str = sorted(dirs)[-1] + else: + dir_time_str = args.reuse + logger.info(f'Reusing experiments from {dir_time_str}') + elif args.mode in ['eval', 'viz']: + raise ValueError('You must specify -r or --reuse when running in eval ' + 'or viz mode!') + + # update "actual" work_dir + cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str) + os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) + + # dump config + output_config_path = osp.join(cfg.work_dir, 'configs', + f'{cfg_time_str}.py') + cfg.dump(output_config_path) + # Config is intentionally reloaded here because initialized + # types cannot be serialized + cfg = Config.fromfile(output_config_path, format_python_code=False) + + # report to lark bot if --lark is specified + if not args.lark: + cfg['lark_bot_url'] = None + elif cfg.get('lark_bot_url', None): + content = f'{getpass.getuser()}\'s task has been launched!' + LarkReporter(cfg['lark_bot_url']).post(content) + + if args.mode in ['all', 'infer']: + # When the user has specified --slurm or --dlc, or has not set + # "infer" in the config, we will provide a default configuration + # for infer + if (args.dlc or args.slurm) and cfg.get('infer', None): + logger.warning('You have set "infer" in the config, but ' + 'also specified --slurm or --dlc. 
' + 'The "infer" configuration will be overridden by ' + 'your runtime arguments.') + # Check whether run multimodal evaluation + if args.mm_eval: + partitioner = MultimodalNaivePartitioner( + osp.join(cfg['work_dir'], 'predictions/')) + tasks = partitioner(cfg) + exec_mm_infer_runner(tasks, args, cfg) + return + + if args.dlc or args.slurm or cfg.get('infer', None) is None: + fill_infer_cfg(cfg, args) + + if args.partition is not None: + if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner: + cfg.infer.runner.partition = args.partition + cfg.infer.runner.quotatype = args.quotatype + else: + logger.warning('SlurmRunner is not used, so the partition ' + 'argument is ignored.') + if args.debug: + cfg.infer.runner.debug = True + if args.lark: + cfg.infer.runner.lark_bot_url = cfg['lark_bot_url'] + cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'], + 'predictions/') + partitioner = PARTITIONERS.build(cfg.infer.partitioner) + tasks = partitioner(cfg) + if args.dry_run: + return + runner = RUNNERS.build(cfg.infer.runner) + # Add extra attack config if exists + if hasattr(cfg, 'attack'): + for task in tasks: + cfg.attack.dataset = task.datasets[0][0].abbr + task.attack = cfg.attack + runner(tasks) + + # evaluate + if args.mode in ['all', 'eval']: + # When user have specified --slurm or --dlc, or have not set + # "eval" in config, we will provide a default configuration + # for eval + if (args.dlc or args.slurm) and cfg.get('eval', None): + logger.warning('You have set "eval" in the config, but ' + 'also specified --slurm or --dlc. ' + 'The "eval" configuration will be overridden by ' + 'your runtime arguments.') + + if args.dlc or args.slurm or cfg.get('eval', None) is None: + fill_eval_cfg(cfg, args) + if args.dump_eval_details: + cfg.eval.runner.task.dump_details = True + + if args.partition is not None: + if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: + cfg.eval.runner.partition = args.partition + cfg.eval.runner.quotatype = args.quotatype + else: + logger.warning('SlurmRunner is not used, so the partition ' + 'argument is ignored.') + if args.debug: + cfg.eval.runner.debug = True + if args.lark: + cfg.eval.runner.lark_bot_url = cfg['lark_bot_url'] + cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/') + partitioner = PARTITIONERS.build(cfg.eval.partitioner) + tasks = partitioner(cfg) + if args.dry_run: + return + runner = RUNNERS.build(cfg.eval.runner) + + # For meta-review-judge in subjective evaluation + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + for task_part in tasks: + runner(task_part) + else: + runner(tasks) + + # visualize + if args.mode in ['all', 'eval', 'viz']: + summarizer_cfg = cfg.get('summarizer', {}) + if not summarizer_cfg or summarizer_cfg.get('type', None) is None: + summarizer_cfg['type'] = DefaultSummarizer + summarizer_cfg['config'] = cfg + summarizer = build_from_cfg(summarizer_cfg) + summarizer.summarize(time_str=cfg_time_str) + + +if __name__ == '__main__': + main() diff --git a/opencompass/datasets/NPHardEval/cmp_GCP_D.py b/opencompass/datasets/NPHardEval/cmp_GCP_D.py index f94564cc6..b8c5a6ad8 100644 --- a/opencompass/datasets/NPHardEval/cmp_GCP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_GCP_D.py @@ -1,6 +1,10 @@ import ast -import networkx as nx +try: + import networkx as nx +except ImportError: + nx = None + from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator diff --git a/opencompass/datasets/NPHardEval/cmp_TSP_D.py 
b/opencompass/datasets/NPHardEval/cmp_TSP_D.py index 52819dcec..c56df0f07 100644 --- a/opencompass/datasets/NPHardEval/cmp_TSP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_TSP_D.py @@ -1,7 +1,11 @@ import ast import json -import networkx as nx +try: + import networkx as nx +except ImportError: + nx = None + import pandas as pd from datasets import Dataset diff --git a/opencompass/datasets/NPHardEval/p_SPP.py b/opencompass/datasets/NPHardEval/p_SPP.py index 7e143664c..6ade3d827 100644 --- a/opencompass/datasets/NPHardEval/p_SPP.py +++ b/opencompass/datasets/NPHardEval/p_SPP.py @@ -1,7 +1,11 @@ import ast import json -import networkx as nx +try: + import networkx as nx +except ImportError: + nx = None + from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index d76de5c05..852d33ab0 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -3,6 +3,7 @@ from .agieval import * # noqa: F401, F403 from .anli import AnliDataset # noqa: F401, F403 from .anthropics_evals import * # noqa: F401, F403 +from .apps import * # noqa: F401, F403 from .arc import * # noqa: F401, F403 from .ax import * # noqa: F401, F403 from .bbh import * # noqa: F401, F403 @@ -94,6 +95,7 @@ from .summscreen import * # noqa: F401, F403 from .svamp import * # noqa: F401, F403 from .tabmwp import * # noqa: F401, F403 +from .taco import * # noqa: F401, F403 from .teval import * # noqa: F401, F403 from .TheoremQA import * # noqa: F401, F403 from .tnews import * # noqa: F401, F403 diff --git a/opencompass/datasets/apps.py b/opencompass/datasets/apps.py index 95c37a88d..d2ce4e346 100644 --- a/opencompass/datasets/apps.py +++ b/opencompass/datasets/apps.py @@ -19,13 +19,19 @@ import numpy as np from datasets import Dataset, DatasetDict, load_dataset, load_from_disk -from pyext import RuntimeModule + +try: + from pyext import RuntimeModule +except ImportError: + RuntimeModule = None from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils.logging import get_logger from .base import BaseDataset +logger = get_logger() TIMEOUT = 10 @@ -67,18 +73,20 @@ def load(path: str, num_repeats: int = 1): new_dataset[split] = Dataset.from_dict(new_data) # num_repeats duplicate - train_repeated = [] + # train_repeated = [] test_repeated = [] - for sample in new_dataset['train']: - train_repeated.extend([sample] * num_repeats) + # for sample in new_dataset['train']: + # train_repeated.extend([sample] * num_repeats) for sample in new_dataset['test']: test_repeated.extend([sample] * num_repeats) - dataset_train_repeated = new_dataset['train'].from_list(train_repeated) + # dataset_train_repeated = new_dataset['train'].from_list( + # train_repeated + # ) dataset_test_repeated = new_dataset['test'].from_list(test_repeated) return DatasetDict({ - 'train': dataset_train_repeated, + # 'train': dataset_train_repeated, 'test': dataset_test_repeated }) @@ -121,18 +129,20 @@ def load(path: str, num_repeats: int = 1): new_dataset[split] = Dataset.from_dict(new_data) # num_repeats duplicate - train_repeated = [] + # train_repeated = [] test_repeated = [] - for sample in new_dataset['train']: - train_repeated.extend([sample] * num_repeats) + # for sample in new_dataset['train']: + # train_repeated.extend([sample] * num_repeats) for sample in new_dataset['test']: test_repeated.extend([sample] * num_repeats) - dataset_train_repeated 
= new_dataset['train'].from_list(train_repeated) + # dataset_train_repeated = new_dataset['train'].from_list( + # train_repeated + # ) dataset_test_repeated = new_dataset['test'].from_list(test_repeated) return DatasetDict({ - 'train': dataset_train_repeated, + # 'train': dataset_train_repeated, 'test': dataset_test_repeated }) @@ -308,7 +318,10 @@ def timeout_handler(signum, frame): raise TimeoutException -signal.signal(signal.SIGALRM, timeout_handler) +try: + signal.signal(signal.SIGALRM, timeout_handler) +except AttributeError: + logger.warning('signal.SIGALRM is not available on this platform') timeout = 4 # seconds diff --git a/opencompass/datasets/custom.py b/opencompass/datasets/custom.py index 21955eebc..ad3fbe2c7 100644 --- a/opencompass/datasets/custom.py +++ b/opencompass/datasets/custom.py @@ -210,6 +210,8 @@ def make_mcq_gen_config(meta): input_columns=meta['input_columns'], output_column=meta['output_column'], ) + if 'test_range' in meta: + reader_cfg['test_range'] = meta['test_range'] infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -255,6 +257,8 @@ def make_circular_mcq_gen_config(meta): input_columns=meta['input_columns'], output_column=meta['output_column'], ) + if 'test_range' in meta: + reader_cfg['test_range'] = meta['test_range'] infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -304,6 +308,8 @@ def make_qa_gen_config(meta): input_columns=meta['input_columns'], output_column=meta['output_column'], ) + if 'test_range' in meta: + reader_cfg['test_range'] = meta['test_range'] infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -353,6 +359,8 @@ def make_mcq_ppl_config(meta): input_columns=meta['input_columns'], output_column=meta['output_column'], ) + if 'test_range' in meta: + reader_cfg['test_range'] = meta['test_range'] infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -399,6 +407,8 @@ def make_circular_mcq_ppl_config(meta): input_columns=meta['input_columns'], output_column=meta['output_column'], ) + if 'test_range' in meta: + reader_cfg['test_range'] = meta['test_range'] infer_cfg = dict( prompt_template=dict( type=PromptTemplate, diff --git a/opencompass/datasets/humanevalx.py b/opencompass/datasets/humanevalx.py index 2d8283f8e..6e9c3dff9 100644 --- a/opencompass/datasets/humanevalx.py +++ b/opencompass/datasets/humanevalx.py @@ -168,9 +168,12 @@ def _clean_up_code(text: str, language_type: str, reference) -> str: """Cleans up the generated code.""" try: # for chatGLM related text - text = eval(text) + eval_text = eval(text) except Exception: pass + else: + if isinstance(eval_text, str): + text = eval_text # extract code from code block text = text.lstrip('\n') if '```' in text: diff --git a/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py b/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py index e12a1ac46..637eae5b9 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py @@ -1,5 +1,4 @@ import re -import cn2an """ task: law article prediction @@ -15,6 +14,7 @@ def compute_ljp_article(data_dict): A reference contains a list of articles of the Criminal Law of the People's Republic of China. We compute the F1-score between the prediction and the reference. 
""" + import cn2an score_list, abstentions = [], 0 diff --git a/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py b/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py index fc5bc0dad..80b8ec2dd 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py @@ -1,9 +1,11 @@ import math -import cn2an + import re #法律判决预测-刑期预测 def compute_ljp_imprison(data_dict): + import cn2an + score_list, abstentions = [], 0 for example in data_dict: diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index 03661cf78..36bcd6d78 100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -85,6 +85,7 @@ def normalize_final_answer(final_answer: str) -> str: # Extract answer that is in LaTeX math, is bold, # is surrounded by a box, etc. + final_answer = re.sub(r'(\\text\{)\((.*?)\)(\})', '\\2', final_answer) final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer) final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer) final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer) @@ -178,10 +179,7 @@ def __init__(self, version='v1'): def score(self, predictions, references): if len(predictions) != len(references): - return { - 'error': 'predictions and references have different ' - 'length' - } + return {'error': 'preds and refrs have different length'} correct = 0 count = 0 details = [] @@ -457,9 +455,24 @@ def is_equiv(self, str1, str2, verbose=False): ss2 = strip_string_func(str2) if verbose: print(ss1, ss2) - return ss1 == ss2 + if ss1 == ss2: + return True + ss1 = normalize_final_answer(ss1) + ss2 = normalize_final_answer(ss2) + if ss1 == ss2: + return True + except Exception: + pass + + try: + ss1 = normalize_final_answer(str1) + ss2 = normalize_final_answer(str2) + if ss1 == ss2: + return True except Exception: - return str1 == str2 + pass + + return str1 == str2 @ICL_EVALUATORS.register_module() diff --git a/opencompass/datasets/mathbench.py b/opencompass/datasets/mathbench.py index 995a758e7..743a5f340 100644 --- a/opencompass/datasets/mathbench.py +++ b/opencompass/datasets/mathbench.py @@ -57,7 +57,7 @@ def load(path: str, name: str, with_circular: bool = True): """ data = [] filename = osp.join(path, f'{name}.jsonl') - with open(filename, 'r') as infile: + with open(filename, 'r', encoding='utf-8') as infile: for id, line in enumerate(infile): entry = json.loads(line) if 'cloze' in name: diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py index e9a3c81fd..e7c0f1cac 100644 --- a/opencompass/datasets/mbpp.py +++ b/opencompass/datasets/mbpp.py @@ -244,6 +244,7 @@ def score(self, predictions, references): if not isinstance(preds, list): preds = [preds] for pred in preds: + pred = self._process_answer(pred) mbpp_preds.append({'task_id': refer, 'solution': pred}) with tempfile.TemporaryDirectory() as tmp_dir: out_dir = osp.join(tmp_dir, 'mbpp_eval.jsonl') diff --git a/opencompass/datasets/taco.py b/opencompass/datasets/taco.py index b81e59c95..ea3393644 100644 --- a/opencompass/datasets/taco.py +++ b/opencompass/datasets/taco.py @@ -18,14 +18,20 @@ from unittest.mock import mock_open, patch import numpy as np -from datasets import Dataset, DatasetDict, load_dataset -from pyext import RuntimeModule +from datasets import Dataset, DatasetDict, load_from_disk + +try: + from pyext import RuntimeModule +except ImportError: + RuntimeModule = None from opencompass.openicl.icl_evaluator import 
BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils.logging import get_logger from .base import BaseDataset +logger = get_logger() TIMEOUT = 10 @@ -34,7 +40,7 @@ class TACODataset(BaseDataset): @staticmethod def load(path: str, num_repeats: int = 1): - dataset = load_dataset(path) + dataset = load_from_disk(path) new_dataset = DatasetDict() # add new column "starter" in the prompt for split in dataset.keys(): @@ -69,18 +75,20 @@ def load(path: str, num_repeats: int = 1): new_dataset[split] = Dataset.from_dict(new_data) # num_repeats duplicate - train_repeated = [] + # train_repeated = [] test_repeated = [] - for sample in new_dataset['train']: - train_repeated.extend([sample] * num_repeats) + # for sample in new_dataset['train']: + # train_repeated.extend([sample] * num_repeats) for sample in new_dataset['test']: test_repeated.extend([sample] * num_repeats) - dataset_train_repeated = new_dataset['train'].from_list(train_repeated) + # dataset_train_repeated = new_dataset['train'].from_list( + # train_repeated + # ) dataset_test_repeated = new_dataset['test'].from_list(test_repeated) return DatasetDict({ - 'train': dataset_train_repeated, + # 'train': dataset_train_repeated, 'test': dataset_test_repeated }) @@ -256,7 +264,10 @@ def timeout_handler(signum, frame): raise TimeoutException -signal.signal(signal.SIGALRM, timeout_handler) +try: + signal.signal(signal.SIGALRM, timeout_handler) +except AttributeError: + logger.warning('signal.SIGALRM is not available on this platform') timeout = 4 # seconds diff --git a/opencompass/lagent/agents/react.py b/opencompass/lagent/agents/react.py index 0232068f1..3d5c89e98 100644 --- a/opencompass/lagent/agents/react.py +++ b/opencompass/lagent/agents/react.py @@ -201,3 +201,77 @@ def chat(self, message: str) -> AgentReturn: self._session_history.append( dict(role='assistant', content=agent_return.response)) return agent_return + + +class CIReActMergeRole(CIReAct): + """If the first turn is SYSTEM, keep it as SYSTEM; later SYSTEM turns use USER. + Merge consecutive USER turns so that USER and BOT alternate.""" + + def chat(self, message: str) -> AgentReturn: + for hist in self._session_history: + if hist['role'] == 'system': + hist['role'] = self.system_role + self._inner_history = [] + # append the user message for session history + self._session_history.append(dict(role='user', content=message)) + agent_return = AgentReturn() + force_stop = False + default_response = '对不起,我无法回答你的问题' + for turn in range(self.max_turn): + prompt = self._protocol.format( + chat_history=self.session_history, + inner_step=self._inner_history, + action_executor=self._action_executor, + force_stop=force_stop) + prompt = self.merge_role(prompt) + response = self._llm.generate_from_template(prompt, 512) + self._inner_history.append(dict(role='assistant', + content=response)) + thought, action, action_input = self._protocol.parse( + response, self._action_executor) + action_return: ActionReturn = self._action_executor( + action, action_input) + action_return.thought = thought + agent_return.actions.append(action_return) + if action_return.state == ActionStatusCode.SUCCESS: + # if success, stash model response and system response + self._session_history.append( + dict(role='assistant', content=response)) + self._session_history.append( + dict( + role=self.system_role, + content=self._protocol.format_response(action_return))) + agent_return.response = action_return.result['text'] + return agent_return + elif action_return.type == self._action_executor.invalid_action.name: # noqa + 
action_return.errmsg = 'The action is invalid, please check the action name.' # noqa + self._inner_history.append( + dict(role=self.system_role, + content=self._protocol.format_response(action_return))) + if turn == self.max_turn - 1: + force_stop = True + agent_return.response = default_response + self._session_history.append( + dict(role='assistant', content=agent_return.response)) + return agent_return + + def merge_role(self, inputs): + messages = [] + msg_buffer, last_role = [], None + for index, item in enumerate(inputs): + if index == 0 and item['role'] == 'system': + role = 'system' + elif item['role'] == 'assistant': + role = 'assistant' + else: + role = 'user' + if role != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['content']) + last_role = role + messages.append({'content': '\n'.join(msg_buffer), 'role': last_role}) + return messages diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 3d4d6fccd..1d6589b51 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -1,7 +1,7 @@ from .accessory import LLaMA2AccessoryModel # noqa: F401 from .ai360_api import AI360GPT # noqa: F401 from .alaya import AlayaLM # noqa: F401 -from .baichuan_api import BaiChuan # noqa: F401 +from .baichuan_api import BaiChuan, BaiChuan3 # noqa: F401 from .baidu_api import ERNIEBot # noqa: F401 from .base import BaseModel, LMTemplateParser # noqa from .base_api import APITemplateParser, BaseAPIModel # noqa @@ -12,12 +12,14 @@ from .huggingface import HuggingFace # noqa: F401, F403 from .huggingface import HuggingFaceCausalLM # noqa: F401, F403 from .huggingface import HuggingFaceChatGLM3 # noqa: F401, F403 +from .hunyuan_api import Hunyuan # noqa: F401 from .intern_model import InternLM # noqa: F401, F403 from .krgpt_api import KrGPT # noqa: F401 from .lightllm_api import LightllmAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401, F403 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 from .minimax_api import MiniMax # noqa: F401 +from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401, F403 from .moonshot_api import MoonShot # noqa: F401 @@ -28,7 +30,9 @@ from .sensetime_api import SenseTime # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 +from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 from .xunfei_api import XunFei # noqa: F401 +from .yayi_api import Yayi # noqa: F401 from .zhipuai_api import ZhiPuAI # noqa: F401 from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401 diff --git a/opencompass/models/ai360_api.py b/opencompass/models/ai360_api.py index ddc2a6258..87c805645 100644 --- a/opencompass/models/ai360_api.py +++ b/opencompass/models/ai360_api.py @@ -60,13 +60,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
@@ -83,13 +83,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/baichuan_api.py b/opencompass/models/baichuan_api.py index 35f6f2743..b4cc0dc2f 100644 --- a/opencompass/models/baichuan_api.py +++ b/opencompass/models/baichuan_api.py @@ -59,13 +59,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -82,13 +82,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -157,3 +157,127 @@ def _generate( max_num_retries += 1 raise RuntimeError(response) + + +class BaiChuan3(BaseAPIModel): + + def __init__( + self, + path: str, + api_key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): # noqa E125 + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + + self.api_key = api_key + self.url = url + self.model = path + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + history = [] + prompt = input + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + role = 'BOT' if item['role'] == 'BOT' else 'USER' + if role != last_role and last_role is not None: + messages.append({ + 'data': '\n'.join(msg_buffer), + 'from': 0 if last_role == 'USER' else 1 + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = role + messages.append({ + 'data': '\n'.join(msg_buffer), + 'from': 0 if last_role == 'USER' else 1 + }) + history = messages[:-1] + prompt = messages[-1]['data'] + + data = { + 'access_token_key': self.api_key, + 'app_info': { + 'id': 123 + }, + 'prompt': { + 'data': prompt + }, + 'history': history, + } + + for _ in range(self.retry): + try: + response = requests.post(self.url, json=data) + except Exception as e: + print(e) + continue + if response is None or response.status_code != 200: + code = response.status_code if response else -1 + print(f'[chat_api]-[failed] request err, status_code: {code}') + continue + try: + response = response.json() + except Exception as e: + print(e) + continue + print(response) + status = response.get('answer', {}).get('status', 0) + session_status = response.get('session_info', {}).get('status', 0) + if status < 0 or session_status < 0: + print('[chat_api]-[warn] prompt or answer is unsafe') + return 'Rejection: unsafe prompt or answer' + return response.get('answer', {}).get('data', '') + + raise RuntimeError(response['msg']) diff --git a/opencompass/models/baidu_api.py b/opencompass/models/baidu_api.py index 0dd9c858f..ef20807e7 100644 --- a/opencompass/models/baidu_api.py +++ b/opencompass/models/baidu_api.py @@ -88,13 +88,13 @@ def _generate_access_token(self): def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -111,13 +111,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/base.py b/opencompass/models/base.py index 519b98ba2..742361346 100644 --- a/opencompass/models/base.py +++ b/opencompass/models/base.py @@ -129,7 +129,7 @@ def parse_template(self, prompt_template: PromptType, mode: str) -> str: applicable. Args: - prompt_template (List[str or PromptList]): A prompt + prompt_template (List[PromptType]): A prompt template (potentially before being wrapped by meta template). mode (str): Parsing mode. Choices are 'ppl' and 'gen'. @@ -266,7 +266,7 @@ def parse_template(self, prompt_template: PromptType, mode: str) -> str: applicable. Args: - prompt_template (List[str or PromptList]): A prompt + prompt_template (List[PromptType]): A prompt template (potentially before being wrapped by meta template). mode (str): Parsing mode. Choices are 'ppl' and 'gen'. 
diff --git a/opencompass/models/base_api.py b/opencompass/models/base_api.py index 3c1e12522..96b75b085 100644 --- a/opencompass/models/base_api.py +++ b/opencompass/models/base_api.py @@ -60,7 +60,7 @@ def generate(self, inputs: List[PromptType], """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -111,7 +111,7 @@ def get_ppl(self, """Get perplexity scores given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings. + inputs (List[PromptType]): A list of strings. mask_length (Optional[List[int]]): A list of mask lengths. If provided, the perplexity scores will be calculated with the first mask_length[i] tokens masked out. It's okay to skip @@ -200,12 +200,12 @@ def parse_template(self, prompt_template: PromptType, {'role': 'user', 'prompt': '...'}). Args: - prompt_template (List[str or PromptList]): An intermidate prompt + prompt_template (List[PromptType]): An intermidate prompt template (potentially before being wrapped by meta template). mode (str): Parsing mode. Choices are 'ppl' and 'gen'. Returns: - List[str or PromptList]: The finalized prompt or a conversation. + List[PromptType]: The finalized prompt or a conversation. """ assert isinstance(prompt_template, (str, list, PromptList, tuple)) diff --git a/opencompass/models/bytedance_api.py b/opencompass/models/bytedance_api.py index 6c7239663..45248ca99 100644 --- a/opencompass/models/bytedance_api.py +++ b/opencompass/models/bytedance_api.py @@ -64,13 +64,13 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -87,13 +87,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/claude_api/claude_api.py b/opencompass/models/claude_api/claude_api.py index 542afab59..98421b430 100644 --- a/opencompass/models/claude_api/claude_api.py +++ b/opencompass/models/claude_api/claude_api.py @@ -52,13 +52,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -74,13 +74,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. 
The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index bbb2c4278..5779d3d9d 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -58,13 +58,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -81,13 +81,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -234,13 +234,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py index 0a3a38b9f..3974ad526 100644 --- a/opencompass/models/huggingface.py +++ b/opencompass/models/huggingface.py @@ -723,7 +723,7 @@ def __init__(self, self.num_extra_tokens = num_extra_tokens def generate(self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, skip_overlength=False, **kwargs) -> str: diff --git a/opencompass/models/hunyuan_api.py b/opencompass/models/hunyuan_api.py new file mode 100644 index 000000000..07b26252b --- /dev/null +++ b/opencompass/models/hunyuan_api.py @@ -0,0 +1,121 @@ +import json +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class Hunyuan(BaseAPIModel): + + def __init__( + self, + path: str, + secret_id: str, + secret_key: str, + endpoint: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): # noqa E125 + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + ) + + self.secret_id = secret_id + self.secret_key = secret_key + self.endpoint = endpoint + + from tencentcloud.common import credential + from tencentcloud.common.common_client import CommonClient + from tencentcloud.common.profile.client_profile import ClientProfile + from tencentcloud.common.profile.http_profile import HttpProfile + + cred = credential.Credential(self.secret_id, self.secret_key) + httpProfile = HttpProfile() + httpProfile.endpoint = self.endpoint + clientProfile = ClientProfile() + clientProfile.httpProfile = httpProfile + self.client = CommonClient('hunyuan', + '2023-09-01', + cred, + 'ap-beijing', + profile=clientProfile) + + def 
generate(self, + inputs: List[PromptType], + max_out_len: int = 512) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate(self, input: PromptType, max_out_len: int = 512) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + for item in input: + msg = {'Content': item['prompt']} + if item['role'] == 'HUMAN': + msg['Role'] = 'user' + elif item['role'] == 'BOT': + msg['Role'] = 'assistant' + messages.append(msg) + + from tencentcloud.common.exception.tencent_cloud_sdk_exception import \ + TencentCloudSDKException + + data = {'Messages': messages} + + for _ in range(self.retry): + try: + resp = self.client.call_sse('ChatPro', data) + contents = [] + for event in resp: + part = json.loads(event['data']) + contents.append(part['Choices'][0]['Delta']['Content']) + answer = ''.join(contents) + + except TencentCloudSDKException as err: + print(err) + + print(answer) + return answer + + raise RuntimeError(f'Failed to respond in {self.retry} retrys') diff --git a/opencompass/models/llama2.py b/opencompass/models/llama2.py index 9cdde9665..47f932f2e 100644 --- a/opencompass/models/llama2.py +++ b/opencompass/models/llama2.py @@ -199,7 +199,7 @@ def _load_tokenizer(self, tokenizer_path: str): self.tokenizer = Tokenizer(tokenizer_path) def generate(self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, temperature: float = 0.6) -> str: """Generate response from input prompt. diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py index 506e43bd3..814c3cc68 100644 --- a/opencompass/models/lmdeploy_pytorch.py +++ b/opencompass/models/lmdeploy_pytorch.py @@ -124,13 +124,13 @@ def wait(self): def _generate(self, generator, session_id, - prompt: str or PromptList, + prompt: PromptType, gen_config=None, end_str: Optional[str] = None) -> str: """Generate results given a list of inputs. Args: - prompt (str or PromptList): A string or PromptDict. + prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. gen_config (EngineGenerationConfig, optional): Generation diff --git a/opencompass/models/minimax_api.py b/opencompass/models/minimax_api.py index 96914ee63..c069dc97f 100644 --- a/opencompass/models/minimax_api.py +++ b/opencompass/models/minimax_api.py @@ -60,13 +60,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. 
max_out_len (int): The maximum length of the output. @@ -83,13 +83,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in Test' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/mistral_api.py b/opencompass/models/mistral_api.py new file mode 100644 index 000000000..8c01e5a4d --- /dev/null +++ b/opencompass/models/mistral_api.py @@ -0,0 +1,123 @@ +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class Mistral(BaseAPIModel): + + def __init__( + self, + path: str, + api_key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): # noqa E125 + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + ) + + self.api_key = api_key + self.url = url + self.model = path + + def generate(self, + inputs: List[PromptType], + max_out_len: int = 512) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate(self, input: PromptType, max_out_len: int = 512) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + for item in input: + msg = {'content': item['prompt']} + if item['role'] == 'HUMAN': + msg['role'] = 'user' + elif item['role'] == 'BOT': + msg['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + msg['role'] = 'system' + messages.append(msg) + messages[-1]['role'] = 'user' + + data = { + 'model': self.path, + 'messages': messages, + } + + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'Authorization': f'Bearer {self.api_key}', + } + + from pprint import pprint + print('-' * 128) + pprint(data) + + for _ in range(self.retry): + try: + response = requests.post(self.url, json=data, headers=headers) + except Exception as e: + print(e) + continue + try: + response = response.json() + except Exception as e: + print(e) + continue + print('=' * 128) + pprint(response) + try: + msg = response['choices'][0]['message']['content'] + except Exception as e: + print(e) + continue + return msg + + raise RuntimeError(f'Failed to respond in {self.retry} retrys') diff --git a/opencompass/models/moonshot_api.py b/opencompass/models/moonshot_api.py index 529953819..a54f48352 100644 --- a/opencompass/models/moonshot_api.py +++ b/opencompass/models/moonshot_api.py @@ -55,13 +55,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -78,13 +78,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
@@ -98,29 +98,27 @@ def _generate( messages = [{'role': 'user', 'content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' - - messages.append(msg) - - system = { - 'role': 'system', - 'content': self.system_prompt - # '你是 Kimi,由 Moonshot AI 提供的人工智能助手,你更擅长中文和英文的对话。' - # '你会为用户提供安全,有帮助,准确的回答。同时,你会拒绝一些涉及恐怖主义,种族歧视,' - # '黄色暴力等问题的回答。Moonshot AI 为专有名词,不可翻译成其他语言。' - } - - messages.insert(0, system) - - data = { - 'model': self.model, - 'messages': messages, - } + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) + + data = {'model': self.model, 'messages': messages} max_num_retries = 0 while max_num_retries < self.retry: diff --git a/opencompass/models/nanbeige_api.py b/opencompass/models/nanbeige_api.py index d39c34bff..436b9b640 100644 --- a/opencompass/models/nanbeige_api.py +++ b/opencompass/models/nanbeige_api.py @@ -52,13 +52,13 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -75,13 +75,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index a36670942..463127bbd 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -103,14 +103,14 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, temperature: float = 0.7, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -132,12 +132,12 @@ def generate( [temperature] * len(inputs))) return results - def _generate(self, input: str or PromptList, max_out_len: int, + def _generate(self, input: PromptType, max_out_len: int, temperature: float) -> str: """Generate results given a list of inputs. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
@@ -207,6 +207,7 @@ def _generate(self, input: str or PromptList, max_out_len: int, header = { 'Authorization': f'Bearer {key}', 'content-type': 'application/json', + 'api-key': key, } if self.orgs: @@ -239,6 +240,7 @@ def _generate(self, input: str or PromptList, max_out_len: int, self.logger.error('JsonDecode error, got', str(raw_response.content)) continue + self.logger.error(str(response)) try: if self.logprobs: return response['choices'] @@ -247,13 +249,16 @@ def _generate(self, input: str or PromptList, max_out_len: int, except KeyError: if 'error' in response: if response['error']['code'] == 'rate_limit_exceeded': - time.sleep(1) + time.sleep(10) self.logger.warn('Rate limit exceeded, retrying...') continue elif response['error']['code'] == 'insufficient_quota': self.invalid_keys.add(key) self.logger.warn(f'insufficient_quota key: {key}') continue + elif response['error']['code'] == 'invalid_prompt': + self.logger.warn('Invalid prompt:', str(input)) + return '' self.logger.error('Find error message in response: ', str(response['error'])) @@ -363,12 +368,12 @@ def __init__(self, 'content-type': 'application/json', } - def _generate(self, input: str or PromptList, max_out_len: int, + def _generate(self, input: PromptType, max_out_len: int, temperature: float) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/pangu_api.py b/opencompass/models/pangu_api.py index 8ef69218e..d3153c3d1 100644 --- a/opencompass/models/pangu_api.py +++ b/opencompass/models/pangu_api.py @@ -67,13 +67,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -117,13 +117,13 @@ def _get_token(self): def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py index 0f15edc3d..68f8dae86 100644 --- a/opencompass/models/qwen_api.py +++ b/opencompass/models/qwen_api.py @@ -48,13 +48,13 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -71,13 +71,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. 
The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -103,16 +103,26 @@ def _generate( messages = [{'role': 'user', 'content': input}] else: messages = [] - for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' + msg_buffer, last_role = [], None + for index, item in enumerate(input): + if index == 0 and item['role'] == 'SYSTEM': + role = 'system' elif item['role'] == 'BOT': - msg['role'] = 'assistant' - elif item['role'] == 'SYSTEM': - msg['role'] = 'system' - - messages.append(msg) + role = 'assistant' + else: + role = 'user' + if role != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = role + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) data = {'messages': messages} data.update(self.generation_kwargs) @@ -142,6 +152,8 @@ def _generate( if response.status_code == 200: try: msg = response.output.text + print('=' * 128) + print(msg) return msg except KeyError: print(response) @@ -153,6 +165,8 @@ def _generate( time.sleep(2) continue if response.status_code == 400: + print('=' * 128) + print(response) msg = 'Output data may contain inappropriate content.' return msg diff --git a/opencompass/models/sensetime_api.py b/opencompass/models/sensetime_api.py index 80c946fd1..fd56e983e 100644 --- a/opencompass/models/sensetime_api.py +++ b/opencompass/models/sensetime_api.py @@ -61,13 +61,13 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -84,13 +84,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
@@ -114,7 +114,8 @@ def _generate( messages.append(msg) data = {'messages': messages, 'model': self.model} - data.update(self.params) + if self.params is not None: + data.update(self.params) stream = data['stream'] @@ -123,10 +124,14 @@ def _generate( self.acquire() max_num_retries += 1 - raw_response = requests.request('POST', - url=self.url, - headers=self.headers, - json=data) + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception: + time.sleep(1) + continue requests_id = raw_response.headers['X-Request-Id'] # noqa self.release() diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index 9a4023e00..50c3e5ca6 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -1,3 +1,4 @@ +import copy from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -70,11 +71,10 @@ def __init__(self, self.gen_config = gen_config self.end_str = end_str - def generate( - self, - inputs: List[str], - max_out_len: int = 512, - ) -> List[str]: + def generate(self, + inputs: List[str], + max_out_len: int = 512, + **kwargs) -> List[str]: """Generate results given a list of inputs. Args: @@ -93,6 +93,15 @@ def generate( inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) ] + gen_config = copy.deepcopy(self.gen_config) + if 'do_sample' in kwargs: + if kwargs['do_sample']: + gen_config.top_k = 1000 + gen_config.temperature = kwargs.get('temperature', 1) + else: + gen_config.top_k = 1 + gen_config.temperature = 0.01 + results = [] for batch_input in batch_inputs: with ThreadPoolExecutor() as executor: @@ -103,7 +112,7 @@ def generate( self.generator_ids[:len(batch_input)], batch_input, [max_out_len] * len(batch_input), - [self.gen_config] * len(batch_input), + [gen_config] * len(batch_input), [self.end_str] * len(batch_input), )) results += _results @@ -123,14 +132,14 @@ def wait(self): def _generate(self, generator, session_id, - prompt: str or PromptList, + prompt: PromptType, max_out_len: int, gen_config=None, end_str: Optional[str] = None) -> str: """Generate results given a list of inputs. Args: - prompt (str or PromptList): A string or PromptDict. + prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
@@ -189,3 +198,22 @@ def get_ppl(self, results.append(res) results = np.concatenate(results) return results + + def get_loglikelihood( + self, + inputs: List[str], + conts: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + assert isinstance( + inputs, List), f'List(str) is expected, but got {type(inputs)}' + results = [] + for text, cont in zip(inputs, conts): + input_ids = self.tokenizer.encode(text) + res = self.generators[0].get_ppl(input_ids) + logit_sum = res * len(input_ids) + input_ids = self.tokenizer.encode(text.replace(cont, '')) + res = self.generators[0].get_ppl(input_ids) + logit_part = res * len(input_ids) + results.append(-(logit_sum - logit_part)) + results = np.concatenate(results) + return results diff --git a/opencompass/models/turbomind_api.py b/opencompass/models/turbomind_api.py index 75db216e0..a4efd367e 100644 --- a/opencompass/models/turbomind_api.py +++ b/opencompass/models/turbomind_api.py @@ -60,14 +60,14 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, temperature: float = 1.0, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -102,12 +102,12 @@ def wait(self): """ return self.token_bucket.get_token() - def _generate(self, prompt: str or PromptList, max_out_len: int, + def _generate(self, prompt: PromptType, max_out_len: int, temperature: float, end_str: str) -> str: """Generate results given a list of inputs. Args: - prompt (str or PromptList): A string or PromptDict. + prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. diff --git a/opencompass/models/turbomind_tis.py b/opencompass/models/turbomind_tis.py index c6a411c6c..8541b9de5 100644 --- a/opencompass/models/turbomind_tis.py +++ b/opencompass/models/turbomind_tis.py @@ -58,14 +58,14 @@ def __init__( def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, temperature: float = 1.0, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -96,12 +96,12 @@ def wait(self): """ return self.token_bucket.get_token() - def _generate(self, prompt: str or PromptList, max_out_len: int, + def _generate(self, prompt: PromptType, max_out_len: int, temperature: float) -> str: """Generate results given a list of inputs. Args: - prompt (str or PromptList): A string or PromptDict. + prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. 
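The get_loglikelihood method added to TurboMindModel earlier in this diff recovers a continuation's log-probability from two per-token PPL calls: the negative log-likelihood of the full text minus that of the text with the continuation stripped. A minimal standalone sketch of that arithmetic; encode and ppl are toy stand-ins, not the real tokenizer or lmdeploy generator:

# Minimal sketch of the arithmetic used by get_loglikelihood above; encode
# and ppl are toy stand-ins, not the real tokenizer or lmdeploy generator.
def cont_loglikelihood(text, cont, encode, ppl):
    full_ids = encode(text)
    nll_full = ppl(full_ids) * len(full_ids)  # total NLL over the full text
    prefix_ids = encode(text.replace(cont, ''))
    nll_prefix = ppl(prefix_ids) * len(prefix_ids)  # NLL without the continuation
    # log P(cont | prefix) = -(NLL_full - NLL_prefix)
    return -(nll_full - nll_prefix)


# Toy check: whitespace "tokenizer" and a constant per-token NLL of 1.0,
# so the result is minus the number of tokens in the continuation.
print(cont_loglikelihood('the answer is 42', ' 42',
                         encode=lambda s: s.split(),
                         ppl=lambda ids: 1.0))  # -> -1.0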
diff --git a/opencompass/models/unigpt_api.py b/opencompass/models/unigpt_api.py new file mode 100644 index 000000000..033abb8d1 --- /dev/null +++ b/opencompass/models/unigpt_api.py @@ -0,0 +1,147 @@ +import hashlib +import time +import uuid +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +def get_sign(appkey, udid, timestamp, secret): + original_str = f'{appkey}{udid}{timestamp}{secret}' + sign = '' + try: + md = hashlib.sha256() + md.update(original_str.encode('utf-8')) + bytes_result = md.digest() + for byte in bytes_result: + hex_value = format(byte, '02X') + sign += hex_value.upper() + except Exception as e: + print(e) + return sign + + +class UniGPT(BaseAPIModel): + + def __init__( + self, + path: str, + appkey: str, + secret: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + temperature: float = 0.2, + ): # noqa E125 + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + ) + + self.appkey = appkey + self.secret = secret + self.udid = str(uuid.uuid1()) + self.url = url + self.model = path + self.temperature = temperature + + def generate(self, + inputs: List[PromptType], + max_out_len: int = 512) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate(self, input: PromptType, max_out_len: int = 512) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+        """
+
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                if item['role'] == 'HUMAN':
+                    msg['role'] = 'user'
+                elif item['role'] == 'BOT':
+                    msg['role'] = 'assistant'
+                elif item['role'] == 'SYSTEM':
+                    msg['role'] = 'system'
+                messages.append(msg)
+
+        data = {
+            'model': self.path,
+            'temperature': self.temperature,
+            'messages': messages,
+            'max_tokens': max_out_len,
+        }
+
+        timestamp = str(int(time.time()) * 1000)
+        headers = {
+            'appkey': self.appkey,
+            'sign': get_sign(self.appkey, self.udid, timestamp, self.secret),
+            'stream': 'false',
+            'timestamp': timestamp,
+            'udid': self.udid,
+            'censor': 'none',
+        }
+
+        for _ in range(self.retry):
+            try:
+                response = requests.post(self.url, json=data, headers=headers)
+            except Exception as e:
+                print(e)
+                continue
+            if response is None or response.status_code != 200:
+                code = response.status_code if response else -1
+                print(f'request err, status_code: {code}')
+                time.sleep(10)
+                continue
+            try:
+                response = response.json()
+            except Exception as e:
+                print(e)
+                continue
+            print(response)
+            if response.get('errorCode') == '8500502':
+                return 'context_length_exceeded'
+            return response['result']['choices'][0]['message']['content']
+        raise RuntimeError(f'Failed to respond in {self.retry} retries')
diff --git a/opencompass/models/xunfei_api.py b/opencompass/models/xunfei_api.py
index 0e1de20e5..ee75ca124 100644
--- a/opencompass/models/xunfei_api.py
+++ b/opencompass/models/xunfei_api.py
@@ -98,13 +98,13 @@ def get_url(self):
 
     def generate(
         self,
-        inputs: List[str or PromptList],
+        inputs: List[PromptType],
         max_out_len: int = 512,
     ) -> List[str]:
         """Generate results given a list of inputs.
 
         Args:
-            inputs (List[str or PromptList]): A list of strings or PromptDicts.
+            inputs (List[PromptType]): A list of strings or PromptDicts.
                 The PromptDict should be organized in OpenCompass'
                 API format.
             max_out_len (int): The maximum length of the output.
@@ -121,13 +121,13 @@ def generate(
 
     def _generate(
         self,
-        input: str or PromptList,
+        input: PromptType,
         max_out_len: int = 512,
     ) -> List[str]:
         """Generate results given an input.
 
         Args:
-            inputs (str or PromptList): A string or PromptDict.
+            inputs (PromptType): A string or PromptDict.
                 The PromptDict should be organized in OpenCompass'
                 API format.
             max_out_len (int): The maximum length of the output.
diff --git a/opencompass/models/yayi_api.py b/opencompass/models/yayi_api.py
new file mode 100644
index 000000000..73ba3ea0e
--- /dev/null
+++ b/opencompass/models/yayi_api.py
@@ -0,0 +1,261 @@
+import base64
+import hashlib
+import hmac
+import random
+import string
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from opencompass.utils.prompt import PromptList
+
+from .base_api import BaseAPIModel
+
+PromptType = Union[PromptList, str]
+
+
+def generate_random_string(length=16):
+    """Generate a random string.
+
+    :param length: length of the random string, defaults to 16
+    :return: the random string
+    """
+    letters = string.ascii_letters + string.digits
+    rand_str = ''.join(random.choice(letters) for i in range(length))
+    return rand_str
+
+
+def get_current_time(format='%Y-%m-%d %H:%M:%S'):
+    """Get the current time.
+
+    :param format: time format, defaults to '%Y-%m-%d %H:%M:%S'
+    :return: the current time as a string
+    """
+    now = datetime.now()
+    time_str = now.strftime(format)
+    return time_str
+
+
+def get_current_timestamp():
+    """Get the current timestamp in milliseconds.
+
+    :return: the timestamp as a string
+    """
+    timestamp_str = int(round(time.time() * 1000))
+    return str(timestamp_str)
+
+
+def encode_base64_string(s):
+    """Base64-encode a byte string.
+
+    :param s: the bytes to encode
+    :return: the encoded string
+    """
+    encoded = base64.b64encode(s).decode()
+    return encoded
+
+
+def get_current_time_gmt_format():
+    """Get the current time as a GMT-style string.
+
+    :return: the formatted time string
+    """
+    GMT_FORMAT = '%a, %d %b %Y %H:%M:%SGMT+00:00'
+    now = datetime.now()
+    time_str = now.strftime(GMT_FORMAT)
+    return time_str
+
+
+class Yayi(BaseAPIModel):
+    """Model wrapper around Yayi.
+
+    Args:
+        path (str): The name of the Yayi model.
+        url (str): The request URL of the API.
+        url_path (str): The URL path used when signing requests.
+        x_tilake_app_key (str): Application key used for authentication.
+        x_tilake_app_secret (str): Application secret used to sign requests.
+        x_tilake_ca_sginature_method (str): Signature method declared in the
+            request headers.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 2.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+        temperature (float): Sampling temperature. Defaults to 0.4.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        url: str,
+        url_path: str,
+        x_tilake_app_key: str,
+        x_tilake_app_secret: str,
+        x_tilake_ca_sginature_method: str,
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+        temperature: float = 0.4,
+    ):
+        super().__init__(
+            path=path,
+            max_seq_len=max_seq_len,
+            query_per_second=query_per_second,
+            meta_template=meta_template,
+            retry=retry,
+        )
+
+        self.url = url
+        self.url_path = url_path
+        self.X_TILAKE_APP_KEY = x_tilake_app_key
+        self.X_TILAKE_APP_SECRET = x_tilake_app_secret
+        self.X_TILAKE_CA_SGINATURE_METHOD = x_tilake_ca_sginature_method
+        self.temperature = temperature
+        self.model = path
+
+    def generate_signature(self, method, accept, content_type, date, url_path):
+        """Generate the request signature.
+
+        :param method: HTTP method, e.g. 'POST'.
+        :param accept: Accept header value.
+        :param content_type: Content-Type header value.
+        :param date: request date string.
+        :param url_path: URL path used for signing.
+        :return: the Base64-encoded HMAC-SHA256 signature.
+        """
+        string_to_sign = (method + '\n' + accept + '\n' + content_type + '\n' +
+                          date + '\n' + url_path)
+        string_to_sign = string_to_sign.encode('utf-8')
+        secret_key = self.X_TILAKE_APP_SECRET.encode('utf-8')
+        signature = hmac.new(secret_key, string_to_sign,
+                             hashlib.sha256).digest()
+        return encode_base64_string(signature)
+
+    def generate_header(self, content_type, accept, date, signature):
+        """Build the request headers.
+
+        :param content_type: Content-Type header value.
+        :param accept: Accept header value.
+        :param date: request date string.
+        :param signature: signature returned by generate_signature.
+        :return: the header dict.
+        """
+        headers = {
+            'x-tilake-app-key': self.X_TILAKE_APP_KEY,
+            'x-tilake-ca-signature-method': self.X_TILAKE_CA_SGINATURE_METHOD,
+            'x-tilake-ca-timestamp': get_current_timestamp(),
+            'x-tilake-ca-nonce': generate_random_string(),
+            'x-tilake-ca-signature': signature,
+            'Date': date,
+            'Content-Type': content_type,
+            'Accept': accept,
+        }
+        return headers
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[PromptType]): A list of strings or PromptDicts.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [max_out_len] * len(inputs)))
+        self.flush()
+        return results
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        """Generate results given an input.
+
+        Args:
+            inputs (PromptType): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            str: The generated string.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            messages = []
+            msg_buffer, last_role = [], None
+            for item in input:
+                item['role'] = 'yayi' if item['role'] == 'BOT' else 'user'
+                if item['role'] != last_role and last_role is not None:
+                    messages.append({
+                        'content': '\n'.join(msg_buffer),
+                        'role': last_role
+                    })
+                    msg_buffer = []
+                msg_buffer.append(item['prompt'])
+                last_role = item['role']
+            messages.append({
+                'content': '\n'.join(msg_buffer),
+                'role': last_role
+            })
+
+        date = get_current_time_gmt_format()
+        content_type = 'application/json'
+        accept = '*/*'
+        method = 'POST'
+        data = {
+            'id': '001',  # Request id; no need to change.
+            'model': self.model,
+            'messages': messages,
+            'max_new_tokens': max_out_len,  # max_new_tokens and the params below can be tuned per task.
+            'temperature': self.temperature,
+            'presence_penalty': 0.85,
+            'frequency_penalty': 0.16,
+            'do_sample': True,
+            'top_p': 1.0,
+            'top_k': -1,
+        }
+
+        for _ in range(self.retry):
+            signature_str = self.generate_signature(method=method,
+                                                    accept=accept,
+                                                    content_type=content_type,
+                                                    date=date,
+                                                    url_path=self.url_path)
+            headers = self.generate_header(content_type=content_type,
+                                           accept=accept,
+                                           date=date,
+                                           signature=signature_str)
+
+            try:
+                response = requests.post(self.url, json=data, headers=headers)
+            except Exception as e:
+                print(e)
+                continue
+            try:
+                response = response.json()
+            except Exception as e:
+                print(e)
+                continue
+            print(response)
+            try:
+                return response['data']['choices'][0]['message']['content']
+            except Exception as e:
+                print(e)
+                continue
+
+        raise RuntimeError(f'Failed to respond in {self.retry} retries')
diff --git a/opencompass/models/zhipuai_api.py b/opencompass/models/zhipuai_api.py
index dd7c4d706..cca4f71ed 100644
--- a/opencompass/models/zhipuai_api.py
+++ b/opencompass/models/zhipuai_api.py
@@ -44,13 +44,13 @@ def __init__(
 
     def generate(
         self,
-        inputs: List[str or PromptList],
+        inputs: List[PromptType],
         max_out_len: int = 512,
     ) -> List[str]:
         """Generate results given a list of inputs.
 
         Args:
-            inputs (List[str or PromptList]): A list of strings or PromptDicts.
+            inputs (List[PromptType]): A list of strings or PromptDicts.
                 The PromptDict should be organized in OpenCompass'
                 API format.
             max_out_len (int): The maximum length of the output.
@@ -67,13 +67,13 @@ def generate(
 
     def _generate(
         self,
-        input: str or PromptList,
+        input: PromptType,
         max_out_len: int = 512,
     ) -> str:
         """Generate results given an input.
 
         Args:
-            inputs (str or PromptList): A string or PromptDict.
+            inputs (PromptType): A string or PromptDict.
                 The PromptDict should be organized in OpenCompass'
                 API format.
             max_out_len (int): The maximum length of the output.
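For reference, the request signing used by the Yayi wrapper above is a plain HMAC-SHA256 over the newline-joined method, accept, content type, date and URL path, Base64-encoded. A standalone sketch of that scheme; the secret, date and url_path values below are placeholders:

# Standalone sketch of the Yayi request signing shown above; the secret,
# date and url_path values are placeholders.
import base64
import hashlib
import hmac


def sign_request(secret, method, accept, content_type, date, url_path):
    string_to_sign = '\n'.join([method, accept, content_type, date, url_path])
    digest = hmac.new(secret.encode('utf-8'),
                      string_to_sign.encode('utf-8'),
                      hashlib.sha256).digest()
    return base64.b64encode(digest).decode()


print(sign_request('dummy-secret', 'POST', '*/*', 'application/json',
                   'Mon, 01 Jan 2024 00:00:00GMT+00:00', '/v1/chat/completions'))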
diff --git a/opencompass/models/zhipuai_v2_api.py b/opencompass/models/zhipuai_v2_api.py index 9d1f19d4c..3b7afa02b 100644 --- a/opencompass/models/zhipuai_v2_api.py +++ b/opencompass/models/zhipuai_v2_api.py @@ -2,8 +2,6 @@ from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union -from httpx import ProxyError - from opencompass.utils.prompt import PromptList from .base_api import BaseAPIModel @@ -59,13 +57,13 @@ def __init__(self, def generate( self, - inputs: List[str or PromptList], + inputs: List[PromptType], max_out_len: int = 512, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. + inputs (List[PromptType]): A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -82,13 +80,13 @@ def generate( def _generate( self, - input: str or PromptList, + input: PromptType, max_out_len: int = 512, ) -> str: """Generate results given an input. Args: - inputs (str or PromptList): A string or PromptDict. + inputs (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -103,6 +101,8 @@ def _generate( else: messages = [] for item in input: + if not item['prompt']: + continue msg = {'content': item['prompt']} if item['role'] == 'HUMAN': msg['role'] = 'user' @@ -115,11 +115,15 @@ def _generate( data = {'model': self.model, 'messages': messages} data.update(self.generation_kwargs) + from pprint import pprint + print('-' * 128) + pprint(data) max_num_retries = 0 while max_num_retries < self.retry: self.acquire() response = None + from httpx import ProxyError try: response = self.client.chat.completions.create(**data) @@ -161,6 +165,8 @@ def _generate( # msg = response['data']['choices'][0]['content'] else: msg = response.choices[0].message.content + print('=' * 128) + print(msg) return msg # sensitive content, prompt overlength, network error # or illegal prompt diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 05477692c..79de767e9 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -120,7 +120,7 @@ def score(self, meta: Optional[bool] = False, infer_order: Optional[str] = 'random') -> Dict: dup_indices = [] - if type(predictions) == list: + if isinstance(predictions, list): """Apply to multi-model comparison.""" references = [{} for _ in range(len(predictions[0]['model_preds'])) ] if references is None else references @@ -137,7 +137,7 @@ def score(self, if len(set(check)) == 1: dup_indices.append(i) - elif type(predictions) == dict: + elif isinstance(predictions, dict): """Apply to single-model scoring.""" references = [{} for _ in range(len(predictions[0]['model_preds'])) ] if references is None else references diff --git a/opencompass/openicl/icl_prompt_template.py b/opencompass/openicl/icl_prompt_template.py index 5741b9325..db2dcc911 100644 --- a/opencompass/openicl/icl_prompt_template.py +++ b/opencompass/openicl/icl_prompt_template.py @@ -77,7 +77,7 @@ def generate_ice_item(self, entry: Dict, label: Hashable) -> PromptType: label (:obj:`Hashable`): The value of the output field. Returns: - str or PromptList: The generated in-context example. + PromptType: The generated in-context example. 
""" # Select the corresponding template if isinstance(self.template, str) or self.prompt_type == 'meta': @@ -114,7 +114,7 @@ def generate_label_prompt_item(self, entry (:obj:`Dict`): A piece of data containing the input field content. - ice (str or PromptList): The generated in-context example. + ice (PromptType): The generated in-context example. label (:obj:`Hashable`): The value of the output field. remain_sep (:obj:`bool`): If remain sep_token @@ -165,7 +165,7 @@ def generate_item( the :obj:`ice_token`. Defaults to ``''``. Returns: - str or PromptList: The generated item. + PromptType: The generated item. """ template = None if isinstance(self.template, str): @@ -220,7 +220,7 @@ def _encode_template(self, prompt_template: Union[List[Union[str, Dict]], examples. Returns: - str or PromptList: The encoded template. + PromptType: The encoded template. """ if isinstance(prompt_template, str): return prompt_template diff --git a/opencompass/registry.py b/opencompass/registry.py index c7de6b7f0..ceddef834 100644 --- a/opencompass/registry.py +++ b/opencompass/registry.py @@ -1,7 +1,21 @@ +from typing import Callable, List, Optional, Type, Union + from mmengine.registry import DATASETS as MMENGINE_DATASETS from mmengine.registry import METRICS as MMENGINE_METRICS from mmengine.registry import MODELS as MMENGINE_MODELS -from mmengine.registry import Registry +from mmengine.registry import Registry as OriginalRegistry + + +class Registry(OriginalRegistry): + + # override the default force behavior + def register_module( + self, + name: Optional[Union[str, List[str]]] = None, + force: bool = True, + module: Optional[Type] = None) -> Union[type, Callable]: + return super().register_module(name, force, module) + PARTITIONERS = Registry('partitioner', locations=['opencompass.partitioners']) RUNNERS = Registry('runner', locations=['opencompass.runners']) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 208c92e5d..799248419 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -118,6 +118,7 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): conda_env_name = self.aliyun_cfg['conda_env_name'] shell_cmd = (f'source {bashrc_path}; ' f'conda activate {conda_env_name}; ') + shell_cmd += f'export PYTHONPATH={pwd}:$PYTHONPATH; ' else: # using public conda env # users can also set `python_env_path` to their @@ -151,6 +152,11 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): if hf_endpoint is not None: shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; ' + extra_envs = self.aliyun_cfg.get('extra_envs') + if extra_envs is not None: + for extra_env in extra_envs: + shell_cmd += f'export {extra_env}; ' + shell_cmd += f'cd {pwd}; ' shell_cmd += '{task_cmd}' @@ -161,9 +167,9 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): f" -c {self.aliyun_cfg['dlc_config_path']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}" ' --worker_count 1' - f' --worker_cpu {max(num_gpus * 8, 32)}' + f' --worker_cpu {max(num_gpus * 8, 12)}' f' --worker_gpu {num_gpus}' - f' --worker_memory {max(num_gpus * 128, 256)}' + f' --worker_memory {max(num_gpus * 128, 192)}' f" --worker_image {self.aliyun_cfg['worker_image']}") get_cmd = partial(task.get_command, cfg_path=param_file, @@ -185,14 +191,25 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): time.sleep(random.randint(0, 10)) def _run_within_retry(): - output = subprocess.getoutput(cmd) - match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', 
output) - if match is None: - raise RuntimeError( - f'Failed to launch dlc job for {output}') + num_retry_to_start = 5 + index_to_start = 0 + while index_to_start < num_retry_to_start: + index_to_start += 1 + output = subprocess.getoutput(cmd) + match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output) + if match is None: + stdout.write('Failed to get job id from output:') + stdout.write(output) + if index_to_start < num_retry_to_start: + stdout.write(f'Retry #{index_to_start} starting') + time.sleep(2) + continue + else: + job_id = match.group(1) + stdout.write(output) + break else: - job_id = match.group(1) - stdout.write(output) + raise RuntimeError(f'Cannot get job id from {output}') pod_create_time = None pri_time = None @@ -200,7 +217,7 @@ def _run_within_retry(): while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. - for _ in range(5): + for _ in range(20): time.sleep(2) try: job_info = json.loads( diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index e4fe023cf..d8a666060 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -17,7 +17,7 @@ from opencompass.utils.prompt import get_prompt_hash METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match'] -METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len'] +METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'tool_rate'] def model_abbr_from_cfg_used_in_summarizer(model): if model.get('summarizer_abbr', None): diff --git a/opencompass/summarizers/subjective/alignmentbench.py b/opencompass/summarizers/subjective/alignmentbench.py index d114a1578..8831386e7 100644 --- a/opencompass/summarizers/subjective/alignmentbench.py +++ b/opencompass/summarizers/subjective/alignmentbench.py @@ -218,8 +218,9 @@ def get_dimension_results(judged_answers, references, fout, fout_flag, model): dimension_avg_ratings = defaultdict(float) for dimension, total_score in dimension_ratings.items(): - dimension_avg_ratings[ - dimension] = total_score / dimension_counts[dimension] + s = total_score / dimension_counts[dimension] + s = round(s, 2) + dimension_avg_ratings[dimension] = s scores = {model: dimension_avg_ratings} rows = list(scores.keys()) @@ -249,8 +250,9 @@ def get_capability_results(judged_answers, capability_avg_ratings = defaultdict(float) for capability, total_score in capability_ratings.items(): - capability_avg_ratings[ - capability] = total_score / capability_counts[capability] + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s temp_list = [] total_column_num = 2 @@ -260,11 +262,14 @@ def get_capability_results(judged_answers, np.mean(capability_avg_ratings[cat]) for cat in categories[category] ]) + capability_avg_ratings[category + '总分'] = round( + capability_avg_ratings[category + '总分'], 2) temp_list.append(category + '总分') capability_avg_ratings['总分'] = 0 for temp in temp_list: capability_avg_ratings['总分'] += capability_avg_ratings[temp] capability_avg_ratings['总分'] /= len(temp_list) + capability_avg_ratings['总分'] = round(capability_avg_ratings['总分'], 2) scores = {model: capability_avg_ratings} with open(fout, 'a+', newline='') as csvfile: @@ -365,8 +370,10 @@ def summarize(self, print(subdir_path + ' is not exist! 
please check!') if self.judge_type == 'general': with open(fout, 'r') as f: - x = from_csv(f) + x = from_csv(f, delimiter=',') print(x) + print(fout) with open(fout2, 'r') as f: - x = from_csv(f) + x = from_csv(f, delimiter=',') print(x) + print(fout2) diff --git a/opencompass/summarizers/subjective/compass_arena.py b/opencompass/summarizers/subjective/compass_arena.py index e62d260c6..f6e6c246e 100644 --- a/opencompass/summarizers/subjective/compass_arena.py +++ b/opencompass/summarizers/subjective/compass_arena.py @@ -229,4 +229,5 @@ def summarize( for fout in fout_list: with open(fout, 'r') as f: x = from_csv(f) + print(fout) print(x) diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py index 5da2e5388..5867769b1 100644 --- a/opencompass/summarizers/subjective/mtbench.py +++ b/opencompass/summarizers/subjective/mtbench.py @@ -65,8 +65,9 @@ def get_capability_results( capability_avg_ratings = defaultdict(float) for capability, total_score in capability_ratings.items(): - capability_avg_ratings[ - capability] = total_score / capability_counts[capability] + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s columns = list(capability_avg_ratings.keys()) columns.insert(0, columns.pop(columns.index('total'))) with open(fout, 'a+', newline='') as csvfile: @@ -142,5 +143,6 @@ def summarize(self, with open(fout, 'r') as f: x = from_csv(f) print(x) + print(fout) elif self.judge_type == 'pair': super().summarize() diff --git a/opencompass/tasks/openicl_infer.py b/opencompass/tasks/openicl_infer.py index 78e31043f..3810db71a 100644 --- a/opencompass/tasks/openicl_infer.py +++ b/opencompass/tasks/openicl_infer.py @@ -43,9 +43,12 @@ def get_command(self, cfg_path, template): the command. """ script_path = __file__ - has_vllm = ('VLLM' in str(self.model_cfgs[0].get('type', ''))) or \ - 'VLLM' in str(self.model_cfgs[0].get('llm', {}).get('type', '')) - if self.num_gpus > 0 and not has_vllm: + backend_keys = ['VLLM', 'Lmdeploy'] + use_backend = any( + key in str(self.model_cfgs[0].get('type', '')) + or key in str(self.model_cfgs[0].get('llm', {}).get('type', '')) + for key in backend_keys) + if self.num_gpus > 0 and not use_backend: port = random.randint(12000, 32000) command = (f'torchrun --master_port={port} ' f'--nproc_per_node {self.num_procs} ' diff --git a/opencompass/tasks/outer_eval/alpacaeval.py b/opencompass/tasks/outer_eval/alpacaeval.py index 4e7c7146f..006551e2c 100644 --- a/opencompass/tasks/outer_eval/alpacaeval.py +++ b/opencompass/tasks/outer_eval/alpacaeval.py @@ -120,7 +120,10 @@ def get_command(self, cfg_path, template): filename = get_infer_output_path(m_cfg, dataset_cfg, osp.join(work_dir, 'predictions')) output_path = osp.join(work_dir, 'results', m_cfg['abbr']) - command = f'export OPENAI_API_KEY={api_key}; alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}' + command = '' + if api_key is not None: + command += f'export OPENAI_API_KEY={api_key}; ' + command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}' return template.format(task_cmd=command) def run(self): diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py index fe6e00e59..496eec157 100644 --- a/opencompass/utils/prompt.py +++ b/opencompass/utils/prompt.py @@ -73,7 +73,7 @@ def format(self, **kwargs) -> PromptList: Args: src (str): The string to be replaced. 
- dst (str or PromptList): The string or PromptList to replace with. + dst (PromptType): The string or PromptList to replace with. Returns: PromptList: A new PromptList with 'src' replaced by 'dst'. @@ -98,7 +98,7 @@ def replace(self, src: str, dst: Union[str, PromptList]) -> PromptList: Args: src (str): The string to be replaced. - dst (str or PromptList): The string or PromptList to replace with. + dst (PromptType): The string or PromptList to replace with. Returns: PromptList: A new PromptList with 'src' replaced by 'dst'. @@ -139,7 +139,7 @@ def __add__(self, other: Union[str, PromptList]) -> PromptList: """Adds a string or another PromptList to this PromptList. Args: - other (str or PromptList): The string or PromptList to be added. + other (PromptType): The string or PromptList to be added. Returns: PromptList: A new PromptList that is the result of the addition. @@ -156,7 +156,7 @@ def __radd__(self, other: Union[str, PromptList]) -> PromptList: '+' operator. Args: - other (str or PromptList): The string or PromptList to be added. + other (PromptType): The string or PromptList to be added. Returns: PromptList: A new PromptList that is the result of the addition. @@ -172,7 +172,7 @@ def __iadd__(self, other: Union[str, PromptList]) -> PromptList: """Implements in-place addition for the PromptList. Args: - other (str or PromptList): The string or PromptList to be added. + other (PromptType): The string or PromptList to be added. Returns: PromptList: The updated PromptList. diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 0e068b2a0..3f3baa776 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -48,6 +48,19 @@ def match_cfg_file(workdir: str, pattern: Union[str, List[str]]) -> List[str]: return files +def try_fill_in_custom_cfgs(config): + for i, dataset in enumerate(config['datasets']): + if 'type' not in dataset: + config['datasets'][i] = make_custom_dataset_config(dataset) + if 'model_dataset_combinations' not in config: + return config + for mdc in config['model_dataset_combinations']: + for i, dataset in enumerate(mdc['datasets']): + if 'type' not in dataset: + mdc['datasets'][i] = make_custom_dataset_config(dataset) + return config + + def get_config_from_arg(args) -> Config: """Get the config object given args. 
@@ -58,9 +71,7 @@ def get_config_from_arg(args) -> Config: """ if args.config: config = Config.fromfile(args.config, format_python_code=False) - for i, dataset in enumerate(config['datasets']): - if 'type' not in dataset: - config['datasets'][i] = make_custom_dataset_config(dataset) + config = try_fill_in_custom_cfgs(config) return config # parse dataset args if not args.datasets and not args.custom_dataset_path: diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 564078754..968b4c348 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -94,11 +94,11 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str: f'答案是\s?(\S+)(?:。|$)', f'答案应该是\s?(\S+)(?:。|$)', f'答案为\s?(\S+)(?:。|$)', - f'[Tt]he answer is ([{options}])', - f'[Tt]he answer is option ([{options}])', - f'[Tt]he correct answer is ([{options}])', - f'[Tt]he correct answer is option ([{options}])', - f'[Tt]he answer to the question is ([{options}])', + f'[Tt]he answer is \(?([{options}])\)?', + f'[Tt]he answer is option \(?([{options}])\)?', + f'[Tt]he correct answer is \(?([{options}])\)?', + f'[Tt]he correct answer is option \(?([{options}])\)?', + f'[Tt]he answer to the question is \(?([{options}])\)?', f'^选项\s?([{options}])', f'^([{options}])\s?选?项', f'(\s|^)[{options}][\s。,,::\.$]', diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 3f810934b..eb7468fc1 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -21,6 +21,7 @@ OpenCC opencv-python-headless pandas<2.0.0 prettytable +pyext pypinyin python-Levenshtein rank_bm25==0.2.2 diff --git a/run.py b/run.py index 09c901840..491fbf7c2 100644 --- a/run.py +++ b/run.py @@ -1,364 +1,4 @@ -import argparse -import getpass -import os -import os.path as osp -from datetime import datetime - -from mmengine.config import Config, DictAction - -from opencompass.partitioners import MultimodalNaivePartitioner -from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg -from opencompass.runners import SlurmRunner -from opencompass.summarizers import DefaultSummarizer -from opencompass.utils import LarkReporter, get_logger -from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg, - fill_infer_cfg, get_config_from_arg) - - -def parse_args(): - parser = argparse.ArgumentParser(description='Run an evaluation task') - parser.add_argument('config', nargs='?', help='Train config file path') - - # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner - # if "infer" or "eval" not specified - launch_method = parser.add_mutually_exclusive_group() - launch_method.add_argument('--slurm', - action='store_true', - default=False, - help='Whether to force tasks to run with srun. ' - 'If True, `--partition(-p)` must be set. ' - 'Defaults to False') - launch_method.add_argument('--dlc', - action='store_true', - default=False, - help='Whether to force tasks to run on dlc. If ' - 'True, `--aliyun-cfg` must be set. 
Defaults' - ' to False') - # multi-modal support - parser.add_argument('--mm-eval', - help='Whether or not enable multimodal evaluation', - action='store_true', - default=False) - # Add shortcut parameters (models, datasets and summarizer) - parser.add_argument('--models', nargs='+', help='', default=None) - parser.add_argument('--datasets', nargs='+', help='', default=None) - parser.add_argument('--summarizer', help='', default=None) - # add general args - parser.add_argument('--debug', - help='Debug mode, in which scheduler will run tasks ' - 'in the single process, and output will not be ' - 'redirected to files', - action='store_true', - default=False) - parser.add_argument('--dry-run', - help='Dry run mode, in which the scheduler will not ' - 'actually run the tasks, but only print the commands ' - 'to run', - action='store_true', - default=False) - parser.add_argument('-m', - '--mode', - help='Running mode. You can choose "infer" if you ' - 'only want the inference results, or "eval" if you ' - 'already have the results and want to evaluate them, ' - 'or "viz" if you want to visualize the results.', - choices=['all', 'infer', 'eval', 'viz'], - default='all', - type=str) - parser.add_argument('-r', - '--reuse', - nargs='?', - type=str, - const='latest', - help='Reuse previous outputs & results, and run any ' - 'missing jobs presented in the config. If its ' - 'argument is not specified, the latest results in ' - 'the work_dir will be reused. The argument should ' - 'also be a specific timestamp, e.g. 20230516_144254'), - parser.add_argument('-w', - '--work-dir', - help='Work path, all the outputs will be ' - 'saved in this path, including the slurm logs, ' - 'the evaluation results, the summary results, etc.' - 'If not specified, the work_dir will be set to ' - './outputs/default.', - default=None, - type=str) - parser.add_argument( - '--config-dir', - default='configs', - help='Use the custom config directory instead of config/ to ' - 'search the configs for datasets, models and summarizers', - type=str) - parser.add_argument('-l', - '--lark', - help='Report the running status to lark bot', - action='store_true', - default=False) - parser.add_argument('--max-partition-size', - help='The maximum size of an infer task. Only ' - 'effective when "infer" is missing from the config.', - type=int, - default=40000), - parser.add_argument( - '--gen-task-coef', - help='The dataset cost measurement coefficient for generation tasks, ' - 'Only effective when "infer" is missing from the config.', - type=int, - default=20) - parser.add_argument('--max-num-workers', - help='Max number of workers to run in parallel. ' - 'Will be overrideen by the "max_num_workers" argument ' - 'in the config.', - type=int, - default=32) - parser.add_argument('--max-workers-per-gpu', - help='Max task to run in parallel on one GPU. ' - 'It will only be used in the local runner.', - type=int, - default=1) - parser.add_argument( - '--retry', - help='Number of retries if the job failed when using slurm or dlc. 
' - 'Will be overrideen by the "retry" argument in the config.', - type=int, - default=2) - parser.add_argument( - '--dump-eval-details', - help='Whether to dump the evaluation details, including the ' - 'correctness of each sample, bpb, etc.', - action='store_true', - ) - # set srun args - slurm_parser = parser.add_argument_group('slurm_args') - parse_slurm_args(slurm_parser) - # set dlc args - dlc_parser = parser.add_argument_group('dlc_args') - parse_dlc_args(dlc_parser) - # set hf args - hf_parser = parser.add_argument_group('hf_args') - parse_hf_args(hf_parser) - # set custom dataset args - custom_dataset_parser = parser.add_argument_group('custom_dataset_args') - parse_custom_dataset_args(custom_dataset_parser) - args = parser.parse_args() - if args.slurm: - assert args.partition is not None, ( - '--partition(-p) must be set if you want to use slurm') - if args.dlc: - assert os.path.exists(args.aliyun_cfg), ( - 'When launching tasks using dlc, it needs to be configured ' - 'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"' - ' to specify a new path.') - return args - - -def parse_slurm_args(slurm_parser): - """These args are all for slurm launch.""" - slurm_parser.add_argument('-p', - '--partition', - help='Slurm partition name', - default=None, - type=str) - slurm_parser.add_argument('-q', - '--quotatype', - help='Slurm quota type', - default=None, - type=str) - slurm_parser.add_argument('--qos', - help='Slurm quality of service', - default=None, - type=str) - - -def parse_dlc_args(dlc_parser): - """These args are all for dlc launch.""" - dlc_parser.add_argument('--aliyun-cfg', - help='The config path for aliyun config', - default='~/.aliyun.cfg', - type=str) - - -def parse_hf_args(hf_parser): - """These args are all for the quick construction of HuggingFace models.""" - hf_parser.add_argument('--hf-path', type=str) - hf_parser.add_argument('--peft-path', type=str) - hf_parser.add_argument('--tokenizer-path', type=str) - hf_parser.add_argument('--model-kwargs', - nargs='+', - action=DictAction, - default={}) - hf_parser.add_argument('--tokenizer-kwargs', - nargs='+', - action=DictAction, - default={}) - hf_parser.add_argument('--max-out-len', type=int) - hf_parser.add_argument('--max-seq-len', type=int) - hf_parser.add_argument('--no-batch-padding', - action='store_true', - default=False) - hf_parser.add_argument('--batch-size', type=int) - hf_parser.add_argument('--num-gpus', type=int) - hf_parser.add_argument('--pad-token-id', type=int) - - -def parse_custom_dataset_args(custom_dataset_parser): - """These args are all for the quick construction of custom datasets.""" - custom_dataset_parser.add_argument('--custom-dataset-path', type=str) - custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str) - custom_dataset_parser.add_argument('--custom-dataset-data-type', - type=str, - choices=['mcq', 'qa']) - custom_dataset_parser.add_argument('--custom-dataset-infer-method', - type=str, - choices=['gen', 'ppl']) - - -def main(): - args = parse_args() - if args.dry_run: - args.debug = True - # initialize logger - logger = get_logger(log_level='DEBUG' if args.debug else 'INFO') - - cfg = get_config_from_arg(args) - if args.work_dir is not None: - cfg['work_dir'] = args.work_dir - else: - cfg.setdefault('work_dir', './outputs/default/') - - # cfg_time_str defaults to the current time - cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') - if args.reuse: - if args.reuse == 'latest': - if not os.path.exists(cfg.work_dir) or not os.listdir( - 
cfg.work_dir): - logger.warning('No previous results to reuse!') - else: - dirs = os.listdir(cfg.work_dir) - dir_time_str = sorted(dirs)[-1] - else: - dir_time_str = args.reuse - logger.info(f'Reusing experiements from {dir_time_str}') - elif args.mode in ['eval', 'viz']: - raise ValueError('You must specify -r or --reuse when running in eval ' - 'or viz mode!') - - # update "actual" work_dir - cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str) - os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) - - # dump config - output_config_path = osp.join(cfg.work_dir, 'configs', - f'{cfg_time_str}.py') - cfg.dump(output_config_path) - # Config is intentally reloaded here to avoid initialized - # types cannot be serialized - cfg = Config.fromfile(output_config_path, format_python_code=False) - - # report to lark bot if specify --lark - if not args.lark: - cfg['lark_bot_url'] = None - elif cfg.get('lark_bot_url', None): - content = f'{getpass.getuser()}\'s task has been launched!' - LarkReporter(cfg['lark_bot_url']).post(content) - - if args.mode in ['all', 'infer']: - # When user have specified --slurm or --dlc, or have not set - # "infer" in config, we will provide a default configuration - # for infer - if (args.dlc or args.slurm) and cfg.get('infer', None): - logger.warning('You have set "infer" in the config, but ' - 'also specified --slurm or --dlc. ' - 'The "infer" configuration will be overridden by ' - 'your runtime arguments.') - # Check whether run multimodal evaluation - if args.mm_eval: - partitioner = MultimodalNaivePartitioner( - osp.join(cfg['work_dir'], 'predictions/')) - tasks = partitioner(cfg) - exec_mm_infer_runner(tasks, args, cfg) - return - - if args.dlc or args.slurm or cfg.get('infer', None) is None: - fill_infer_cfg(cfg, args) - - if args.partition is not None: - if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner: - cfg.infer.runner.partition = args.partition - cfg.infer.runner.quotatype = args.quotatype - else: - logger.warning('SlurmRunner is not used, so the partition ' - 'argument is ignored.') - if args.debug: - cfg.infer.runner.debug = True - if args.lark: - cfg.infer.runner.lark_bot_url = cfg['lark_bot_url'] - cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'], - 'predictions/') - partitioner = PARTITIONERS.build(cfg.infer.partitioner) - tasks = partitioner(cfg) - if args.dry_run: - return - runner = RUNNERS.build(cfg.infer.runner) - # Add extra attack config if exists - if hasattr(cfg, 'attack'): - for task in tasks: - cfg.attack.dataset = task.datasets[0][0].abbr - task.attack = cfg.attack - runner(tasks) - - # evaluate - if args.mode in ['all', 'eval']: - # When user have specified --slurm or --dlc, or have not set - # "eval" in config, we will provide a default configuration - # for eval - if (args.dlc or args.slurm) and cfg.get('eval', None): - logger.warning('You have set "eval" in the config, but ' - 'also specified --slurm or --dlc. 
' - 'The "eval" configuration will be overridden by ' - 'your runtime arguments.') - - if args.dlc or args.slurm or cfg.get('eval', None) is None: - fill_eval_cfg(cfg, args) - if args.dump_eval_details: - cfg.eval.runner.task.dump_details = True - - if args.partition is not None: - if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: - cfg.eval.runner.partition = args.partition - cfg.eval.runner.quotatype = args.quotatype - else: - logger.warning('SlurmRunner is not used, so the partition ' - 'argument is ignored.') - if args.debug: - cfg.eval.runner.debug = True - if args.lark: - cfg.eval.runner.lark_bot_url = cfg['lark_bot_url'] - cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/') - partitioner = PARTITIONERS.build(cfg.eval.partitioner) - tasks = partitioner(cfg) - if args.dry_run: - return - runner = RUNNERS.build(cfg.eval.runner) - - # For meta-review-judge in subjective evaluation - if isinstance(tasks, list) and len(tasks) != 0 and isinstance( - tasks[0], list): - for task_part in tasks: - runner(task_part) - else: - runner(tasks) - - # visualize - if args.mode in ['all', 'eval', 'viz']: - summarizer_cfg = cfg.get('summarizer', {}) - if not summarizer_cfg or summarizer_cfg.get('type', None) is None: - summarizer_cfg['type'] = DefaultSummarizer - summarizer_cfg['config'] = cfg - summarizer = build_from_cfg(summarizer_cfg) - summarizer.summarize(time_str=cfg_time_str) - +from opencompass.cli.main import main if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index 2fade798b..08c81d0df 100644 --- a/setup.py +++ b/setup.py @@ -103,39 +103,46 @@ def get_version(): def do_setup(): - setup(name='opencompass', - author='OpenCompass Contributors', - version=get_version(), - description='A comprehensive toolkit for large model evaluation', - url='https://github.com/open-compass/opencompass', - long_description=readme(), - long_description_content_type='text/markdown', - maintainer='OpenCompass Authors', - cmdclass={'download_nltk': DownloadNLTK}, - setup_requires=['nltk==3.8'], - python_requires='>=3.8.0', - install_requires=parse_requirements('requirements/runtime.txt'), - license='Apache License 2.0', - packages=find_packages(exclude=[ - 'test*', - 'configs', - 'data', - 'docs', - 'tools', - 'tmp', - ]), - keywords=[ - 'AI', 'NLP', 'in-context learning', 'large language model', - 'evaluation', 'benchmark', 'llm' - ], - classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', - ]) + setup( + name='opencompass', + author='OpenCompass Contributors', + version=get_version(), + description='A comprehensive toolkit for large model evaluation', + url='https://github.com/open-compass/opencompass', + long_description=readme(), + long_description_content_type='text/markdown', + maintainer='OpenCompass Authors', + cmdclass={'download_nltk': DownloadNLTK}, + setup_requires=['nltk==3.8'], + python_requires='>=3.8.0', + # install_requires=parse_requirements('requirements/runtime.txt'), + license='Apache License 2.0', + packages=find_packages(exclude=[ + 'test*', + 'configs', + 'data', + 'docs', + 'tools', + 'tmp', + ]), + keywords=[ + 'AI', 'NLP', 'in-context learning', 'large language model', + 'evaluation', 'benchmark', 'llm' + ], + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 
3.10', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + ], + entry_points={ + 'console_scripts': [ + 'opencompass = opencompass.cli.main:main', + ], + }, + ) if __name__ == '__main__': diff --git a/tools/prediction_merger.py b/tools/prediction_merger.py index 47b9439b0..cefef821a 100644 --- a/tools/prediction_merger.py +++ b/tools/prediction_merger.py @@ -1,7 +1,7 @@ import argparse import copy import json -import os.path as osp +import os import mmengine from mmengine.config import Config, ConfigDict @@ -13,24 +13,16 @@ def parse_args(): parser = argparse.ArgumentParser( description='Merge patitioned predictions') parser.add_argument('config', help='Train config file path') - parser.add_argument('-w', - '--work-dir', - help='Work path, all the outputs will be ' - 'saved in this path, including the slurm logs, ' - 'the evaluation results, the summary results, etc.' - 'If not specified, the work_dir will be set to ' - './outputs/default.', - default=None, - type=str) + parser.add_argument('-w', '--work-dir', default=None, type=str) + parser.add_argument('-r', '--reuse', default='latest', type=str) + parser.add_argument('-c', '--clean', action='store_true') args = parser.parse_args() return args class PredictionMerger: - """""" def __init__(self, cfg: ConfigDict) -> None: - self.cfg = cfg self.model_cfg = copy.deepcopy(self.cfg['model']) self.dataset_cfg = copy.deepcopy(self.cfg['dataset']) @@ -39,32 +31,29 @@ def __init__(self, cfg: ConfigDict) -> None: def run(self): filename = get_infer_output_path( self.model_cfg, self.dataset_cfg, - osp.join(self.work_dir, 'predictions')) - root, ext = osp.splitext(filename) + os.path.join(self.work_dir, 'predictions')) + root, ext = os.path.splitext(filename) partial_filename = root + '_0' + ext - if osp.exists(osp.realpath(filename)): + if os.path.exists(os.path.realpath(filename)): return - if not osp.exists(osp.realpath(partial_filename)): + if not os.path.exists(os.path.realpath(partial_filename)): print(f'{filename} not found') return # Load predictions partial_filenames = [] - if osp.exists(osp.realpath(filename)): - preds = mmengine.load(filename) - else: - preds, offset = {}, 0 - i = 1 - while osp.exists(osp.realpath(partial_filename)): - partial_filenames.append(osp.realpath(partial_filename)) - _preds = mmengine.load(partial_filename) - partial_filename = root + f'_{i}' + ext - i += 1 - for _o in range(len(_preds)): - preds[str(offset)] = _preds[str(_o)] - offset += 1 + preds, offset = {}, 0 + i = 1 + while os.path.exists(os.path.realpath(partial_filename)): + partial_filenames.append(os.path.realpath(partial_filename)) + _preds = mmengine.load(partial_filename) + partial_filename = root + f'_{i}' + ext + i += 1 + for _o in range(len(_preds)): + preds[str(offset)] = _preds[str(_o)] + offset += 1 dataset = build_dataset_from_cfg(self.dataset_cfg) if len(preds) != len(dataset.test): @@ -75,6 +64,11 @@ def run(self): with open(filename, 'w', encoding='utf-8') as f: json.dump(preds, f, indent=4, ensure_ascii=False) + if self.cfg['clean']: + for partial_filename in partial_filenames: + print(f'Remove {partial_filename}') + os.remove(partial_filename) + def dispatch_tasks(cfg): for model in cfg['models']: @@ -82,7 +76,8 @@ def dispatch_tasks(cfg): PredictionMerger({ 'model': model, 'dataset': dataset, - 'work_dir': cfg['work_dir'] + 'work_dir': cfg['work_dir'], + 'clean': cfg['clean'] }).run() @@ -94,6 +89,22 @@ def main(): cfg['work_dir'] = args.work_dir else: 
cfg.setdefault('work_dir', './outputs/default') + + if args.reuse: + if args.reuse == 'latest': + if not os.path.exists(cfg.work_dir) or not os.listdir( + cfg.work_dir): + print('No previous results to reuse!') + return + else: + dirs = os.listdir(cfg.work_dir) + dir_time_str = sorted(dirs)[-1] + else: + dir_time_str = args.reuse + cfg['work_dir'] = os.path.join(cfg.work_dir, dir_time_str) + + cfg['clean'] = args.clean + dispatch_tasks(cfg)
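With the new --reuse and --clean options, prediction_merger.py can be pointed at the latest (or a specific) timestamped run under the work dir and can delete the partial prediction files once they are merged. A usage sketch; the config path here is a placeholder:

python tools/prediction_merger.py configs/eval_demo.py -w outputs/default -r latest -c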