|
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

# The dataset definitions are imported inside mmengine's ``read_base()``
# context; ``subjective_datasets`` is used further down to build ``datasets``.
with read_base():
    from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer
| 17 | + |
# Role layout handed to the OpenAI-backed model entries in this file through
# their ``meta_template`` argument. Each round is a HUMAN turn followed by a
# BOT turn; only the BOT turn is marked ``generate=True``. SYSTEM is declared
# as a reserved role rather than as part of the round.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
| 25 | + |
# ------------- Inference Stage ----------------------------------------

from opencompass.models import HuggingFacewithChatTemplate

# Models under evaluation. This list is also passed to the evaluation
# partitioner as ``compare_models``.
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        # Sampling is switched on for generation (no other sampling
        # parameters are set here).
        generation_kwargs=dict(
            do_sample=True,
        ),
    )
]
| 44 | + |
# Shallow copy of the imported dataset list, so this config owns its own
# ``datasets`` list object.
datasets = [*subjective_datasets]
| 46 | + |
# Inference-stage settings: tasks are partitioned with NaivePartitioner and
# run as OpenICLInferTask jobs through SlurmSequentialRunner.
# NOTE(review): ``partition`` and ``quotatype`` look cluster-specific —
# confirm they match the Slurm setup this config is run on.
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='reserved',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)
| 57 | + |
# Reference model for the pairwise comparison; it is passed to the evaluation
# partitioner as the only entry of ``base_models``.
# Either re-run inference to obtain GPT-4's predictions, or use the
# pre-committed GPT-4 predictions instead.
gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)
| 71 | + |
# ------------- Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
# ``judge_models`` is assigned exactly once. To judge with an API model
# instead, replace the list with a single OpenAI entry, e.g.
# abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-1106-preview', key='' (taken
# from $OPENAI_API_KEY when empty), meta_template=api_meta_template,
# query_per_second=1, max_out_len=1024, max_seq_len=4096, batch_size=2,
# retry=20, temperature=0.
#
# The three judges below load the same checkpoint and differ only in
# ``abbr``, so they are generated from the list of abbreviations.
# NOTE(review): ``path`` is an absolute, user-specific filesystem location —
# confirm it exists on the target machine or point it at your own checkpoint.
judge_models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr=judge_abbr,
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    )
    for judge_abbr in ('internlm102b', 'internlm102b2', 'internlm102b3')
]
| 118 | + |
## ------------- Evaluation Configuration
# Evaluation-stage settings. The name ``eval`` shadows the Python builtin but
# is kept as-is, because renaming it would change the key this config exposes.
#
# The partitioner receives ``gpt4`` as the single base model, ``models`` as
# the models compared against it, and ``judge_models`` as the judges.
# NOTE(review): presumably ``mode='m2n'`` pairs every base model with every
# compare model and ``infer_order='double'`` judges each pair in both answer
# orders — confirm against the OpenCompass subjective-evaluation docs.
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
    #given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)
| 134 | + |
# Output directory for this run, given relative to the working directory.
work_dir = 'outputs/compassbench/'
| 136 | + |
# Results are summarized by CompassBenchSummarizer with the 'half_add'
# summary type.
summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
0 commit comments