Skip to content

Commit a77b8a5

Browse files
author
Fengzhe Zhou
authored
[Sync] format (#1214)
1 parent d59189b commit a77b8a5

9 files changed

Lines changed: 561 additions & 9 deletions

File tree

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from opencompass.openicl.icl_prompt_template import PromptTemplate
2+
from opencompass.openicl.icl_retriever import ZeroRetriever
3+
from opencompass.openicl.icl_inferencer import GenInferencer
4+
from opencompass.openicl.icl_evaluator import LMEvaluator
5+
from opencompass.datasets import CompassBenchDataset
6+
7+
# Columns handed to the prompt templates, plus the column that carries the
# per-sample judging metadata.
subjective_reader_cfg = dict(
    input_columns=['question', 'judge_prompt'],
    output_column='judge',
)

# Passed to the dataset as ``path``; each entry of ``versions`` is passed as
# ``name``.
data_path = 'data/subjective/compassbench'

subjective_datasets = []

versions = ['CompassbenchV1']

for version_abbr in versions:
    # Inference stage: the question itself is the whole HUMAN turn.
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt='{question}'),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(
            type=GenInferencer,
            max_seq_len=4096,
            max_out_len=2048,
        ),
    )

    # Evaluation stage: the judge prompt is taken from the ``judge_prompt``
    # input column of each sample.
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt='{judge_prompt}'),
                ]),
            ),
        ),
        pred_role='BOT',
    )

    subjective_datasets.append(
        dict(
            abbr=version_abbr,
            type=CompassBenchDataset,
            path=data_path,
            name=version_abbr,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
        ))
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from os import getenv as gv
2+
from opencompass.models import HuggingFaceCausalLM
3+
from mmengine.config import read_base
4+
5+
with read_base():
6+
from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets
7+
8+
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
9+
from opencompass.partitioners import NaivePartitioner, SizePartitioner
10+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
11+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
12+
from opencompass.runners import LocalRunner
13+
from opencompass.runners import SlurmSequentialRunner
14+
from opencompass.tasks import OpenICLInferTask
15+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
16+
from opencompass.summarizers import CompassBenchSummarizer
17+
18+
# Role mapping handed to the API-backed models below as ``meta_template``.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)
25+
26+
# -------------Inference Stage ----------------------------------------
27+
28+
from opencompass.models import HuggingFacewithChatTemplate
29+
30+
# Models under evaluation; passed to the eval partitioner as
# ``compare_models``.
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        generation_kwargs=dict(do_sample=True),
    ),
]
44+
45+
# Shallow copy of the dataset configs pulled in through ``read_base``.
datasets = list(subjective_datasets)

# Inference stage: naive partitioning, executed through the Slurm runner.
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='reserved',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)
57+
58+
# Reference model; the eval partitioner receives it as ``base_models``.
gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    # The key will be obtained from $OPENAI_API_KEY, but you can write down
    # your key here as well.
    key='',
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-infer gpt4's predictions, or use the pre-committed gpt4 predictions instead
71+
72+
# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
# GPT-4 Turbo judge. Kept under its own name so that it is not shadowed by
# the ``judge_models`` list defined right after it. To judge with GPT-4
# instead, assign ``judge_models = gpt4_judge_models``.
gpt4_judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

# Judges actually used by the evaluation stage. The three entries share the
# same checkpoint and settings and differ only in ``abbr``.
# NOTE(review): ``path`` is an absolute filesystem location that looks
# user-specific -- confirm it exists in the target environment.
judge_models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b2',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b3',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
]
118+
119+
## ------------- Evaluation Configuration
# NOTE(review): this name shadows the ``eval`` builtin; presumably the
# framework looks the key up by name -- confirm before renaming.
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=SubjectiveEvalTask),
    ),
    #given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)

work_dir = 'outputs/compassbench/'

summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')

configs/summarizers/groups/charm_reason.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,16 @@
2020
]
2121

2222

23-
charm_reaso_summary_groups = []
23+
charm_reason_summary_groups = []
2424
for prompt in prompts:
2525
for region in regions:
2626
subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
27-
charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
27+
charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
2828

2929
for prompt in prompts:
3030
subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
31-
charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
31+
charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
3232

33-
charm_reaso_summary_groups.append(
33+
charm_reason_summary_groups.append(
3434
{'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
3535
)

opencompass/datasets/subjective/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
22
from .arena_hard import ArenaHardDataset # noqa: F401, F403
33
from .compass_arena import CompassArenaDataset # noqa: F401, F403
4+
from .compassbench import CompassBenchDataset # noqa: F401, F403
45
from .corev2 import Corev2Dataset # noqa: F401, F403
56
from .creationbench import CreationBenchDataset # noqa: F401, F403
67
from .information_retrival import IRDataset # noqa: F401, F403
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# flake8: noqa
2+
import json
3+
import os.path as osp
4+
5+
from datasets import Dataset
6+
7+
from opencompass.registry import LOAD_DATASET
8+
9+
from ..base import BaseDataset
10+
11+
base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
12+
[用户问题]
13+
{question}
14+
15+
[回答1开始]
16+
{prediction}
17+
[回答1结束]
18+
19+
[回答2开始]
20+
{prediction2}
21+
[回答2结束]
22+
23+
根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
24+
A. 回答1更好
25+
B. 回答2更好
26+
C. 回答1、2平局
27+
28+
如果你认为回答1更好,你的输出应形如:
29+
评价1:回答1 xxx
30+
评价2:回答2 xxx
31+
选择:[[A]]
32+
33+
如果你认为回答2更好,你的输出应形如:
34+
评价1:回答1 xxx
35+
评价2:回答2 xxx
36+
选择:[[B]]
37+
38+
如果你认为回答1、2打成平手,你的输出应形如:
39+
评价1:回答1 xxx
40+
评价2:回答2 xxx
41+
选择:[[C]]
42+
"""
43+
44+
base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
45+
A. Response 1 is better
46+
B. Response 2 is better
47+
C. Both responses are equal
48+
49+
[user's question]
50+
{question}
51+
52+
[Response 1 Start]
53+
{prediction}
54+
[Response 1 End]
55+
56+
[Response 2 Start]
57+
{prediction2}
58+
[Response 2 End]
59+
60+
If you believe that Response 1 is better, your output should be formatted as follows:
61+
Evaluation 1: Response 1 xxx
62+
Evaluation 2: Response 2 xxx
63+
Choice: [[A]]
64+
65+
If you believe that Response 2 is better, your output should be formatted as follows:
66+
Evaluation 1: Response 1 xxx
67+
Evaluation 2: Response 2 xxx
68+
Choice: [[B]]
69+
70+
If you believe that both responses are equally good, your output should be formatted as follows:
71+
Evaluation 1: Response 1 xxx
72+
Evaluation 2: Response 2 xxx
73+
Choice: [[C]]
74+
"""
75+
76+
77+
@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):
    """Dataset of questions paired with a language-specific judge prompt."""

    def load(self, path: str, name: str):
        """Read ``<path>/<name>.json`` and convert it into a ``Dataset``.

        Args:
            path: Directory that contains the data file.
            name: File stem; ``.json`` is appended to it.

        Returns:
            A ``Dataset`` with one row per problem. Each row holds the
            ``question``, the ``judge_prompt`` template chosen for the
            problem's language, and a ``judge`` dict with the keys ``lan``,
            ``level``, ``category`` and ``question``.
        """
        with open(osp.join(path, f'{name}.json'), 'r',
                  encoding='utf-8') as fp:
            problems = json.load(fp)

        rows = []
        for item in problems:
            query = item['question']
            language = item['language']
            extra = item['others']
            # 'zh' selects the Chinese template; any other language value
            # falls back to the English one.
            if language == 'zh':
                template = base_prompt_zh
            else:
                template = base_prompt_en
            rows.append(
                dict(
                    question=query,
                    judge_prompt=template,
                    judge=dict(
                        lan=language,
                        level=extra['level'],
                        category=item['category'],
                        question=query,
                    ),
                ))
        return Dataset.from_list(rows)

opencompass/summarizers/subjective/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .alpacaeval import AlpacaSummarizer
55
from .arenahard import ArenaHardSummarizer
66
from .compass_arena import CompassArenaSummarizer
7+
from .compassbench import CompassBenchSummarizer
78
from .corev2 import Corev2Summarizer
89
from .creationbench import CreationBenchSummarizer
910
from .flames import FlamesSummarizer

0 commit comments

Comments
 (0)