Commit ac99d10

Merge branch 'main' into clem_mmlupro
2 parents a549107 + a455539 commit ac99d10

95 files changed: +4650, -6668 lines


community_tasks/_template.py (6 additions, 10 deletions)

@@ -30,13 +30,10 @@
 """

 import numpy as np
-from aenum import extend_enum

-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
-from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
-from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.metrics.metrics import SampleLevelMetric
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.requests import Doc, SamplingMethod


 # DEFINE YOUR PROMPT FUNCTIONS
@@ -49,7 +46,7 @@ def prompt_fn(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query="",
-        choices="",
+        choices=[""],
         gold_index=0,
         instruction="",
     )
@@ -68,7 +65,7 @@ def prompt_fn(line, task_name: str = None):
     evaluation_splits=[],
     few_shots_split="",
     few_shots_select="",
-    metric=[], # select your metric in Metrics
+    metrics=[], # select your metric in Metrics
 )

 # EVALS WITH SUBSET
@@ -91,7 +88,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
             hf_repo="",
-            metric=[custom_metric], # select your metric in Metrics or use your custom_metric
+            metrics=[custom_metric], # select your metric in Metrics or use your custom_metric
             hf_avail_splits=[],
             evaluation_splits=[],
             few_shots_split="",
@@ -111,8 +108,7 @@ def __init__(
 custom_metric = SampleLevelMetric(
     metric_name="my_custom_metric_name",
     higher_is_better=True,
-    category=MetricCategory.IGNORED,
-    use_case=MetricUseCase.NONE,
+    category=SamplingMethod.GENERATIVE, # or LOGPROBS, PERPLEXITY, etc.
     sample_level_fn=lambda x: x, # how to compute score for one sample
     corpus_level_fn=np.mean, # aggregation
 )
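Taken together, the template changes above amount to the following new-style skeleton. This is a minimal sketch assembled only from the changed lines: the metric's `category` is now a `SamplingMethod` (the old `MetricCategory`/`MetricUseCase` pair is gone) and task configs take `metrics=` instead of `metric=`. The task name and the empty placeholder values are illustrative, not a real evaluation.

```python
import numpy as np

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod


def prompt_fn(line, task_name: str = None):
    # placeholder prompt formatting, as in the template
    return Doc(task_name=task_name, query="", choices=[""], gold_index=0, instruction="")


custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=SamplingMethod.GENERATIVE,  # or LOGPROBS, PERPLEXITY, etc.
    sample_level_fn=lambda x: x,  # how to compute score for one sample
    corpus_level_fn=np.mean,  # aggregation
)

task = LightevalTaskConfig(
    name="mytask",  # illustrative name, not from the diff
    prompt_function=prompt_fn,
    suite=["community"],
    hf_repo="",  # template placeholder
    hf_subset="default",
    hf_avail_splits=[],
    evaluation_splits=[],
    few_shots_split="",
    few_shots_select="",
    metrics=[custom_metric],  # select your metric in Metrics or use your custom_metric
)
```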

community_tasks/arabic_evals.py (23 additions, 25 deletions)

@@ -32,11 +32,10 @@
 from typing import Any, Dict, List, Optional, Union

 from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
-from lighteval.metrics.utils.metric_utils import MetricUseCase
+from lighteval.metrics.metrics import Metric, Metrics
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.requests import Doc, SamplingMethod


 # fmt: off
@@ -104,7 +103,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_pfn,
             hf_repo="MBZUAI/ArabicMMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -166,7 +165,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_ht_pfn,
             hf_repo="MBZUAI/human_translated_arabic_mmlu",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=None,
@@ -231,7 +230,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_mt_pfn,
             hf_repo="OALL/Arabic_MMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",
@@ -287,7 +286,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=acva_pfn,
             hf_repo="OALL/ACVA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -344,7 +343,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=aratrust_pfn,
             hf_repo="asas-ai/AraTrust-categorized",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["train"],
             evaluation_splits=["train"],
             few_shots_split=None,
@@ -393,7 +392,7 @@ def arabic_exams_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -444,7 +443,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=alghafa_pfn,
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -471,7 +470,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -488,7 +487,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -505,7 +504,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -522,7 +521,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -539,7 +538,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -556,7 +555,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -594,7 +593,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -629,7 +628,7 @@ def copa_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -673,7 +672,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -710,7 +709,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -761,7 +760,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -819,7 +818,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=madinah_qa_pfn,
             hf_repo="MBZUAI/MadinahQA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -849,11 +848,10 @@ def __init__(self, judge: JudgeLM):
         """
         self.judge = judge
         self.metric_name = "llm_as_judge"
-        self.category = MetricCategory.LLM_AS_JUDGE
+        self.category = SamplingMethod.GENERATIVE
         self.corpus_level_fn = self.aggregate_scores
         self.sample_level_fn = self._sample_level_fn
         self.higher_is_better = True # Fixed tuple syntax
-        self.use_case = MetricUseCase.NONE

     def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
         """
@@ -1039,7 +1037,7 @@ def process_judge_response(response) -> float:
     hf_subset=None,
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
-    metric=[wrapped_judge],
+    metrics=[wrapped_judge],
     trust_dataset=True,
     generation_size=200,
     stop_sequence=[],
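The last two hunks also touch the LLM-as-judge wrapper: its `category` attribute moves from `MetricCategory.LLM_AS_JUDGE` to `SamplingMethod.GENERATIVE`, and the `use_case` attribute disappears. Below is a minimal sketch of that constructor after the migration; the class name and the stubbed helper methods are illustrative additions, only the attribute assignments come from the hunk above.

```python
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.tasks.requests import SamplingMethod


class JudgeMetricWrapper:  # illustrative name for the wrapper edited above
    def __init__(self, judge: JudgeLM):
        self.judge = judge
        self.metric_name = "llm_as_judge"
        self.category = SamplingMethod.GENERATIVE  # was MetricCategory.LLM_AS_JUDGE
        self.corpus_level_fn = self.aggregate_scores
        self.sample_level_fn = self._sample_level_fn
        self.higher_is_better = True
        # self.use_case = MetricUseCase.NONE  # removed: use_case no longer exists

    def aggregate_scores(self, scores: list) -> float:
        # illustrative aggregation; the real file defines its own
        return sum(scores) / len(scores) if scores else 0.0

    def _sample_level_fn(self, *args, **kwargs) -> float:
        # placeholder; the real wrapper scores one sample with the judge
        return 0.0
```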

community_tasks/french_evals.py (4 additions, 13 deletions)

@@ -32,16 +32,7 @@

 import random

-import numpy as np
-from aenum import extend_enum
-
-import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
-from lighteval.metrics.utils.metric_utils import (
-    MetricCategory,
-    MetricUseCase,
-    SampleLevelMetricGrouping,
-)
+from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.extended.ifeval.main import ifeval_metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
@@ -106,7 +97,7 @@ def prompt_bac_fr(line, task_name: str = None):
     suite=["community"],
     hf_repo="fr-gouv-coordination-ia/IFEval-fr",
     hf_subset="default",
-    metric=[ifeval_metrics],
+    metrics=[ifeval_metrics],
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
     few_shots_split="train",
@@ -128,7 +119,7 @@ def prompt_bac_fr(line, task_name: str = None):
     few_shots_split=None,
     few_shots_select="random_sampling",
     generation_size=1,
-    metric=[Metrics.loglikelihood_acc],
+    metrics=[Metrics.loglikelihood_acc],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -146,7 +137,7 @@ def prompt_bac_fr(line, task_name: str = None):
     few_shots_split=None,
     few_shots_select="random_sampling",
     generation_size=1,
-    metric=[Metrics.quasi_exact_match_math, Metrics.exact_match],
+    metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
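For context, the IFEval-fr hunk shows how a prebuilt metric group flows through the renamed field: `ifeval_metrics` is simply listed under `metrics=`. A minimal sketch under that assumption follows; only the repo, subset, splits, and metric come from the diff, while the task name and the prompt function are illustrative placeholders.

```python
from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def prompt_ifeval_fr(line, task_name: str = None):
    # placeholder prompt function, not the one defined in french_evals.py
    return Doc(task_name=task_name, query=line.get("prompt", ""), choices=[""], gold_index=0)


ifeval_fr_task = LightevalTaskConfig(
    name="ifeval-fr",  # illustrative name
    prompt_function=prompt_ifeval_fr,
    suite=["community"],
    hf_repo="fr-gouv-coordination-ia/IFEval-fr",
    hf_subset="default",
    metrics=[ifeval_metrics],  # renamed field, as in the hunk above
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
)
```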

docs/source/_toctree.yml (3 additions, 1 deletion)

@@ -41,9 +41,11 @@
   - local: package_reference/evaluation_tracker
     title: EvaluationTracker
   - local: package_reference/models
-    title: Models and ModelConfigs
+    title: Model Configs
   - local: package_reference/pipeline
     title: Pipeline
+  - local: package_reference/models_outputs
+    title: Model's Output
   title: Main classes
 - local: package_reference/metrics
   title: Metrics

docs/source/adding-a-custom-task.mdx (2 additions, 48 deletions)

@@ -41,7 +41,6 @@ def prompt_fn(line, task_name: str = None):
         query=line["question"],
         choices=[f" {c}" for c in line["choices"]],
         gold_index=line["gold"],
-        instruction="",
     )
 ```

@@ -53,8 +52,7 @@ in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-
 custom_metric = SampleLevelMetric(
     metric_name="my_custom_metric_name",
     higher_is_better=True,
-    category=MetricCategory.IGNORED,
-    use_case=MetricUseCase.NONE,
+    category=SamplingMethod.{GENERATIVE,LOGPROBS},
     sample_level_fn=lambda x: x, # how to compute score for one sample
     corpus_level_fn=np.mean, # How to aggregate the samples metrics
 )
@@ -77,7 +75,7 @@ task = LightevalTaskConfig(
     evaluation_splits=[],
     few_shots_split=None,
     few_shots_select=None,
-    metric=[], # select your metric in Metrics
+    metrics=[], # select your metric in Metrics
 )
 ```

@@ -111,50 +109,6 @@ class CustomSubsetTask(LightevalTaskConfig):
 SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
 ```

-Here is a list of the parameters and their meaning:
-
-- `name` (str), your evaluation name
-- `suite` (list), the suite(s) to which your evaluation should belong. This
-  field allows us to compare different task implementations and is used as a
-  task selection to differentiate the versions to launch. At the moment, you'll
-  find the keywords ["helm", "bigbench", "original", "lighteval", "community",
-  "custom"]; for core evals, please choose `lighteval`.
-- `prompt_function` (Callable), the prompt function you defined in the step
-  above
-- `hf_repo` (str), the path to your evaluation dataset on the hub
-- `hf_subset` (str), the specific subset you want to use for your evaluation
-  (note: when the dataset has no subset, fill this field with `"default"`, not
-  with `None` or `""`)
-- `hf_avail_splits` (list), all the splits available for your dataset (train,
-  valid or validation, test, other...)
-- `evaluation_splits` (list), the splits you want to use for evaluation
-- `few_shots_split` (str, can be `null`), the specific split from which you
-  want to select samples for your few-shot examples. It should be different
-  from the sets included in `evaluation_splits`
-- `few_shots_select` (str, can be `null`), the method that you will use to
-  select items for your few-shot examples. Can be `null`, or one of:
-  - `balanced` select examples from the `few_shots_split` with balanced
-    labels, to avoid skewing the few shot examples (hence the model
-    generations) toward one specific label
-  - `random` selects examples at random from the `few_shots_split`
-  - `random_sampling` selects new examples at random from the
-    `few_shots_split` for every new item, but if a sampled item is equal to
-    the current one, it is removed from the available samples
-  - `random_sampling_from_train` selects new examples at random from the
-    `few_shots_split` for every new item, but if a sampled item is equal to
-    the current one, it is kept! Only use this if you know what you are
-    doing.
-  - `sequential` selects the first `n` examples of the `few_shots_split`
-- `generation_size` (int), the maximum number of tokens allowed for a
-  generative evaluation. If your evaluation is a log likelihood evaluation
-  (multi-choice), this value should be -1
-- `stop_sequence` (list), a list of strings acting as end of sentence tokens
-  for your generation
-- `metric` (list), the metrics you want to use for your evaluation (see next
-  section for a detailed explanation)
-- `trust_dataset` (bool), set to True if you trust the dataset.
-
-
 Then you need to add your task to the `TASKS_TABLE` list.

 ```python
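The updated docs snippet writes `category=SamplingMethod.{GENERATIVE,LOGPROBS}` as a placeholder for the two kinds of tasks. For completeness, here is a hedged sketch of the loglikelihood-flavoured variant; the metric name and the identity scoring lambda are illustrative, not lighteval's built-in accuracy metric.

```python
import numpy as np

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.tasks.requests import SamplingMethod

my_logprob_metric = SampleLevelMetric(
    metric_name="my_logprob_metric",  # illustrative name
    higher_is_better=True,
    category=SamplingMethod.LOGPROBS,  # multi-choice / loglikelihood-style scoring
    sample_level_fn=lambda x: x,  # placeholder per-sample scorer
    corpus_level_fn=np.mean,  # aggregation across samples
)
```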
