32
32
from typing import Any , Dict , List , Optional , Union
33
33
34
34
from lighteval .metrics .llm_as_judge import JudgeLM
35
- from lighteval .metrics .metrics import Metric , MetricCategory , Metrics
36
- from lighteval .metrics .utils .metric_utils import MetricUseCase
35
+ from lighteval .metrics .metrics import Metric , Metrics
37
36
from lighteval .tasks .default_prompts import LETTER_INDICES
38
37
from lighteval .tasks .lighteval_task import LightevalTaskConfig
39
- from lighteval .tasks .requests import Doc
38
+ from lighteval .tasks .requests import Doc , SamplingMethod
40
39
41
40
42
41
# fmt: off
@@ -104,7 +103,7 @@ def __init__(
104
103
hf_subset = hf_subset ,
105
104
prompt_function = arabic_mmlu_pfn ,
106
105
hf_repo = "MBZUAI/ArabicMMLU" ,
107
- metric = [Metrics .loglikelihood_acc_norm ],
106
+ metrics = [Metrics .loglikelihood_acc_norm ],
108
107
hf_avail_splits = ["test" ],
109
108
evaluation_splits = ["test" ],
110
109
few_shots_split = ["dev" ],
@@ -166,7 +165,7 @@ def __init__(
166
165
hf_subset = hf_subset ,
167
166
prompt_function = arabic_mmlu_ht_pfn ,
168
167
hf_repo = "MBZUAI/human_translated_arabic_mmlu" ,
169
- metric = [Metrics .loglikelihood_acc_norm ],
168
+ metrics = [Metrics .loglikelihood_acc_norm ],
170
169
hf_avail_splits = ["test" ],
171
170
evaluation_splits = ["test" ],
172
171
few_shots_split = None ,
@@ -231,7 +230,7 @@ def __init__(
231
230
hf_subset = hf_subset ,
232
231
prompt_function = arabic_mmlu_mt_pfn ,
233
232
hf_repo = "OALL/Arabic_MMLU" ,
234
- metric = [Metrics .loglikelihood_acc_norm ],
233
+ metrics = [Metrics .loglikelihood_acc_norm ],
235
234
hf_avail_splits = ["test" , "dev" ],
236
235
evaluation_splits = ["test" ],
237
236
few_shots_split = "dev" ,
@@ -287,7 +286,7 @@ def __init__(
287
286
hf_subset = hf_subset ,
288
287
prompt_function = acva_pfn ,
289
288
hf_repo = "OALL/ACVA" ,
290
- metric = [Metrics .loglikelihood_acc_norm ],
289
+ metrics = [Metrics .loglikelihood_acc_norm ],
291
290
hf_avail_splits = ["test" , "validation" ],
292
291
evaluation_splits = ["test" ],
293
292
few_shots_split = "validation" ,
@@ -344,7 +343,7 @@ def __init__(
344
343
hf_subset = hf_subset ,
345
344
prompt_function = aratrust_pfn ,
346
345
hf_repo = "asas-ai/AraTrust-categorized" ,
347
- metric = [Metrics .loglikelihood_acc_norm ],
346
+ metrics = [Metrics .loglikelihood_acc_norm ],
348
347
hf_avail_splits = ["train" ],
349
348
evaluation_splits = ["train" ],
350
349
few_shots_split = None ,
@@ -393,7 +392,7 @@ def arabic_exams_pfn(line, task_name: str = None):
393
392
evaluation_splits = ["test" ],
394
393
few_shots_split = "validation" ,
395
394
few_shots_select = "sequential" ,
396
- metric = [Metrics .loglikelihood_acc_norm ],
395
+ metrics = [Metrics .loglikelihood_acc_norm ],
397
396
trust_dataset = True ,
398
397
version = 0 ,
399
398
)
@@ -444,7 +443,7 @@ def __init__(
444
443
hf_subset = hf_subset ,
445
444
prompt_function = alghafa_pfn ,
446
445
hf_repo = "OALL/AlGhafa-Arabic-LLM-Benchmark-Native" ,
447
- metric = [Metrics .loglikelihood_acc_norm ],
446
+ metrics = [Metrics .loglikelihood_acc_norm ],
448
447
hf_avail_splits = ["test" , "validation" ],
449
448
evaluation_splits = ["test" ],
450
449
few_shots_split = "validation" ,
@@ -471,7 +470,7 @@ def __init__(
471
470
evaluation_splits = ["test" ],
472
471
few_shots_split = "validation" ,
473
472
few_shots_select = "sequential" ,
474
- metric = [Metrics .loglikelihood_acc_norm ],
473
+ metrics = [Metrics .loglikelihood_acc_norm ],
475
474
trust_dataset = True ,
476
475
version = 0 ,
477
476
)
@@ -488,7 +487,7 @@ def __init__(
488
487
evaluation_splits = ["test" ],
489
488
few_shots_split = "validation" ,
490
489
few_shots_select = "sequential" ,
491
- metric = [Metrics .loglikelihood_acc_norm ],
490
+ metrics = [Metrics .loglikelihood_acc_norm ],
492
491
trust_dataset = True ,
493
492
version = 0 ,
494
493
)
@@ -505,7 +504,7 @@ def __init__(
505
504
evaluation_splits = ["test" ],
506
505
few_shots_split = "validation" ,
507
506
few_shots_select = "sequential" ,
508
- metric = [Metrics .loglikelihood_acc_norm ],
507
+ metrics = [Metrics .loglikelihood_acc_norm ],
509
508
trust_dataset = True ,
510
509
version = 0 ,
511
510
)
@@ -522,7 +521,7 @@ def __init__(
522
521
evaluation_splits = ["test" ],
523
522
few_shots_split = "validation" ,
524
523
few_shots_select = "sequential" ,
525
- metric = [Metrics .loglikelihood_acc_norm ],
524
+ metrics = [Metrics .loglikelihood_acc_norm ],
526
525
trust_dataset = True ,
527
526
version = 0 ,
528
527
)
@@ -539,7 +538,7 @@ def __init__(
539
538
evaluation_splits = ["test" ],
540
539
few_shots_split = "validation" ,
541
540
few_shots_select = "sequential" ,
542
- metric = [Metrics .loglikelihood_acc_norm ],
541
+ metrics = [Metrics .loglikelihood_acc_norm ],
543
542
trust_dataset = True ,
544
543
version = 0 ,
545
544
)
@@ -556,7 +555,7 @@ def __init__(
556
555
evaluation_splits = ["test" ],
557
556
few_shots_split = "validation" ,
558
557
few_shots_select = "sequential" ,
559
- metric = [Metrics .loglikelihood_acc_norm ],
558
+ metrics = [Metrics .loglikelihood_acc_norm ],
560
559
trust_dataset = True ,
561
560
version = 0 ,
562
561
)
@@ -594,7 +593,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
594
593
evaluation_splits = ["test" ],
595
594
few_shots_split = "validation" ,
596
595
few_shots_select = "sequential" ,
597
- metric = [Metrics .loglikelihood_acc_norm ],
596
+ metrics = [Metrics .loglikelihood_acc_norm ],
598
597
trust_dataset = True ,
599
598
version = 0 ,
600
599
)
@@ -629,7 +628,7 @@ def copa_arabic_pfn(line, task_name: str = None):
629
628
evaluation_splits = ["test" ],
630
629
few_shots_split = "validation" ,
631
630
few_shots_select = "sequential" ,
632
- metric = [Metrics .loglikelihood_acc_norm ],
631
+ metrics = [Metrics .loglikelihood_acc_norm ],
633
632
trust_dataset = True ,
634
633
version = 0 ,
635
634
)
@@ -673,7 +672,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
673
672
evaluation_splits = ["test" ],
674
673
few_shots_split = "validation" ,
675
674
few_shots_select = "sequential" ,
676
- metric = [Metrics .loglikelihood_acc_norm ],
675
+ metrics = [Metrics .loglikelihood_acc_norm ],
677
676
trust_dataset = True ,
678
677
version = 0 ,
679
678
)
@@ -710,7 +709,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
710
709
evaluation_splits = ["test" ],
711
710
few_shots_split = "validation" ,
712
711
few_shots_select = "sequential" ,
713
- metric = [Metrics .loglikelihood_acc_norm ],
712
+ metrics = [Metrics .loglikelihood_acc_norm ],
714
713
trust_dataset = True ,
715
714
version = 0 ,
716
715
)
@@ -761,7 +760,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
761
760
evaluation_splits = ["test" ],
762
761
few_shots_split = "validation" ,
763
762
few_shots_select = "sequential" ,
764
- metric = [Metrics .loglikelihood_acc_norm ],
763
+ metrics = [Metrics .loglikelihood_acc_norm ],
765
764
trust_dataset = True ,
766
765
version = 0 ,
767
766
)
@@ -819,7 +818,7 @@ def __init__(
819
818
hf_subset = hf_subset ,
820
819
prompt_function = madinah_qa_pfn ,
821
820
hf_repo = "MBZUAI/MadinahQA" ,
822
- metric = [Metrics .loglikelihood_acc_norm ],
821
+ metrics = [Metrics .loglikelihood_acc_norm ],
823
822
hf_avail_splits = ["test" ],
824
823
evaluation_splits = ["test" ],
825
824
few_shots_split = ["dev" ],
@@ -849,11 +848,10 @@ def __init__(self, judge: JudgeLM):
849
848
"""
850
849
self .judge = judge
851
850
self .metric_name = "llm_as_judge"
852
- self .category = MetricCategory . LLM_AS_JUDGE
851
+ self .category = SamplingMethod . GENERATIVE
853
852
self .corpus_level_fn = self .aggregate_scores
854
853
self .sample_level_fn = self ._sample_level_fn
855
854
self .higher_is_better = True # Fixed tuple syntax
856
- self .use_case = MetricUseCase .NONE
857
855
858
856
def compute (self , responses : list [str ], formatted_docs : list [Doc ], ** kwargs ) -> dict [str , float ]:
859
857
"""
@@ -1039,7 +1037,7 @@ def process_judge_response(response) -> float:
1039
1037
hf_subset = None ,
1040
1038
hf_avail_splits = ["train" ],
1041
1039
evaluation_splits = ["train" ],
1042
- metric = [wrapped_judge ],
1040
+ metrics = [wrapped_judge ],
1043
1041
trust_dataset = True ,
1044
1042
generation_size = 200 ,
1045
1043
stop_sequence = [],
0 commit comments