From 65275d571c4fa27d5b3f7fcd90186b0bc3db77c6 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Thu, 15 May 2025 11:37:38 +0000
Subject: [PATCH 1/9] adds RULE

---
 src/lighteval/metrics/metrics.py        |   22 +-
 src/lighteval/models/vllm/vllm_model.py |    2 +-
 src/lighteval/tasks/default_prompts.py  |   17 +-
 src/lighteval/tasks/default_tasks.py    | 1335 ++++++++++++++++++++++-
 src/lighteval/tasks/lighteval_task.py   |    5 +-
 5 files changed, 1364 insertions(+), 17 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 0aede953d..e355c4e92 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -133,7 +133,27 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-
+    ruler_match_any = SampleLevelMetric(
+        metric_name="ruler_match_any",
+        sample_level_fn=lambda predictions, golds, formatted_doc: max(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        ),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    ruler_match_all = SampleLevelMetric(
+        metric_name="ruler_match_all",
+        sample_level_fn=lambda predictions, golds, formatted_doc: sum(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        )
+        / len(golds),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     bleurt = SampleLevelMetric(
         metric_name="bleurt",
         sample_level_fn=BLEURT().compute,
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 40352b4da..6992261e5 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -266,8 +266,8 @@ def greedy_until(
             if max_new_tokens is not None:
                 if context_size + max_new_tokens > self.max_length:
                     logger.warning(
-                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
+                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens."
                     )
                     context_size = self.max_length - max_new_tokens
                     if context_size < 0:
                         logger.critical(
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 2745b63c5..3bd0bd844 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -43,13 +43,28 @@
 # fmt: on


+def ruler(line, task_name: str = None):
+    query = line["input"]
+    choices = line["outputs"]
+    gold_index = 0
+
+    return Doc(query=query, choices=choices, gold_index=gold_index, task_name=task_name)
+
+
 def simpleqa(line, task_name: str = None):
     query = line["problem"]
     choices = [line["answer"]]
     gold_index = 0
+    instruction = "Only answer the question to complete the prompt, without any additional text.\n"
+    query = f"{instruction}{query}"

     return Doc(
-        task_name=task_name, query=query, choices=choices, gold_index=gold_index, specific={**eval(line["metadata"])}
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        instruction=instruction,
+        gold_index=gold_index,
+        specific={**eval(line["metadata"])},
     )


diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index 3960e6f5c..2d885532a 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -24,22 +24,1333 @@
 from lighteval.tasks.lighteval_task import LightevalTaskConfig


-abstract_narrative_understanding_bigbench = LightevalTaskConfig(
-    name="abstract_narrative_understanding",
-    suite=["bigbench", "bigbench_json"],
-    prompt_function=prompt.bigbench,
-    hf_repo="bigbench",
-    hf_subset="abstract_narrative_understanding",
-    hf_avail_splits=["default", "train", "validation"],
-    evaluation_splits=["default"],
+ruler_niah_single_1_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_1"],
+    evaluation_splits=["niah_single_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_single_3_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_3",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_3"],
+    evaluation_splits=["niah_single_3"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_single_2_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_2"],
+    evaluation_splits=["niah_single_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_multikey_1_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_multikey_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_multikey_1"],
+    evaluation_splits=["niah_multikey_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+
metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_131072 = LightevalTaskConfig( + name="ruler_131072:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_131072 = LightevalTaskConfig( + name="ruler_131072:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_131072 = LightevalTaskConfig( + name="ruler_131072:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_131072 = LightevalTaskConfig( + name="ruler_131072:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + 
hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_131072 = LightevalTaskConfig( + name="ruler_131072:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_65536 = LightevalTaskConfig( + name="ruler_65536:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_65536 = LightevalTaskConfig( + name="ruler_65536:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_65536 = LightevalTaskConfig( + name="ruler_65536:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_65536 = LightevalTaskConfig( + name="ruler_65536:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_65536 = LightevalTaskConfig( + name="ruler_65536:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + 
hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_32768 = LightevalTaskConfig( + name="ruler_32768:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_32768 = LightevalTaskConfig( + name="ruler_32768:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_32768 = LightevalTaskConfig( + name="ruler_32768:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_32768 = LightevalTaskConfig( + name="ruler_32768:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_32768 = LightevalTaskConfig( + name="ruler_32768:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", 
+ hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_16384 = LightevalTaskConfig( + name="ruler_16384:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_16384 = LightevalTaskConfig( + name="ruler_16384:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + 
trust_dataset=False, + version=0, +) + +ruler_fwe_16384 = LightevalTaskConfig( + name="ruler_16384:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_16384 = LightevalTaskConfig( + name="ruler_16384:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_16384 = LightevalTaskConfig( + name="ruler_16384:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], few_shots_split=None, few_shots_select=None, - generation_size=1, - metric=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - trust_dataset=True, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + 
hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_8192 = LightevalTaskConfig( + name="ruler_8192:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_8192 = LightevalTaskConfig( + name="ruler_8192:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_8192 = LightevalTaskConfig( + name="ruler_8192:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_8192 = LightevalTaskConfig( + name="ruler_8192:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) 
+ +ruler_qa_2_8192 = LightevalTaskConfig( + name="ruler_8192:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", 
+ hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_4096 = LightevalTaskConfig( + name="ruler_4096:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_4096 = LightevalTaskConfig( + name="ruler_4096:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_4096 = LightevalTaskConfig( + name="ruler_4096:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_4096 = LightevalTaskConfig( + name="ruler_4096:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_4096 = LightevalTaskConfig( + name="ruler_4096:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, version=0, ) + + agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index da09ec000..e34f73eb5 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -31,6 +31,7 @@ from huggingface_hub import TextGenerationInputGrammarType from multiprocess import Pool from pytablewriter import MarkdownTableWriter +from tqdm import tqdm from lighteval.metrics import ( apply_generative_metric, @@ -551,7 +552,7 @@ def load_datasets(tasks: 
list["LightevalTask"], dataset_loading_processes: int = task.dataset_filter, task.dataset_revision, ) - for task in tasks + for task in tqdm(tasks) ] else: with Pool(processes=dataset_loading_processes) as pool: @@ -618,7 +619,7 @@ def create_requests_from_tasks( # noqa: C901 task_dict_items = [(name, task) for name, task in task_dict.items() if len(task.eval_docs()) > 0] # Get lists of each type of request - for task_name, task in task_dict_items: + for task_name, task in tqdm(task_dict_items): task_docs = list(task.eval_docs()) n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs) evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples) From 0ef15e3f920d76ee1b79f0d5dc32d86780672df1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 15 May 2025 11:42:26 +0000 Subject: [PATCH 2/9] adds RULE --- src/lighteval/tasks/default_prompts.py | 7 +- src/lighteval/tasks/default_tasks.py | 95 -------------------------- 2 files changed, 3 insertions(+), 99 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 3bd0bd844..073fa8c15 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -47,22 +47,21 @@ def ruler(line, task_name: str = None): query = line["input"] choices = line["outputs"] gold_index = 0 + instruction = "Only answer the question to complete the prompt, without any additional text.\n" + query = f"{instruction}{query}" - return Doc(query=query, choices=choices, gold_index=gold_index, task_name=task_name) + return Doc(query=query, instruction=instruction, choices=choices, gold_index=gold_index, task_name=task_name) def simpleqa(line, task_name: str = None): query = line["problem"] choices = [line["answer"]] gold_index = 0 - instruction = "Only answer the question to complete the prompt, without any additional text.\n" - query = f"{instruction}{query}" return Doc( task_name=task_name, query=query, choices=choices, - instruction=instruction, gold_index=gold_index, specific={**eval(line["metadata"])}, ) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 2d885532a..4cb1fa416 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -40,7 +40,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_131072 = LightevalTaskConfig( name="ruler_131072:niah_single_3", suite=["lighteval"], @@ -57,7 +56,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_131072 = LightevalTaskConfig( name="ruler_131072:niah_single_2", suite=["lighteval"], @@ -74,7 +72,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_1", suite=["lighteval"], @@ -91,7 +88,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_2", suite=["lighteval"], @@ -108,7 +104,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_131072 = LightevalTaskConfig( name="ruler_131072:niah_multiquery", suite=["lighteval"], @@ -125,7 +120,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_3", suite=["lighteval"], @@ -142,7 +136,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_131072 = LightevalTaskConfig( name="ruler_131072:niah_multivalue", suite=["lighteval"], @@ -159,7 +152,6 @@ trust_dataset=False, version=0, ) - ruler_vt_131072 = LightevalTaskConfig( 
name="ruler_131072:vt", suite=["lighteval"], @@ -176,7 +168,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_131072 = LightevalTaskConfig( name="ruler_131072:cwe", suite=["lighteval"], @@ -193,7 +184,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_131072 = LightevalTaskConfig( name="ruler_131072:fwe", suite=["lighteval"], @@ -210,7 +200,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_131072 = LightevalTaskConfig( name="ruler_131072:qa_1", suite=["lighteval"], @@ -227,7 +216,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_131072 = LightevalTaskConfig( name="ruler_131072:qa_2", suite=["lighteval"], @@ -244,24 +232,6 @@ trust_dataset=False, version=0, ) - -ruler_niah_single_1_65536 = LightevalTaskConfig( - name="ruler_65536:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) - ruler_niah_single_3_65536 = LightevalTaskConfig( name="ruler_65536:niah_single_3", suite=["lighteval"], @@ -278,7 +248,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_65536 = LightevalTaskConfig( name="ruler_65536:niah_single_2", suite=["lighteval"], @@ -295,7 +264,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_1", suite=["lighteval"], @@ -312,7 +280,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_2", suite=["lighteval"], @@ -329,7 +296,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_65536 = LightevalTaskConfig( name="ruler_65536:niah_multiquery", suite=["lighteval"], @@ -346,7 +312,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_3", suite=["lighteval"], @@ -363,7 +328,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_65536 = LightevalTaskConfig( name="ruler_65536:niah_multivalue", suite=["lighteval"], @@ -380,7 +344,6 @@ trust_dataset=False, version=0, ) - ruler_vt_65536 = LightevalTaskConfig( name="ruler_65536:vt", suite=["lighteval"], @@ -397,7 +360,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_65536 = LightevalTaskConfig( name="ruler_65536:cwe", suite=["lighteval"], @@ -414,7 +376,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_65536 = LightevalTaskConfig( name="ruler_65536:fwe", suite=["lighteval"], @@ -431,7 +392,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_65536 = LightevalTaskConfig( name="ruler_65536:qa_1", suite=["lighteval"], @@ -448,7 +408,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_65536 = LightevalTaskConfig( name="ruler_65536:qa_2", suite=["lighteval"], @@ -465,7 +424,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_1", suite=["lighteval"], @@ -482,7 +440,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_3", suite=["lighteval"], @@ -499,7 +456,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_2", suite=["lighteval"], @@ -516,7 +472,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_32768 = LightevalTaskConfig( 
name="ruler_32768:niah_multikey_1", suite=["lighteval"], @@ -533,7 +488,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_32768 = LightevalTaskConfig( name="ruler_32768:niah_multikey_2", suite=["lighteval"], @@ -550,7 +504,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_32768 = LightevalTaskConfig( name="ruler_32768:niah_multiquery", suite=["lighteval"], @@ -567,7 +520,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_32768 = LightevalTaskConfig( name="ruler_32768:niah_multikey_3", suite=["lighteval"], @@ -584,7 +536,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_32768 = LightevalTaskConfig( name="ruler_32768:niah_multivalue", suite=["lighteval"], @@ -601,7 +552,6 @@ trust_dataset=False, version=0, ) - ruler_vt_32768 = LightevalTaskConfig( name="ruler_32768:vt", suite=["lighteval"], @@ -618,7 +568,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_32768 = LightevalTaskConfig( name="ruler_32768:cwe", suite=["lighteval"], @@ -635,7 +584,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_32768 = LightevalTaskConfig( name="ruler_32768:fwe", suite=["lighteval"], @@ -652,7 +600,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_32768 = LightevalTaskConfig( name="ruler_32768:qa_1", suite=["lighteval"], @@ -669,7 +616,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_32768 = LightevalTaskConfig( name="ruler_32768:qa_2", suite=["lighteval"], @@ -686,7 +632,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_1", suite=["lighteval"], @@ -703,7 +648,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_3", suite=["lighteval"], @@ -720,7 +664,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_2", suite=["lighteval"], @@ -737,7 +680,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_1", suite=["lighteval"], @@ -754,7 +696,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_2", suite=["lighteval"], @@ -771,7 +712,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_16384 = LightevalTaskConfig( name="ruler_16384:niah_multiquery", suite=["lighteval"], @@ -788,7 +728,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_3", suite=["lighteval"], @@ -805,7 +744,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_16384 = LightevalTaskConfig( name="ruler_16384:niah_multivalue", suite=["lighteval"], @@ -822,7 +760,6 @@ trust_dataset=False, version=0, ) - ruler_vt_16384 = LightevalTaskConfig( name="ruler_16384:vt", suite=["lighteval"], @@ -839,7 +776,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_16384 = LightevalTaskConfig( name="ruler_16384:cwe", suite=["lighteval"], @@ -856,7 +792,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_16384 = LightevalTaskConfig( name="ruler_16384:fwe", suite=["lighteval"], @@ -873,7 +808,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_16384 = LightevalTaskConfig( name="ruler_16384:qa_1", suite=["lighteval"], @@ -890,7 +824,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_16384 = LightevalTaskConfig( name="ruler_16384:qa_2", suite=["lighteval"], @@ -907,7 +840,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_1", 
suite=["lighteval"], @@ -924,7 +856,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_3", suite=["lighteval"], @@ -941,7 +872,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_2", suite=["lighteval"], @@ -958,7 +888,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_1", suite=["lighteval"], @@ -975,7 +904,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_2", suite=["lighteval"], @@ -992,7 +920,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_8192 = LightevalTaskConfig( name="ruler_8192:niah_multiquery", suite=["lighteval"], @@ -1009,7 +936,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_3", suite=["lighteval"], @@ -1026,7 +952,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_8192 = LightevalTaskConfig( name="ruler_8192:niah_multivalue", suite=["lighteval"], @@ -1043,7 +968,6 @@ trust_dataset=False, version=0, ) - ruler_vt_8192 = LightevalTaskConfig( name="ruler_8192:vt", suite=["lighteval"], @@ -1060,7 +984,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_8192 = LightevalTaskConfig( name="ruler_8192:cwe", suite=["lighteval"], @@ -1077,7 +1000,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_8192 = LightevalTaskConfig( name="ruler_8192:fwe", suite=["lighteval"], @@ -1094,7 +1016,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_8192 = LightevalTaskConfig( name="ruler_8192:qa_1", suite=["lighteval"], @@ -1111,7 +1032,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_8192 = LightevalTaskConfig( name="ruler_8192:qa_2", suite=["lighteval"], @@ -1128,7 +1048,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_1", suite=["lighteval"], @@ -1145,7 +1064,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_3", suite=["lighteval"], @@ -1162,7 +1080,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_2", suite=["lighteval"], @@ -1179,7 +1096,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_1", suite=["lighteval"], @@ -1196,7 +1112,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_2", suite=["lighteval"], @@ -1213,7 +1128,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_4096 = LightevalTaskConfig( name="ruler_4096:niah_multiquery", suite=["lighteval"], @@ -1230,7 +1144,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_3", suite=["lighteval"], @@ -1247,7 +1160,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_4096 = LightevalTaskConfig( name="ruler_4096:niah_multivalue", suite=["lighteval"], @@ -1264,7 +1176,6 @@ trust_dataset=False, version=0, ) - ruler_vt_4096 = LightevalTaskConfig( name="ruler_4096:vt", suite=["lighteval"], @@ -1281,7 +1192,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_4096 = LightevalTaskConfig( name="ruler_4096:cwe", suite=["lighteval"], @@ -1298,7 +1208,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_4096 = LightevalTaskConfig( name="ruler_4096:fwe", 
suite=["lighteval"], @@ -1315,7 +1224,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_4096 = LightevalTaskConfig( name="ruler_4096:qa_1", suite=["lighteval"], @@ -1332,7 +1240,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_4096 = LightevalTaskConfig( name="ruler_4096:qa_2", suite=["lighteval"], @@ -1349,8 +1256,6 @@ trust_dataset=False, version=0, ) - - agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], From 9cafd755dd48a4d059977c026087721ce1698e28 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 15 May 2025 11:44:37 +0000 Subject: [PATCH 3/9] adds RULE --- src/lighteval/tasks/default_tasks.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 4cb1fa416..d488b6e53 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -1256,6 +1256,22 @@ trust_dataset=False, version=0, ) +abstract_narrative_understanding_bigbench = LightevalTaskConfig( + name="abstract_narrative_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function=prompt.bigbench, + hf_repo="bigbench", + hf_subset="abstract_narrative_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + trust_dataset=True, + version=0, +) agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], From ed3d9076f343c733fc9d54837b42d9592185bf75 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 19 May 2025 08:59:44 +0000 Subject: [PATCH 4/9] use llama 3.2 no chat template --- src/lighteval/tasks/default_tasks.py | 154 +++++++++++++-------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index d488b6e53..abd45ee1f 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -28,7 +28,7 @@ name="ruler_131072:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -44,7 +44,7 @@ name="ruler_131072:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -60,7 +60,7 @@ name="ruler_131072:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -76,7 +76,7 @@ name="ruler_131072:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -92,7 +92,7 @@ name="ruler_131072:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -108,7 +108,7 @@ name="ruler_131072:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -124,7 +124,7 @@ name="ruler_131072:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -140,7 +140,7 @@ name="ruler_131072:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -156,7 +156,7 @@ name="ruler_131072:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -172,7 +172,7 @@ name="ruler_131072:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -188,7 +188,7 @@ name="ruler_131072:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -204,7 +204,7 @@ name="ruler_131072:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -220,7 +220,7 @@ name="ruler_131072:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -236,7 +236,7 @@ name="ruler_65536:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -252,7 +252,7 @@ name="ruler_65536:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -268,7 +268,7 @@ name="ruler_65536:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -284,7 +284,7 @@ name="ruler_65536:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -300,7 +300,7 @@ name="ruler_65536:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -316,7 +316,7 @@ name="ruler_65536:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -332,7 +332,7 @@ name="ruler_65536:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -348,7 +348,7 @@ name="ruler_65536:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -364,7 +364,7 @@ name="ruler_65536:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -380,7 +380,7 @@ name="ruler_65536:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -396,7 +396,7 @@ name="ruler_65536:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -412,7 +412,7 @@ name="ruler_65536:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -428,7 +428,7 @@ name="ruler_32768:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -444,7 +444,7 @@ name="ruler_32768:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + 
hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -460,7 +460,7 @@ name="ruler_32768:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -476,7 +476,7 @@ name="ruler_32768:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -492,7 +492,7 @@ name="ruler_32768:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -508,7 +508,7 @@ name="ruler_32768:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -524,7 +524,7 @@ name="ruler_32768:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -540,7 +540,7 @@ name="ruler_32768:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -556,7 +556,7 @@ name="ruler_32768:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -572,7 +572,7 @@ name="ruler_32768:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -588,7 +588,7 @@ name="ruler_32768:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -604,7 +604,7 @@ name="ruler_32768:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -620,7 +620,7 @@ name="ruler_32768:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", 
hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -636,7 +636,7 @@ name="ruler_16384:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -652,7 +652,7 @@ name="ruler_16384:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -668,7 +668,7 @@ name="ruler_16384:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -684,7 +684,7 @@ name="ruler_16384:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -700,7 +700,7 @@ name="ruler_16384:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -716,7 +716,7 @@ name="ruler_16384:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -732,7 +732,7 @@ name="ruler_16384:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -748,7 +748,7 @@ name="ruler_16384:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -764,7 +764,7 @@ name="ruler_16384:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -780,7 +780,7 @@ name="ruler_16384:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -796,7 +796,7 @@ name="ruler_16384:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", 
hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -812,7 +812,7 @@ name="ruler_16384:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -828,7 +828,7 @@ name="ruler_16384:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -844,7 +844,7 @@ name="ruler_8192:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -860,7 +860,7 @@ name="ruler_8192:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -876,7 +876,7 @@ name="ruler_8192:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -892,7 +892,7 @@ name="ruler_8192:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -908,7 +908,7 @@ name="ruler_8192:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -924,7 +924,7 @@ name="ruler_8192:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -940,7 +940,7 @@ name="ruler_8192:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -956,7 +956,7 @@ name="ruler_8192:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -972,7 +972,7 @@ name="ruler_8192:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], 
@@ -988,7 +988,7 @@ name="ruler_8192:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -1004,7 +1004,7 @@ name="ruler_8192:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -1020,7 +1020,7 @@ name="ruler_8192:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -1036,7 +1036,7 @@ name="ruler_8192:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -1052,7 +1052,7 @@ name="ruler_4096:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -1068,7 +1068,7 @@ name="ruler_4096:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -1084,7 +1084,7 @@ name="ruler_4096:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -1100,7 +1100,7 @@ name="ruler_4096:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -1116,7 +1116,7 @@ name="ruler_4096:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -1132,7 +1132,7 @@ name="ruler_4096:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -1148,7 +1148,7 @@ name="ruler_4096:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -1164,7 +1164,7 @@ name="ruler_4096:niah_multivalue", 
suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -1180,7 +1180,7 @@ name="ruler_4096:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -1196,7 +1196,7 @@ name="ruler_4096:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -1212,7 +1212,7 @@ name="ruler_4096:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -1228,7 +1228,7 @@ name="ruler_4096:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -1244,7 +1244,7 @@ name="ruler_4096:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], From 248bb678519f0046ec3096b2cac7ca56d00c179d Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 21 May 2025 16:53:03 +0200 Subject: [PATCH 5/9] Update src/lighteval/tasks/default_prompts.py --- src/lighteval/tasks/default_prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 72671a607..cc842628e 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -265,7 +265,7 @@ def arc_with_options(line, task_name: str = None): query += "".join([f"\n{key}. {choice}" for key, choice in zip(LETTER_INDICES, line["choices"]["text"])]) query += "\nAnswer:" return Doc( -mm task_name=task_name, + task_name=task_name, query=query, choices=line["choices"]["text"], gold_index=line["choices"]["label"].index(line["answerKey"]), From a1aee68187a3d280d3e821bad12c855abc7760f7 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 21 May 2025 16:53:38 +0200 Subject: [PATCH 6/9] Update src/lighteval/models/vllm/vllm_model.py --- src/lighteval/models/vllm/vllm_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 1e4dc92dc..57d424c0c 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -268,7 +268,6 @@ def greedy_until( logger.warning( f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens." 
) - breakpoint() context_size = self.max_length - max_new_tokens if context_size < 0: logger.critical( From 461b8cbfca4797a09084706adf36a00153b56fb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Celiebak=E2=80=9D?= Date: Mon, 2 Jun 2025 13:09:29 +0000 Subject: [PATCH 7/9] fix typo --- src/lighteval/tasks/default_tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index cd2961b28..e933c9826 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -16139,6 +16139,7 @@ metric=[Metrics.ruler_match_any], stop_sequence=None, trust_dataset=False, +) ruin_names_bigbench = LightevalTaskConfig( name="ruin_names", suite=["bigbench", "bigbench_json"], From 57f292153d527df81d1bd5e6831f5dfe0fa63279 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 18 Jun 2025 10:52:04 +0000 Subject: [PATCH 8/9] put ruler in extended tasks --- src/lighteval/tasks/default_tasks.py | 1231 -------------------- src/lighteval/tasks/extended/__init__.py | 3 +- src/lighteval/tasks/extended/ruler/main.py | 69 ++ 3 files changed, 71 insertions(+), 1232 deletions(-) create mode 100644 src/lighteval/tasks/extended/ruler/main.py diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index e933c9826..b77b27d52 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -14909,1237 +14909,6 @@ trust_dataset=True, version=0, ) -ruler_niah_single_1_131072 = LightevalTaskConfig( - name="ruler_131072:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_131072 = LightevalTaskConfig( - name="ruler_131072:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_131072 = LightevalTaskConfig( - name="ruler_131072:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_1_131072 = LightevalTaskConfig( - name="ruler_131072:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_131072 = LightevalTaskConfig( - name="ruler_131072:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, -
hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_131072 = LightevalTaskConfig( - name="ruler_131072:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_131072 = LightevalTaskConfig( - name="ruler_131072:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_131072 = LightevalTaskConfig( - name="ruler_131072:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_131072 = LightevalTaskConfig( - name="ruler_131072:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_131072 = LightevalTaskConfig( - name="ruler_131072:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_131072 = LightevalTaskConfig( - name="ruler_131072:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_1_131072 = LightevalTaskConfig( - name="ruler_131072:qa_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_1"], - evaluation_splits=["qa_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_2_131072 = LightevalTaskConfig( - name="ruler_131072:qa_2", - 
suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_2"], - evaluation_splits=["qa_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_65536 = LightevalTaskConfig( - name="ruler_65536:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_65536 = LightevalTaskConfig( - name="ruler_65536:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_1_65536 = LightevalTaskConfig( - name="ruler_65536:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_65536 = LightevalTaskConfig( - name="ruler_65536:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_65536 = LightevalTaskConfig( - name="ruler_65536:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_65536 = LightevalTaskConfig( - name="ruler_65536:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_65536 = LightevalTaskConfig( - name="ruler_65536:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - 
generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_65536 = LightevalTaskConfig( - name="ruler_65536:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_65536 = LightevalTaskConfig( - name="ruler_65536:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_65536 = LightevalTaskConfig( - name="ruler_65536:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_1_65536 = LightevalTaskConfig( - name="ruler_65536:qa_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_1"], - evaluation_splits=["qa_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_2_65536 = LightevalTaskConfig( - name="ruler_65536:qa_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_2"], - evaluation_splits=["qa_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_1_32768 = LightevalTaskConfig( - name="ruler_32768:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_32768 = LightevalTaskConfig( - name="ruler_32768:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_32768 = LightevalTaskConfig( - name="ruler_32768:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - 
metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_1_32768 = LightevalTaskConfig( - name="ruler_32768:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_32768 = LightevalTaskConfig( - name="ruler_32768:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_32768 = LightevalTaskConfig( - name="ruler_32768:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_32768 = LightevalTaskConfig( - name="ruler_32768:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_32768 = LightevalTaskConfig( - name="ruler_32768:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_32768 = LightevalTaskConfig( - name="ruler_32768:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_32768 = LightevalTaskConfig( - name="ruler_32768:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_32768 = LightevalTaskConfig( - name="ruler_32768:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - 
evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_1_32768 = LightevalTaskConfig( - name="ruler_32768:qa_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_1"], - evaluation_splits=["qa_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_2_32768 = LightevalTaskConfig( - name="ruler_32768:qa_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_2"], - evaluation_splits=["qa_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_1_16384 = LightevalTaskConfig( - name="ruler_16384:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_16384 = LightevalTaskConfig( - name="ruler_16384:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_16384 = LightevalTaskConfig( - name="ruler_16384:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_1_16384 = LightevalTaskConfig( - name="ruler_16384:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_16384 = LightevalTaskConfig( - name="ruler_16384:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_16384 = LightevalTaskConfig( - name="ruler_16384:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_16384 = LightevalTaskConfig( - name="ruler_16384:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_16384 = LightevalTaskConfig( - name="ruler_16384:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_16384 = LightevalTaskConfig( - name="ruler_16384:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_16384 = LightevalTaskConfig( - name="ruler_16384:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_16384 = LightevalTaskConfig( - name="ruler_16384:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_1_16384 = LightevalTaskConfig( - name="ruler_16384:qa_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_1"], - evaluation_splits=["qa_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_2_16384 = LightevalTaskConfig( - name="ruler_16384:qa_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_2"], - evaluation_splits=["qa_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_1_8192 = LightevalTaskConfig( - name="ruler_8192:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_8192 = LightevalTaskConfig( - name="ruler_8192:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_8192 = LightevalTaskConfig( - name="ruler_8192:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_1_8192 = LightevalTaskConfig( - name="ruler_8192:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_8192 = LightevalTaskConfig( - name="ruler_8192:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_8192 = LightevalTaskConfig( - name="ruler_8192:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_8192 = LightevalTaskConfig( - name="ruler_8192:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_8192 = LightevalTaskConfig( - name="ruler_8192:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - 
stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_8192 = LightevalTaskConfig( - name="ruler_8192:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_8192 = LightevalTaskConfig( - name="ruler_8192:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_8192 = LightevalTaskConfig( - name="ruler_8192:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_1_8192 = LightevalTaskConfig( - name="ruler_8192:qa_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_1"], - evaluation_splits=["qa_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_qa_2_8192 = LightevalTaskConfig( - name="ruler_8192:qa_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["qa_2"], - evaluation_splits=["qa_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=32, - metric=[Metrics.ruler_match_any], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_1_4096 = LightevalTaskConfig( - name="ruler_4096:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_3_4096 = LightevalTaskConfig( - name="ruler_4096:niah_single_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_3"], - evaluation_splits=["niah_single_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_single_2_4096 = LightevalTaskConfig( - name="ruler_4096:niah_single_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_single_2"], - evaluation_splits=["niah_single_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - 
version=0, -) -ruler_niah_multikey_1_4096 = LightevalTaskConfig( - name="ruler_4096:niah_multikey_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_1"], - evaluation_splits=["niah_multikey_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_2_4096 = LightevalTaskConfig( - name="ruler_4096:niah_multikey_2", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_2"], - evaluation_splits=["niah_multikey_2"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multiquery_4096 = LightevalTaskConfig( - name="ruler_4096:niah_multiquery", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multiquery"], - evaluation_splits=["niah_multiquery"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multikey_3_4096 = LightevalTaskConfig( - name="ruler_4096:niah_multikey_3", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multikey_3"], - evaluation_splits=["niah_multikey_3"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_niah_multivalue_4096 = LightevalTaskConfig( - name="ruler_4096:niah_multivalue", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["niah_multivalue"], - evaluation_splits=["niah_multivalue"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_vt_4096 = LightevalTaskConfig( - name="ruler_4096:vt", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["vt"], - evaluation_splits=["vt"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_cwe_4096 = LightevalTaskConfig( - name="ruler_4096:cwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["cwe"], - evaluation_splits=["cwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=120, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) -ruler_fwe_4096 = LightevalTaskConfig( - name="ruler_4096:fwe", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", - hf_subset="default", - hf_avail_splits=["fwe"], - evaluation_splits=["fwe"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - 
metric=[Metrics.ruler_match_all],
-    stop_sequence=None,
-    trust_dataset=False,
-    version=0,
-)
-ruler_qa_1_4096 = LightevalTaskConfig(
-    name="ruler_4096:qa_1",
-    suite=["lighteval"],
-    prompt_function=prompt.ruler,
-    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
-    hf_subset="default",
-    hf_avail_splits=["qa_1"],
-    evaluation_splits=["qa_1"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=32,
-    metric=[Metrics.ruler_match_any],
-    stop_sequence=None,
-    trust_dataset=False,
-    version=0,
-)
-ruler_qa_2_4096 = LightevalTaskConfig(
-    name="ruler_4096:qa_2",
-    suite=["lighteval"],
-    prompt_function=prompt.ruler,
-    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
-    hf_subset="default",
-    hf_avail_splits=["qa_2"],
-    evaluation_splits=["qa_2"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=32,
-    metric=[Metrics.ruler_match_any],
-    stop_sequence=None,
-    trust_dataset=False,
-    version=0,
-)
 ruin_names_bigbench = LightevalTaskConfig(
     name="ruin_names",
     suite=["bigbench", "bigbench_json"],
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
index 39963eac1..a2cdb6e46 100644
--- a/src/lighteval/tasks/extended/__init__.py
+++ b/src/lighteval/tasks/extended/__init__.py
@@ -30,9 +30,10 @@
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
+    import lighteval.tasks.extended.ruler.main as ruler
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb, ruler]
 
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
diff --git a/src/lighteval/tasks/extended/ruler/main.py b/src/lighteval/tasks/extended/ruler/main.py
new file mode 100644
index 000000000..62f6cc6ab
--- /dev/null
+++ b/src/lighteval/tasks/extended/ruler/main.py
@@ -0,0 +1,69 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+subsets = [  # RULER subtasks: needle-in-a-haystack (niah_*), variable tracking (vt), common/frequent word extraction (cwe/fwe), QA
+    "niah_single_1",
+    "niah_single_2",
+    "niah_single_3",
+    "niah_multikey_1",
+    "niah_multikey_2",
+    "niah_multikey_3",
+    "niah_multiquery",
+    "niah_multivalue",
+    "vt",
+    "cwe",
+    "fwe",
+    "qa_1",
+    "qa_2",
+]
+
+lengths = [131072, 65536, 32768, 16384, 8192, 4096]  # context lengths (in tokens) covered by the RULER datasets
+
+task_configs = []
+
+for subset in subsets:
+    for length in lengths:
+        task_configs.append(
+            LightevalTaskConfig(
+                name=f"ruler_{length}:{subset}",
+                suite=["lighteval"],
+                prompt_function=prompt.ruler,
+                hf_repo=f"SaylorTwift/RULER-{length}-llama-3.2-tokenizer",
+                hf_subset="default",
+                hf_avail_splits=[subset],
+                evaluation_splits=[subset],
+                few_shots_split=None,
+                few_shots_select=None,
+                generation_size=128 if "niah" in subset else 30 if subset == "vt" else 120 if subset == "cwe" else 32 if subset in ["qa_1", "qa_2"] else 50,  # niah 128, vt 30, cwe 120, qa 32, fwe 50
+                metric=[Metrics.ruler_match_any] if subset in ["qa_1", "qa_2"] else [Metrics.ruler_match_all],
+                stop_sequence=None,
+                trust_dataset=False,
+                version=0,
+            )
+        )
+
+TASKS_TABLE = task_configs

From 79e6a6e09d78e3b31722bff4e397e8e19f08f402 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Mon, 23 Jun 2025 14:40:57 +0000
Subject: [PATCH 9/9] added params for Nouamane

---
 src/lighteval/models/vllm/vllm_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 32301aa55..41f31d912 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -105,6 +105,8 @@ class VLLMModelConfig(ModelConfig):
     max_num_batched_tokens: PositiveInt = 2048  # maximum number of tokens per batch
     subfolder: str | None = None
     is_async: bool = False  # Whether to use the async version or sync version of the model
+    use_dual_chunk_attention: bool = False
+    enforce_eager: bool = False
 
 
 class VLLMModel(LightevalModel):
@@ -187,6 +189,8 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
             "seed": int(config.seed),
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
+            "enforce_eager": config.enforce_eager,
+            "use_dual_chunk_attention": config.use_dual_chunk_attention,
         }
 
         if config.quantization is not None:
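
The extended module above generates the full RULER matrix programmatically, so a quick way to sanity-check it is to import it and inspect the resulting table. The sketch below is editorial and not part of the patch; it only assumes lighteval is installed with the ruler module applied (the module path and the TASKS_TABLE attribute are taken directly from the diff).

    import lighteval.tasks.extended.ruler.main as ruler

    # 13 subsets x 6 context lengths = 78 generated task configs
    print(len(ruler.TASKS_TABLE))

    # Names follow the f"ruler_{length}:{subset}" pattern, e.g. "ruler_4096:vt"
    for cfg in ruler.TASKS_TABLE[:3]:
        print(cfg.name, cfg.generation_size)

Each name can then be selected through the usual suite-prefixed task string (the configs keep suite=["lighteval"], so something like lighteval|ruler_4096:niah_single_1|0|0); the exact selector syntax depends on the lighteval version in use.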
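
The two vLLM flags introduced in PATCH 9/9 are forwarded verbatim to the vllm.LLM engine arguments: enforce_eager disables CUDA-graph capture, and use_dual_chunk_attention therefore requires a vLLM build that accepts that keyword. The following is a hypothetical configuration sketch; only the two new flags come from this patch, while model_name and max_num_seqs are assumed to already exist on VLLMModelConfig as in the rest of lighteval and may differ by version.

    from lighteval.models.vllm.vllm_model import VLLMModelConfig

    config = VLLMModelConfig(
        model_name="meta-llama/Llama-3.1-8B-Instruct",  # assumed field name, not part of this patch
        max_num_seqs=1,
        enforce_eager=True,  # skip CUDA-graph capture when building the vLLM engine
        use_dual_chunk_attention=False,  # passed straight through to vllm.LLM(...)
    )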