diff --git a/examples/tasks/instruct_multilingual.txt b/examples/tasks/instruct_multilingual.txt new file mode 100644 index 000000000..f5bdcb4c2 --- /dev/null +++ b/examples/tasks/instruct_multilingual.txt @@ -0,0 +1,180 @@ +extended|belebele_native_instruct_deu_Latn|0|0 +extended|belebele_native_instruct_fra_Latn|0|0 +extended|belebele_native_instruct_ita_Latn|0|0 +extended|belebele_native_instruct_por_Latn|0|0 +extended|belebele_native_instruct_spa_Latn|0|0 +extended|belebele_en_instruct_acm_Arab|0|0 +extended|belebele_en_instruct_arz_Arab|0|0 +extended|belebele_en_instruct_ceb_Latn|0|0 +extended|belebele_en_instruct_fin_Latn|0|0 +extended|belebele_en_instruct_hin_Deva|0|0 +extended|belebele_en_instruct_ita_Latn|0|0 +extended|belebele_en_instruct_khm_Khmr|0|0 +extended|belebele_en_instruct_lvs_Latn|0|0 +extended|belebele_en_instruct_npi_Deva|0|0 +extended|belebele_en_instruct_pol_Latn|0|0 +extended|belebele_en_instruct_slv_Latn|0|0 +extended|belebele_en_instruct_swe_Latn|0|0 +extended|belebele_en_instruct_afr_Latn|0|0 +extended|belebele_en_instruct_asm_Beng|0|0 +extended|belebele_en_instruct_ces_Latn|0|0 +extended|belebele_en_instruct_fra_Latn|0|0 +extended|belebele_en_instruct_hin_Latn|0|0 +extended|belebele_en_instruct_jav_Latn|0|0 +extended|belebele_en_instruct_mal_Mlym|0|0 +extended|belebele_en_instruct_npi_Latn|0|0 +extended|belebele_en_instruct_por_Latn|0|0 +extended|belebele_en_instruct_swh_Latn|0|0 +extended|belebele_en_instruct_tur_Latn|0|0 +extended|belebele_en_instruct_yor_Latn|0|0 +extended|belebele_en_instruct_als_Latn|0|0 +extended|belebele_en_instruct_azj_Latn|0|0 +extended|belebele_en_instruct_ckb_Arab|0|0 +extended|belebele_en_instruct_hrv_Latn|0|0 +extended|belebele_en_instruct_jpn_Jpan|0|0 +extended|belebele_en_instruct_kir_Cyrl|0|0 +extended|belebele_en_instruct_mar_Deva|0|0 +extended|belebele_en_instruct_snd_Arab|0|0 +extended|belebele_en_instruct_tam_Taml|0|0 +extended|belebele_en_instruct_ukr_Cyrl|0|0 
+extended|belebele_en_instruct_zho_Hans|0|0 +extended|belebele_en_instruct_amh_Ethi|0|0 +extended|belebele_en_instruct_dan_Latn|0|0 +extended|belebele_en_instruct_hun_Latn|0|0 +extended|belebele_en_instruct_kor_Hang|0|0 +extended|belebele_en_instruct_mkd_Cyrl|0|0 +extended|belebele_en_instruct_ron_Latn|0|0 +extended|belebele_en_instruct_som_Latn|0|0 +extended|belebele_en_instruct_tel_Telu|0|0 +extended|belebele_en_instruct_urd_Arab|0|0 +extended|belebele_en_instruct_zho_Hant|0|0 +extended|belebele_en_instruct_apc_Arab|0|0 +extended|belebele_en_instruct_ben_Beng|0|0 +extended|belebele_en_instruct_deu_Latn|0|0 +extended|belebele_en_instruct_hye_Armn|0|0 +extended|belebele_en_instruct_kan_Knda|0|0 +extended|belebele_en_instruct_lao_Laoo|0|0 +extended|belebele_en_instruct_mlt_Latn|0|0 +extended|belebele_en_instruct_ory_Orya|0|0 +extended|belebele_en_instruct_rus_Cyrl|0|0 +extended|belebele_en_instruct_tgk_Cyrl|0|0 +extended|belebele_en_instruct_urd_Latn|0|0 +extended|belebele_en_instruct_zsm_Latn|0|0 +extended|belebele_en_instruct_arb_Arab|0|0 +extended|belebele_en_instruct_ben_Latn|0|0 +extended|belebele_en_instruct_ell_Grek|0|0 +extended|belebele_en_instruct_guj_Gujr|0|0 +extended|belebele_en_instruct_kat_Geor|0|0 +extended|belebele_en_instruct_pan_Guru|0|0 +extended|belebele_en_instruct_spa_Latn|0|0 +extended|belebele_en_instruct_tgl_Latn|0|0 +extended|belebele_en_instruct_uzn_Latn|0|0 +extended|belebele_en_instruct_arb_Latn|0|0 +extended|belebele_en_instruct_eng_Latn|0|0 +extended|belebele_en_instruct_kaz_Cyrl|0|0 +extended|belebele_en_instruct_lit_Latn|0|0 +extended|belebele_en_instruct_mya_Mymr|0|0 +extended|belebele_en_instruct_pbt_Arab|0|0 +extended|belebele_en_instruct_sin_Latn|0|0 +extended|belebele_en_instruct_srp_Cyrl|0|0 +extended|belebele_en_instruct_tha_Thai|0|0 +extended|belebele_en_instruct_vie_Latn|0|0 +extended|belebele_en_instruct_ars_Arab|0|0 +extended|belebele_en_instruct_bul_Cyrl|0|0 +extended|belebele_en_instruct_est_Latn|0|0 
+extended|belebele_en_instruct_ind_Latn|0|0 +extended|belebele_en_instruct_nld_Latn|0|0 +extended|belebele_en_instruct_pes_Arab|0|0 +extended|belebele_en_instruct_sin_Sinh|0|0 +extended|belebele_en_instruct_war_Latn|0|0 +extended|belebele_en_instruct_ary_Arab|0|0 +extended|belebele_en_instruct_cat_Latn|0|0 +extended|belebele_en_instruct_eus_Latn|0|0 +extended|belebele_en_instruct_heb_Hebr|0|0 +extended|belebele_en_instruct_isl_Latn|0|0 +extended|belebele_en_instruct_nob_Latn|0|0 +extended|belebele_en_instruct_plt_Latn|0|0 +extended|belebele_en_instruct_slk_Latn|0|0 +extended|global_mmlu_instruct_amh|0|0 +extended|global_mmlu_instruct_ara|0|0 +extended|global_mmlu_instruct_ben|0|0 +extended|global_mmlu_instruct_ces|0|0 +extended|global_mmlu_instruct_deu|0|0 +extended|global_mmlu_instruct_ell|0|0 +extended|global_mmlu_instruct_eng|0|0 +extended|global_mmlu_instruct_spa|0|0 +extended|global_mmlu_instruct_fas|0|0 +extended|global_mmlu_instruct_fra|0|0 +extended|global_mmlu_instruct_hau|0|0 +extended|global_mmlu_instruct_heb|0|0 +extended|global_mmlu_instruct_hin|0|0 +extended|global_mmlu_instruct_ind|0|0 +extended|global_mmlu_instruct_ibo|0|0 +extended|global_mmlu_instruct_ita|0|0 +extended|global_mmlu_instruct_jpn|0|0 +extended|global_mmlu_instruct_kor|0|0 +extended|global_mmlu_instruct_kir|0|0 +extended|global_mmlu_instruct_lit|0|0 +extended|global_mmlu_instruct_mlg|0|0 +extended|global_mmlu_instruct_msa|0|0 +extended|global_mmlu_instruct_nep|0|0 +extended|global_mmlu_instruct_nld|0|0 +extended|global_mmlu_instruct_nor|0|0 +extended|global_mmlu_instruct_pol|0|0 +extended|global_mmlu_instruct_por|0|0 +extended|global_mmlu_instruct_ron|0|0 +extended|global_mmlu_instruct_rus|0|0 +extended|global_mmlu_instruct_sin|0|0 +extended|global_mmlu_instruct_sna|0|0 +extended|global_mmlu_instruct_som|0|0 +extended|global_mmlu_instruct_srp|0|0 +extended|global_mmlu_instruct_swe|0|0 +extended|global_mmlu_instruct_swa|0|0 +extended|global_mmlu_instruct_tel|0|0 
+extended|global_mmlu_instruct_tur|0|0 +extended|global_mmlu_instruct_ukr|0|0 +extended|global_mmlu_instruct_vie|0|0 +extended|global_mmlu_instruct_yor|0|0 +extended|global_mmlu_instruct_zho|0|0 +extended|global_mmlu_lite_instruct_amh|0|0 +extended|global_mmlu_lite_instruct_ara|0|0 +extended|global_mmlu_lite_instruct_ben|0|0 +extended|global_mmlu_lite_instruct_ces|0|0 +extended|global_mmlu_lite_instruct_deu|0|0 +extended|global_mmlu_lite_instruct_ell|0|0 +extended|global_mmlu_lite_instruct_eng|0|0 +extended|global_mmlu_lite_instruct_spa|0|0 +extended|global_mmlu_lite_instruct_fas|0|0 +extended|global_mmlu_lite_instruct_fra|0|0 +extended|global_mmlu_lite_instruct_hau|0|0 +extended|global_mmlu_lite_instruct_heb|0|0 +extended|global_mmlu_lite_instruct_hin|0|0 +extended|global_mmlu_lite_instruct_ind|0|0 +extended|global_mmlu_lite_instruct_ibo|0|0 +extended|global_mmlu_lite_instruct_ita|0|0 +extended|global_mmlu_lite_instruct_jpn|0|0 +extended|global_mmlu_lite_instruct_kor|0|0 +extended|global_mmlu_lite_instruct_kir|0|0 +extended|global_mmlu_lite_instruct_lit|0|0 +extended|global_mmlu_lite_instruct_mlg|0|0 +extended|global_mmlu_lite_instruct_msa|0|0 +extended|global_mmlu_lite_instruct_nep|0|0 +extended|global_mmlu_lite_instruct_nld|0|0 +extended|global_mmlu_lite_instruct_nor|0|0 +extended|global_mmlu_lite_instruct_pol|0|0 +extended|global_mmlu_lite_instruct_por|0|0 +extended|global_mmlu_lite_instruct_ron|0|0 +extended|global_mmlu_lite_instruct_rus|0|0 +extended|global_mmlu_lite_instruct_sin|0|0 +extended|global_mmlu_lite_instruct_sna|0|0 +extended|global_mmlu_lite_instruct_som|0|0 +extended|global_mmlu_lite_instruct_srp|0|0 +extended|global_mmlu_lite_instruct_swe|0|0 +extended|global_mmlu_lite_instruct_swa|0|0 +extended|global_mmlu_lite_instruct_tel|0|0 +extended|global_mmlu_lite_instruct_tur|0|0 +extended|global_mmlu_lite_instruct_ukr|0|0 +extended|global_mmlu_lite_instruct_vie|0|0 +extended|global_mmlu_lite_instruct_yor|0|0 +extended|global_mmlu_lite_instruct_zho|0|0 
+extended|mmlu_pro|0|0 diff --git a/examples/tasks/instruct_multilingual_test.txt b/examples/tasks/instruct_multilingual_test.txt new file mode 100644 index 000000000..fcfb99a9e --- /dev/null +++ b/examples/tasks/instruct_multilingual_test.txt @@ -0,0 +1,6 @@ +extended|global_mmlu_lite_instruct_deu|0|0 +extended|global_mmlu_lite_instruct_eng|0|0 +extended|global_mmlu_lite_instruct_spa|0|0 +extended|global_mmlu_lite_instruct_fra|0|0 +extended|global_mmlu_lite_instruct_ita|0|0 +extended|global_mmlu_lite_instruct_por|0|0 diff --git a/src/lighteval/metrics/utils/extractive_match_utils.py b/src/lighteval/metrics/utils/extractive_match_utils.py index b8c529d05..13a7e6281 100644 --- a/src/lighteval/metrics/utils/extractive_match_utils.py +++ b/src/lighteval/metrics/utils/extractive_match_utils.py @@ -85,11 +85,12 @@ class IndicesExtractionConfig: Attributes: prefix_for_extraction (ChoicePrefix): The style to use for extracting choice indices (e.g. A,B,C or 1,2,3) - try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is" + try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is". + Recommended False for indices extraction, as some indices (for example `A` which is also a word) can lead to a lot of false positives. 
""" prefix_for_extraction: ChoicePrefix - try_extract_without_anchor: bool = True + try_extract_without_anchor: bool = False ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index d1bdf7328..256d0219e 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -105,6 +105,7 @@ class VLLMModelConfig(ModelConfig): max_num_batched_tokens: PositiveInt = 2048 # maximum number of tokens per batch subfolder: str | None = None is_async: bool = False # Whether to use the async version or sync version of the model + enforce_eager: bool = False class VLLMModel(LightevalModel): @@ -187,6 +188,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: "seed": int(config.seed), "max_num_seqs": int(config.max_num_seqs), "max_num_batched_tokens": int(config.max_num_batched_tokens), + "enforce_eager": bool(config.enforce_eager), } if config.quantization is not None: diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 786c4a0b1..910d0790b 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -899,15 +899,24 @@ def gpqa_instruct(line, task_name: str = None): gold_index = random.randint(0, 3) choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]] choices.insert(gold_index, line["Correct Answer"]) - query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}" - query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]) + instruction = "Answer the following multiple choice question. 
The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering." + query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}" + query = query_template.format( + # Stripping to avoid accidental extra whitespaces, present in GPQA + A=choices[0].strip(), + B=choices[1].strip(), + C=choices[2].strip(), + D=choices[3].strip(), + Question=line["Question"].strip(), + Instruction=instruction, + ) return Doc( task_name=task_name, query=query, choices=LETTER_INDICES[: len(choices)], gold_index=gold_index, - instruction=query, + instruction=instruction, ) diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py index 39963eac1..d9c9dc7c8 100644 --- a/src/lighteval/tasks/extended/__init__.py +++ b/src/lighteval/tasks/extended/__init__.py @@ -20,9 +20,15 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import lighteval.tasks.extended.instruct.belebele as belebele +import lighteval.tasks.extended.instruct.global_mmlu as global_mmlu +import lighteval.tasks.extended.instruct.mgsm as mgsm +import lighteval.tasks.extended.instruct.mmlu_pro as mmlu_pro from lighteval.utils.imports import can_load_extended_tasks +AVAILABLE_EXTENDED_TASKS_MODULES = [belebele, mmlu_pro, mgsm, global_mmlu] + if can_load_extended_tasks(): import lighteval.tasks.extended.hle.main as hle import lighteval.tasks.extended.ifeval.main as ifeval @@ -32,7 +38,4 @@ import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks - AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb] - -else: - AVAILABLE_EXTENDED_TASKS_MODULES = [] + AVAILABLE_EXTENDED_TASKS_MODULES.extend([ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]) diff --git a/src/lighteval/tasks/extended/instruct/belebele.py b/src/lighteval/tasks/extended/instruct/belebele.py new file mode 100644 index 000000000..a1ecbc741 --- /dev/null +++ b/src/lighteval/tasks/extended/instruct/belebele.py @@ -0,0 +1,293 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import numpy as np + +from lighteval.metrics.dynamic_metrics import ( + IndicesExtractionConfig, + multilingual_extractive_match_metric, +) +from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric +from lighteval.metrics.metrics_sample import ( + PassAtK, +) +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.tasks import LangCodeLanguage, iso_639_3_ind_to_iso_639_3_macro +from lighteval.tasks.requests import Doc +from lighteval.utils.language import Language + + +TASKS_TABLE = [] + +lang_to_literal = { + "deu": Language.GERMAN, + "fra": Language.FRENCH, + "ita": Language.ITALIAN, + "por": Language.PORTUGUESE, + "spa": Language.SPANISH, +} + + +def belebele_prompt_en_instruct(line, task_name: str = None): + line["dialect"] = "eng_Latn" + return belebele_prompt(line, task_name) + + +def belebele_prompt(line, task_name: str = None): + lang_to_template = { + "eng_Latn": "Given the following passage, query, and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. 
Think step by step before answering.\n\n###\nPassage:\n{Passage}\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "deu_Latn": "Gib basierend auf dem folgenden Textabschnitt, der Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nTextabschnitt:\n{Passage}\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "fra_Latn": "A partir du passage suivant, de la question et des choix de réponses, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: '$LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nPassage:\n{Passage}\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "ita_Latn": "Dato il seguente passaggio, un quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nPassaggio:\n{Passage}\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "por_Latn": "Tendo em conta a seguinte passagem, pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. 
Pense passo a passo antes de responder.\n\n###\nPassagem:\n{Passage}\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "spa_Latn": "Dado el siguiente contexto, pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. Piense paso a paso antes de responder.\n\n###\nContexto:\n{Passage}\n###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + } + + gold_index = int(line["correct_answer_num"]) - 1 + choices = [line["mc_answer1"], line["mc_answer2"], line["mc_answer3"], line["mc_answer4"]] + query_template = lang_to_template[line["dialect"]] + query = query_template.format( + A=choices[0], + B=choices[1], + C=choices[2], + D=choices[3], + Passage=line["flores_passage"], + Question=line["question"], + ) + instruction = query_template.split("###\n")[0] + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES[: len(choices)], + gold_index=gold_index, + instruction=instruction, + ) + + +BELEBELE_TASKS_NATIVE_INSTRUCT = [ + LightevalTaskConfig( + name=f"belebele_native_instruct_{lang}_Latn", + prompt_function=belebele_prompt, + suite=["extended"], + hf_repo="facebook/belebele", + hf_subset=f"{lang}_Latn", + evaluation_splits=["test"], + hf_avail_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metric=[ + SampleLevelMetric( + metric_name="pass@1:1_samples", + sample_level_fn=PassAtK( + k=1, + n=1, + sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric( + language=lang_to_literal[lang], + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ).sample_level_fn([ref], [pred], doc), + 
).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.REASONING, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + ], + stop_sequence=[], # no stop sequence, will use eos token + trust_dataset=True, + version=1, + ) + for lang in [ + "deu", + "fra", + "ita", + "por", + "spa", + ] +] + +BELEBELE_TASKS_EN_INSTRUCT = [ + LightevalTaskConfig( + name=f"belebele_en_instruct_{lang}", + prompt_function=belebele_prompt_en_instruct, + suite=["extended"], + hf_repo="facebook/belebele", + hf_subset=lang, + evaluation_splits=["test"], + hf_avail_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metric=[ + SampleLevelMetric( + metric_name="pass@1:1_samples", + sample_level_fn=PassAtK( + k=1, + n=1, + sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric( + language=iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(lang).to_alpha3()], + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ).sample_level_fn([ref], [pred], doc), + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.REASONING, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + ], + stop_sequence=[], # no stop sequence, will use eos token + trust_dataset=True, + version=1, + ) + for lang in [ + "acm_Arab", + "arz_Arab", + "ceb_Latn", + "fin_Latn", + "hin_Deva", + "ita_Latn", + "khm_Khmr", + "lvs_Latn", + "npi_Deva", + "pol_Latn", + "slv_Latn", + "swe_Latn", + # "tso_Latn", + # "xho_Latn", + "afr_Latn", + "asm_Beng", + "ces_Latn", + "fra_Latn", + "hin_Latn", + "jav_Latn", + # "kin_Latn", + "mal_Mlym", + "npi_Latn", + "por_Latn", + # "sna_Latn", + "swh_Latn", + "tur_Latn", + "yor_Latn", + "als_Latn", + "azj_Latn", + "ckb_Arab", + # "fuv_Latn", + "hrv_Latn", + "jpn_Jpan", + "kir_Cyrl", + 
"mar_Deva", + # "nso_Latn", + "snd_Arab", + "tam_Taml", + "ukr_Cyrl", + "zho_Hans", + "amh_Ethi", + # "bam_Latn", + "dan_Latn", + # "gaz_Latn", + "hun_Latn", + # "kac_Latn", + "kor_Hang", + "mkd_Cyrl", + # "nya_Latn", + "ron_Latn", + "som_Latn", + "tel_Telu", + "urd_Arab", + "zho_Hant", + "apc_Arab", + "ben_Beng", + "deu_Latn", + # "grn_Latn", + "hye_Armn", + "kan_Knda", + "lao_Laoo", + "mlt_Latn", + "ory_Orya", + "rus_Cyrl", + # "sot_Latn", + "tgk_Cyrl", + "urd_Latn", + "zsm_Latn", + "arb_Arab", + "ben_Latn", + "ell_Grek", + "guj_Gujr", + # "ibo_Latn", + "kat_Geor", + # "lin_Latn", + # "mri_Latn", + "pan_Guru", + # "shn_Mymr", + "spa_Latn", + "tgl_Latn", + "uzn_Latn", + # "zul_Latn", + "arb_Latn", + # "bod_Tibt", + "eng_Latn", + # "hat_Latn", + # "ilo_Latn", + "kaz_Cyrl", + "lit_Latn", + "mya_Mymr", + "pbt_Arab", + "sin_Latn", + "srp_Cyrl", + "tha_Thai", + "vie_Latn", + "ars_Arab", + "bul_Cyrl", + "est_Latn", + # "hau_Latn", + "ind_Latn", + # "kea_Latn", + # "lug_Latn", + "nld_Latn", + "pes_Arab", + "sin_Sinh", + # "ssw_Latn", + # "tir_Ethi", + "war_Latn", + "ary_Arab", + "cat_Latn", + "eus_Latn", + "heb_Hebr", + "isl_Latn", + # "khk_Cyrl", + # "luo_Latn", + "nob_Latn", + "plt_Latn", + "slk_Latn", + # "sun_Latn", + # "tsn_Latn", + # "wol_Latn", + ] +] +TASKS_TABLE.extend(BELEBELE_TASKS_NATIVE_INSTRUCT + BELEBELE_TASKS_EN_INSTRUCT) diff --git a/src/lighteval/tasks/extended/instruct/global_mmlu.py b/src/lighteval/tasks/extended/instruct/global_mmlu.py new file mode 100644 index 000000000..2038854c9 --- /dev/null +++ b/src/lighteval/tasks/extended/instruct/global_mmlu.py @@ -0,0 +1,232 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# 
copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import numpy as np + +from lighteval.metrics.dynamic_metrics import ( + IndicesExtractionConfig, + multilingual_extractive_match_metric, +) +from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric +from lighteval.metrics.metrics_sample import ( + PassAtK, +) +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.utils.language import Language + + +TASKS_TABLE = [] + + +class GlobalMMLUPrompt: + def __init__(self, lang): + self.lang = lang + self.lang_to_template = { + "eng": "Given the following query and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "deu": "Gib basierend auf der folgenden Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. 
Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "fra": "A partir de la question et des choix de réponses suivants, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: '$LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "ita": "Dato il seguente quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "por": "Tendo em conta a seguinte pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. Pense passo a passo antes de responder.\n\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + "spa": "Dado el siguiente pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. 
Piense paso a paso antes de responder.\n\n###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}", + } + + def prompt(self, line, task_name: str = None): + gold_index = LETTER_INDICES.index(line["answer"]) + choices = [line["option_a"], line["option_b"], line["option_c"], line["option_d"]] + lang = self.lang if self.lang in self.lang_to_template.keys() else "eng" + query_template = self.lang_to_template[lang] + query = query_template.format( + A=choices[0], + B=choices[1], + C=choices[2], + D=choices[3], + Question=line["question"], + ) + instruction = query_template.split("###\n")[0] + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES[: len(choices)], + gold_index=gold_index, + instruction=instruction, + ) + + +GLOBAL_MMLU_TASKS = [ + LightevalTaskConfig( + name=f"global_mmlu_instruct_{language.value}", + prompt_function=GlobalMMLUPrompt(language.value).prompt, + suite=["extended"], + hf_repo="CohereForAI/Global-MMLU", + hf_subset=lang, + evaluation_splits=("test",), + few_shots_split="dev", + metric=[ + SampleLevelMetric( + metric_name="pass@1:1_samples", + sample_level_fn=PassAtK( + k=1, + n=1, + sample_scoring_function=lambda pred, ref, doc, language=language: multilingual_extractive_match_metric( + language=language, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ).sample_level_fn([ref], [pred], doc), + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.REASONING, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + ], + generation_size=32768, # needed for reasoning models like R1 + stop_sequence=[], # no stop sequence, will use eos token + ) + for lang, language in [ + ("am", Language.AMHARIC), + ("ar", Language.ARABIC), + ("bn", Language.BENGALI), + ("cs", Language.CZECH), + ("de", Language.GERMAN), + ("el", Language.GREEK), + ("en", 
Language.ENGLISH), + ("es", Language.SPANISH), + ("fa", Language.PERSIAN), + # ("fil", Language.FILIPINO), + ("fr", Language.FRENCH), + ("ha", Language.HAUSA), + ("he", Language.HEBREW), + ("hi", Language.HINDI), + ("id", Language.INDONESIAN), + ("ig", Language.IGBO), + ("it", Language.ITALIAN), + ("ja", Language.JAPANESE), + ("ko", Language.KOREAN), + ("ky", Language.KYRGYZ), + ("lt", Language.LITHUANIAN), + ("mg", Language.MALAGASY), + ("ms", Language.MALAY), + ("ne", Language.NEPALI), + ("nl", Language.DUTCH), + ("ny", Language.NORWEGIAN), + ("pl", Language.POLISH), + ("pt", Language.PORTUGUESE), + ("ro", Language.ROMANIAN), + ("ru", Language.RUSSIAN), + ("si", Language.SINHALA), + ("sn", Language.SHONA), + ("so", Language.SOMALI), + ("sr", Language.SERBIAN), + ("sv", Language.SWEDISH), + ("sw", Language.SWAHILI), + ("te", Language.TELUGU), + ("tr", Language.TURKISH), + ("uk", Language.UKRAINIAN), + ("vi", Language.VIETNAMESE), + ("yo", Language.YORUBA), + ("zh", Language.CHINESE), + ] +] +TASKS_TABLE.extend(GLOBAL_MMLU_TASKS) + +GLOBAL_MMLU_LITE_TASKS = [ + LightevalTaskConfig( + name=f"global_mmlu_lite_instruct_{language.value}", + prompt_function=GlobalMMLUPrompt(language.value).prompt, + suite=["extended"], + hf_repo="CohereForAI/Global-MMLU-Lite", + hf_subset=lang, + evaluation_splits=("test",), + few_shots_split="dev", + metric=[ + SampleLevelMetric( + metric_name="pass@1:1_samples", + sample_level_fn=PassAtK( + k=1, + n=1, + sample_scoring_function=lambda pred, ref, doc, language=language: multilingual_extractive_match_metric( + language=language, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=6, + ).sample_level_fn([ref], [pred], doc), + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.REASONING, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + ], + generation_size=32768, # needed for 
reasoning models like R1 + stop_sequence=[], # no stop sequence, will use eos token + ) + for lang, language in [ + ("am", Language.AMHARIC), + ("ar", Language.ARABIC), + ("bn", Language.BENGALI), + ("cs", Language.CZECH), + ("de", Language.GERMAN), + ("el", Language.GREEK), + ("en", Language.ENGLISH), + ("es", Language.SPANISH), + ("fa", Language.PERSIAN), + # ("fil", Language.FILIPINO), + ("fr", Language.FRENCH), + ("ha", Language.HAUSA), + ("he", Language.HEBREW), + ("hi", Language.HINDI), + ("id", Language.INDONESIAN), + ("ig", Language.IGBO), + ("it", Language.ITALIAN), + ("ja", Language.JAPANESE), + ("ko", Language.KOREAN), + ("ky", Language.KYRGYZ), + ("lt", Language.LITHUANIAN), + ("mg", Language.MALAGASY), + ("ms", Language.MALAY), + ("ne", Language.NEPALI), + ("nl", Language.DUTCH), + ("ny", Language.NORWEGIAN), + ("pl", Language.POLISH), + ("pt", Language.PORTUGUESE), + ("ro", Language.ROMANIAN), + ("ru", Language.RUSSIAN), + ("si", Language.SINHALA), + ("sn", Language.SHONA), + ("so", Language.SOMALI), + ("sr", Language.SERBIAN), + ("sv", Language.SWEDISH), + ("sw", Language.SWAHILI), + ("te", Language.TELUGU), + ("tr", Language.TURKISH), + ("uk", Language.UKRAINIAN), + ("vi", Language.VIETNAMESE), + ("yo", Language.YORUBA), + ("zh", Language.CHINESE), + ] +] +TASKS_TABLE.extend(GLOBAL_MMLU_LITE_TASKS) diff --git a/src/lighteval/tasks/extended/instruct/mgsm.py b/src/lighteval/tasks/extended/instruct/mgsm.py new file mode 100644 index 000000000..35e92d8b7 --- /dev/null +++ b/src/lighteval/tasks/extended/instruct/mgsm.py @@ -0,0 +1,106 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit 
# persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import numpy as np

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.metrics.metrics_sample import (
    PassAtK,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


TASKS_TABLE = []


def mgsm_prompt(line, task_name: str = None):
    """Build a generative MGSM doc: a chain-of-thought math question whose gold
    answer is the numeric `answer_number` field of the dataset row.

    Args:
        line: one MGSM dataset row (expects "question" and "answer_number").
        task_name: name of the task this doc belongs to.

    Returns:
        Doc with the formatted query, the gold number as the only choice, and
        the pre-"###" part of the template as the standalone instruction.
    """
    # BUGFIX: the original wrapped this literal in parentheses with a trailing
    # comma, which made `query_template` a one-element TUPLE — the subsequent
    # `.format()` and `.split()` calls then raised AttributeError. It must be
    # a plain string.
    query_template = "Given the following question, output the value corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $NUMBER' (without quotes) where NUMBER is a number. Think step by step before answering.\n\n###\nQuery:\n{Question}"

    query = query_template.format(
        Question=line["question"],
    )
    # Everything before the "###" separator is the task-level instruction.
    instruction = query_template.split("###\n")[0]

    return Doc(
        task_name=task_name,
        query=query,
        choices=[str(line["answer_number"])],
        gold_index=0,
        instruction=instruction,
    )


# NOTE(review): siblings in this `extended/instruct` directory register under
# suite "extended" while this uses "lighteval" — confirm which suite is intended.
# NOTE(review): the prompt asks for 'Answer: $NUMBER' but extraction targets
# letter indices (NativeLetters); an expression/number extraction config may be
# the intended target — confirm against the metric's behavior.
MGSM_TASKS = [
    LightevalTaskConfig(
        name=f"mgsm_instruct_{language.value}",
        prompt_function=mgsm_prompt,
        suite=("lighteval",),
        hf_repo="juletxara/mgsm",
        hf_subset=lang,
        evaluation_splits=("test",),
        few_shots_split="train",
        metric=[
            SampleLevelMetric(
                metric_name="pass@1:1_samples",
                sample_level_fn=PassAtK(
                    k=1,
                    n=1,
                    # BUGFIX: bind `language` as a default argument; otherwise
                    # every lambda created by this comprehension shares the loop
                    # variable and scores with the LAST language (zh).
                    sample_scoring_function=lambda pred, ref, doc, language=language: multilingual_extractive_match_metric(
                        language=language,
                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                        precision=6,
                    ).sample_level_fn([ref], [pred], doc),
                ).compute,
                category=MetricCategory.GENERATIVE_SAMPLING,
                use_case=MetricUseCase.REASONING,
                corpus_level_fn=np.mean,
                higher_is_better=True,
            )
        ],
        generation_size=32768,  # needed for reasoning models like R1
        stop_sequence=[],  # no stop sequence, will use eos token
    )
    for lang, language in [
        ("bn", Language.BENGALI),
        ("de", Language.GERMAN),
        ("en", Language.ENGLISH),
        ("es", Language.SPANISH),
        ("fr", Language.FRENCH),
        ("ja", Language.JAPANESE),
        ("ru", Language.RUSSIAN),
        ("sw", Language.SWAHILI),
        ("te", Language.TELUGU),
        ("th", Language.THAI),
        ("zh", Language.CHINESE),
    ]
]
TASKS_TABLE.extend(MGSM_TASKS)

# MIT License

# Copyright (c) 2024 The HuggingFace Team
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import numpy as np

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.metrics.metrics_sample import (
    PassAtK,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


def mmlu_pro_prompt(line, task_name: str = None):
    """Build a generative MMLU-Pro doc: question + lettered answer choices.

    Args:
        line: one MMLU-Pro row (expects "category", "question", "options",
            "answer_index").
        task_name: name of the task this doc belongs to.

    Returns:
        Doc whose choices are the letters actually used (A..last option) and
        whose gold index is the dataset's `answer_index`.

    Renamed from `mmlu_pro` so the module-level task config below no longer
    shadows this function.
    """
    num_choices = len(line["options"])
    # Letters actually used by this row's options: indices 0..num_choices-1.
    letters = LETTER_INDICES[:num_choices]
    # BUGFIX vs original: the join used ' ,' (comma/space swapped), sliced one
    # letter too few (`[: num_choices - 1]` drops the second-to-last letter
    # from the list), and named `LETTER_INDICES[num_choices]` — one PAST the
    # last valid letter — as the final option.
    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(letters[:-1])}, or {letters[-1]}. Think step by step before answering.\n\n"
    query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:"
    # zip() stops at the shorter sequence, so options beyond Z are impossible
    # and fewer options than letters is handled naturally.
    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])

    return Doc(
        task_name=task_name,
        query=query,
        choices=letters,
        gold_index=line["answer_index"],
        instruction=instruction,
    )


# Single generative MMLU-Pro task; answers scored with pass@1 over one sample,
# extracting the chosen letter from the model output.
mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    suite=["extended"],
    prompt_function=mmlu_pro_prompt,
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_avail_splits=["validation", "test"],
    evaluation_splits=["test"],
    few_shots_split="validation",
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    stop_sequence=[],  # no stop sequence, will use eos token
    metric=[
        SampleLevelMetric(
            metric_name="pass@1:1_samples",
            sample_level_fn=PassAtK(
                k=1,
                n=1,
                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
                    language=Language.ENGLISH,
                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                    precision=6,
                ).sample_level_fn([ref], [pred], doc),
            ).compute,
            category=MetricCategory.GENERATIVE_SAMPLING,
            use_case=MetricUseCase.REASONING,
            corpus_level_fn=np.mean,
            higher_is_better=True,
        )
    ],
    trust_dataset=True,
    version=0,
)

TASKS_TABLE = [mmlu_pro]
def __getattribute__(self, name: str) -> str: ), Language.ESPERANTO: TranslationLiterals(language=Language.ESPERANTO), Language.ESTONIAN: TranslationLiterals( - # From https://github.com/EleutherAI/lm-evaluation-harness/blob/0845b588303f1f59af98dd1c5bdbd78a9e75a1e2/lm_eval/tasks/xcopa/utils.py language=Language.ESTONIAN, + question_word="küsimus", + answer="vastus", + confirmation_word="eks", + yes="jah", + no="ei", + also="samuti", cause_word="sest", - effect_word="seetõttu", + effect_word="seega", + or_word="või", + true="õige", + false="vale", + neither="mitte kumbki", ), Language.EWE: TranslationLiterals(language=Language.EWE), Language.FAROESE: TranslationLiterals(language=Language.FAROESE), diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py index c3a12c813..8a3ce9c8a 100644 --- a/tests/metrics/test_extractive_match.py +++ b/tests/metrics/test_extractive_match.py @@ -56,7 +56,9 @@ def compare_strings( elif match_type == "expr": extraction_targets.append(ExprExtractionConfig()) elif match_type == "NativeLetters": - extraction_targets.append(IndicesExtractionConfig(prefix_for_extraction="NativeLetters")) + extraction_targets.append( + IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + ) extraction_targets = tuple(extraction_targets) # Convert to tuple