From 8b8990231e1cd21c71ae5ba8965dece05d4f67a5 Mon Sep 17 00:00:00 2001 From: Jan Kazlouski Date: Tue, 22 Jul 2025 16:59:39 +0000 Subject: [PATCH 1/6] Update inference API specification to include new Llama Service --- output/openapi/elasticsearch-openapi.json | 198 ++++++- .../elasticsearch-serverless-openapi.json | 198 ++++++- output/schema/schema.json | 481 ++++++++++++++++-- output/typescript/types.ts | 36 ++ specification/_doc_ids/table.csv | 2 + .../_json_spec/inference.put_llama.json | 35 ++ specification/inference/_types/CommonTypes.ts | 62 +++ specification/inference/_types/Services.ts | 13 + specification/inference/_types/TaskType.ts | 6 + specification/inference/put/PutRequest.ts | 1 + .../inference/put_llama/PutLlamaRequest.ts | 79 +++ .../inference/put_llama/PutLlamaResponse.ts | 25 + .../request/PutLlamaRequestExample1.yaml | 13 + .../request/PutLlamaRequestExample2.yaml | 13 + .../request/PutLlamaRequestExample3.yaml | 13 + 15 files changed, 1123 insertions(+), 52 deletions(-) create mode 100644 specification/_json_spec/inference.put_llama.json create mode 100644 specification/inference/put_llama/PutLlamaRequest.ts create mode 100644 specification/inference/put_llama/PutLlamaResponse.ts create mode 100644 specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml create mode 100644 specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml create mode 100644 specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index b5dd4f4c8b..f75453f44c 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -20573,7 +20573,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -20694,7 +20694,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to 
use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
        "operationId": "inference-put-1",
        "parameters": [
          {
@@ -22234,6 +22234,107 @@
          ]
        }
      },
+    "/_inference/{task_type}/{llama_inference_id}": {
+      "put": {
+        "tags": [
+          "inference"
+        ],
+        "summary": "Create a Llama inference endpoint",
+        "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
+        "operationId": "inference-put-llama",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "task_type",
+            "description": "The type of the inference task that the model will perform.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/inference._types.LlamaTaskType"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "path",
+            "name": "llama_inference_id",
+            "description": "The unique identifier of the inference endpoint.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Id"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "query",
+            "name": "timeout",
+            "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Duration"
+            },
+            "style": "form"
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "properties": {
+                  "chunking_settings": {
+                    "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings"
+                  },
+                  "service": {
+                    "$ref": "#/components/schemas/inference._types.LlamaServiceType"
+                  },
+                  "service_settings": {
+                    "$ref": "#/components/schemas/inference._types.LlamaServiceSettings"
+                  }
+                },
+                "required": [
+                  "service",
+                  "service_settings"
+                ]
+              },
+              "examples": {
+                "PutLlamaRequestExample1": {
+                  "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"all-MiniLM-L6-v2\"\n    }\n}"
+                },
+                "PutLlamaRequestExample2": {
+                  "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"llama3.2:3b\"\n    }\n}"
+                },
+                "PutLlamaRequestExample3": {
+                  "description": "Run `PUT _inference/chat_completion/llama-text-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"llama3.2:3b\"\n    }\n}"
+                }
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama"
+                }
+              }
+            }
+          }
+        },
+        "x-state": "Generally available; Added in 9.2.0",
+        "x-metaTags": [
+          {
+            "content": "elasticsearch, machine-learning",
+            "name": "x-product-feature"
+          }
+        ]
+      }
+    },
     "/_inference/{task_type}/{mistral_inference_id}": {
       "put": {
         "tags": [
@@ -88694,7 +88795,7 @@
           "type": "object",
           "properties": {
             "requests_per_minute": {
-              "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`",
+              "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`",
               "type": "number"
             }
           }
@@ -89965,6 +90066,97 @@
             "rerank"
           ]
         },
+        "inference._types.LlamaTaskType": {
+          "type": "string",
+          "enum": [
+            "text_embedding",
+            "completion",
+            "chat_completion"
+          ]
+        },
+        "inference._types.LlamaServiceType": {
+          "type": "string",
+          "enum": [
+            "llama"
+          ]
+        },
+        "inference._types.LlamaServiceSettings": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "description": "The URL of the Llama stack endpoint.\nThe URL must contain:\n* For the `text_embedding` task - `/v1/inference/embeddings`.\n* For the `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.",
+              "type": "string"
+            },
+            "model_id": {
+              "externalDocs": {
+                "url": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/"
+              },
+              "description": "The name of the model to use for the inference task.\nRefer to the Llama download models documentation for the different ways to get a list of available models and download them.\nThe service has been tested and confirmed to work with the following models:\n* For the `text_embedding` task - `all-MiniLM-L6-v2`.\n* For the `completion` and `chat_completion` tasks - `llama3.2:3b`.",
+              "type": "string"
+            },
+            "api_key": {
+              "description": "A valid API key for accessing the Llama stack endpoint.\nIt is sent as part of the Bearer authentication header.\nThis field is optional because the Llama stack does not provide authentication by default.\n\nIMPORTANT: You need to provide the API key only once, during the inference model creation.\nThe get inference endpoint API does not retrieve your API key.\nAfter creating the inference model, you cannot change the associated API key.\nIf you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.",
+              "type": "string"
+            },
+            "max_input_tokens": {
+              "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.",
+              "type": "number"
+            },
+            "dimensions": {
+              "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.",
+              "type": "number"
+            },
+            "similarity": {
+              "$ref": "#/components/schemas/inference._types.LlamaSimilarityType"
+            },
+            "rate_limit": {
+              "$ref": "#/components/schemas/inference._types.RateLimitSetting"
+            }
+          },
+          "required": [
+            "url",
+            "model_id"
+          ]
+        },
+        "inference._types.LlamaSimilarityType": {
+          "type": "string",
+          "enum": [
+            "cosine",
+            "dot_product",
+            "l2_norm"
+          ]
+        },
+        "inference._types.InferenceEndpointInfoLlama": {
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/inference._types.InferenceEndpoint"
+            },
+            {
+              "type": "object",
+              "properties": {
+                "inference_id": {
+                  "description": "The inference Id",
+                  "type": "string"
+                },
+                "task_type": {
+                  "$ref": "#/components/schemas/inference._types.TaskTypeLlama"
+                }
+              },
+              "required": [
+                "inference_id",
+                "task_type"
+              ]
+            }
+          ]
+        },
+        "inference._types.TaskTypeLlama": {
+          "type": "string",
+          "enum": [
+            "text_embedding",
+            "chat_completion",
+            "completion"
+          ]
+        },
         "inference._types.MistralTaskType": {
           "type": "string",
           "enum": [
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
index a6af45774f..5d4b6cd28d 100644
--- a/output/openapi/elasticsearch-serverless-openapi.json
+++ b/output/openapi/elasticsearch-serverless-openapi.json
@@ -11368,7 +11368,7 @@
         "inference"
       ],
       "summary": "Create an inference endpoint",
-      "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -11489,7 +11489,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to 
use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
        "operationId": "inference-put-1",
        "parameters": [
          {
@@ -13029,6 +13029,107 @@
          ]
        }
      },
+    "/_inference/{task_type}/{llama_inference_id}": {
+      "put": {
+        "tags": [
+          "inference"
+        ],
+        "summary": "Create a Llama inference endpoint",
+        "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
+        "operationId": "inference-put-llama",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "task_type",
+            "description": "The type of the inference task that the model will perform.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/inference._types.LlamaTaskType"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "path",
+            "name": "llama_inference_id",
+            "description": "The unique identifier of the inference endpoint.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Id"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "query",
+            "name": "timeout",
+            "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Duration"
+            },
+            "style": "form"
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "properties": {
+                  "chunking_settings": {
+                    "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings"
+                  },
+                  "service": {
+                    "$ref": "#/components/schemas/inference._types.LlamaServiceType"
+                  },
+                  "service_settings": {
+                    "$ref": "#/components/schemas/inference._types.LlamaServiceSettings"
+                  }
+                },
+                "required": [
+                  "service",
+                  "service_settings"
+                ]
+              },
+              "examples": {
+                "PutLlamaRequestExample1": {
+                  "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"all-MiniLM-L6-v2\"\n    }\n}"
+                },
+                "PutLlamaRequestExample2": {
+                  "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"llama3.2:3b\"\n    }\n}"
+                },
+                "PutLlamaRequestExample3": {
+                  "description": "Run `PUT _inference/chat_completion/llama-text-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
+                  "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"llama3.2:3b\"\n    }\n}"
+                }
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama"
+                }
+              }
+            }
+          }
+        },
+        "x-state": "Generally available",
+        "x-metaTags": [
+          {
+            "content": "elasticsearch, machine-learning",
+            "name": "x-product-feature"
+          }
+        ]
+      }
+    },
     "/_inference/{task_type}/{mistral_inference_id}": {
       "put": {
         "tags": [
@@ -56016,7 +56117,7 @@
           "type": "object",
           "properties": {
             "requests_per_minute": {
-              "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`",
+              "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`",
               "type": "number"
             }
          }
@@ -57287,6 +57388,97 @@
             "rerank"
           ]
         },
+        "inference._types.LlamaTaskType": {
+          "type": "string",
+          "enum": [
+            "text_embedding",
+            "completion",
+            "chat_completion"
+          ]
+        },
+        "inference._types.LlamaServiceType": {
+          "type": "string",
+          "enum": [
+            "llama"
+          ]
+        },
+        "inference._types.LlamaServiceSettings": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "description": "The URL of the Llama stack endpoint.\nThe URL must contain:\n* For the `text_embedding` task - `/v1/inference/embeddings`.\n* For the `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.",
+              "type": "string"
+            },
+            "model_id": {
+              "externalDocs": {
+                "url": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/"
+              },
+              "description": "The name of the model to use for the inference task.\nRefer to the Llama download models documentation for the different ways to get a list of available models and download them.\nThe service has been tested and confirmed to work with the following models:\n* For the `text_embedding` task - `all-MiniLM-L6-v2`.\n* For the `completion` and `chat_completion` tasks - `llama3.2:3b`.",
+              "type": "string"
+            },
+            "api_key": {
+              "description": "A valid API key for accessing the Llama stack endpoint.\nIt is sent as part of the Bearer authentication header.\nThis field is optional because the Llama stack does not provide authentication by default.\n\nIMPORTANT: You need to provide the API key only once, during the inference model creation.\nThe get inference endpoint API does not retrieve your API key.\nAfter creating the inference model, you cannot change the associated API key.\nIf you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.",
+              "type": "string"
+            },
+            "max_input_tokens": {
+              "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.",
+              "type": "number"
+            },
+            "dimensions": {
+              "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.",
+              "type": "number"
+            },
+            "similarity": {
+              "$ref": "#/components/schemas/inference._types.LlamaSimilarityType"
+            },
+            "rate_limit": {
+              "$ref": "#/components/schemas/inference._types.RateLimitSetting"
+            }
+          },
+          "required": [
+            "url",
+            "model_id"
+          ]
+        },
+        "inference._types.LlamaSimilarityType": {
+          "type": "string",
+          "enum": [
+            "cosine",
+            "dot_product",
+            "l2_norm"
+          ]
+        },
+        "inference._types.InferenceEndpointInfoLlama": {
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/inference._types.InferenceEndpoint"
+            },
+            {
+              "type": "object",
+              "properties": {
+                "inference_id": {
+                  "description": "The inference Id",
+                  "type": "string"
+                },
+                "task_type": {
+                  "$ref": "#/components/schemas/inference._types.TaskTypeLlama"
+                }
+              },
+              "required": [
+                "inference_id",
+                "task_type"
+              ]
+            }
+          ]
+        },
+        "inference._types.TaskTypeLlama": {
+          "type": "string",
+          "enum": [
+            "text_embedding",
+            "chat_completion",
+            "completion"
+          ]
+        },
         "inference._types.MistralTaskType": {
           "type": "string",
           "enum": [
diff --git a/output/schema/schema.json b/output/schema/schema.json
index ae1b2d2326..f940da504b 100644
--- a/output/schema/schema.json
+++ b/output/schema/schema.json
@@ -9920,7 +9920,7 @@
           "visibility": "public"
         }
       },
-      "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "docId": "inference-api-put", "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put", "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/put-inference-api.html", @@ -10635,6 +10635,51 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "9.2.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.", + "docId": "inference-api-put-llama", + "docUrl": 
"https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama", + "name": "inference.put_llama", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_llama" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_llama" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{llama_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -169793,7 +169838,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L276-L332" + "specLocation": "inference/_types/Services.ts#L288-L344" }, { "kind": "interface", @@ -169852,7 +169897,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L44-L64" + "specLocation": "inference/_types/Services.ts#L45-L65" }, { "kind": "interface", @@ -169893,7 +169938,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L66-L78" + "specLocation": "inference/_types/Services.ts#L67-L79" }, { "kind": "interface", @@ -169933,7 +169978,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L79-L88" + "specLocation": "inference/_types/Services.ts#L80-L89" }, { "kind": "interface", @@ -169973,7 +170018,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L90-L99" + "specLocation": "inference/_types/Services.ts#L91-L100" }, { "kind": "interface", @@ -170013,7 +170058,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L101-L110" + "specLocation": "inference/_types/Services.ts#L102-L111" }, { "kind": "interface", @@ -170053,7 +170098,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L112-L121" + "specLocation": "inference/_types/Services.ts#L113-L122" }, { "kind": "interface", @@ -170093,7 +170138,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L123-L132" + "specLocation": "inference/_types/Services.ts#L124-L133" }, { "kind": "interface", @@ -170133,7 +170178,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L134-L143" + "specLocation": "inference/_types/Services.ts#L135-L144" }, { "kind": "interface", @@ -170173,7 +170218,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L145-L154" + "specLocation": "inference/_types/Services.ts#L146-L155" }, { "kind": "interface", @@ -170213,7 +170258,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L155-L164" + "specLocation": "inference/_types/Services.ts#L156-L165" }, { "kind": "interface", @@ -170253,7 +170298,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L177-L186" + "specLocation": "inference/_types/Services.ts#L178-L187" }, { "kind": "interface", @@ -170293,7 +170338,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L166-L175" + "specLocation": "inference/_types/Services.ts#L167-L176" }, { "kind": "interface", @@ -170333,7 +170378,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L188-L197" + "specLocation": "inference/_types/Services.ts#L189-L198" }, { "kind": "interface", @@ -170373,7 +170418,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L199-L208" + "specLocation": "inference/_types/Services.ts#L200-L209" }, { "kind": "interface", @@ -170413,7 +170458,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L210-L219" + "specLocation": "inference/_types/Services.ts#L211-L220" }, { "kind": "interface", @@ -170453,7 +170498,47 @@ } } ], - "specLocation": "inference/_types/Services.ts#L221-L230" + 
"specLocation": "inference/_types/Services.ts#L222-L231" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "InferenceEndpoint", + "namespace": "inference._types" + } + }, + "name": { + "name": "InferenceEndpointInfoLlama", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The inference Id", + "name": "inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The task type", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "TaskTypeLlama", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/Services.ts#L233-L242" }, { "kind": "interface", @@ -170493,7 +170578,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L232-L241" + "specLocation": "inference/_types/Services.ts#L244-L253" }, { "kind": "interface", @@ -170533,7 +170618,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L243-L252" + "specLocation": "inference/_types/Services.ts#L255-L264" }, { "kind": "interface", @@ -170573,7 +170658,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L254-L263" + "specLocation": "inference/_types/Services.ts#L266-L275" }, { "kind": "interface", @@ -170613,7 +170698,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L265-L274" + "specLocation": "inference/_types/Services.ts#L277-L286" }, { "kind": "interface", @@ -170891,6 +170976,153 @@ }, "specLocation": "inference/_types/CommonTypes.ts#L1363-L1368" }, + { + "kind": "interface", + "name": { + "name": "LlamaServiceSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "name": "url", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "extDocId": "llama-api-models", + "extDocUrl": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "A valid API key for accessing Llama stack endpoint that is going to be sent as part of Bearer authentication header.\nThis field is optional because Llama stack doesn't provide authentication by default.\n\nIMPORTANT: You need to provide the API key only once, during the inference model creation.\nThe get inference endpoint API does not retrieve your API key.\nAfter creating the inference model, you cannot change the associated API key.\nIf you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.", + "name": "api_key", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + 
"description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "name": "max_input_tokens", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.", + "name": "dimensions", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm.", + "name": "similarity", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + } + } + }, + { + "description": "This setting helps to minimize the number of rate limit errors returned from the Llama API.\nBy default, the `llama` service sets the number of requests allowed per minute to 3000.", + "name": "rate_limit", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "RateLimitSetting", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L1370-L1414" + }, + { + "kind": "enum", + "members": [ + { + "name": "llama" + } + ], + "name": { + "name": "LlamaServiceType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1422-L1424" + }, + { + "kind": "enum", + "members": [ + { + "name": "cosine" + }, + { + "name": "dot_product" + }, + { + "name": "l2_norm" + } + ], + "name": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1426-L1430" + }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "completion" + }, + { + "name": "chat_completion" + } + ], + "name": { + "name": "LlamaTaskType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1416-L1420" + }, { "kind": "interface", "description": "An object representing part of the conversation.", @@ -171047,7 +171279,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1370-L1397" + "specLocation": "inference/_types/CommonTypes.ts#L1432-L1459" }, { "kind": "enum", @@ -171060,7 +171292,7 @@ "name": "MistralServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1405-L1407" + "specLocation": "inference/_types/CommonTypes.ts#L1467-L1469" }, { "kind": "enum", @@ -171079,7 +171311,7 @@ "name": "MistralTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1399-L1403" + "specLocation": "inference/_types/CommonTypes.ts#L1461-L1465" }, { "kind": "interface", @@ -171166,7 +171398,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1409-L1451" + "specLocation": "inference/_types/CommonTypes.ts#L1471-L1513" }, { "kind": "enum", @@ -171179,7 +171411,7 @@ "name": "OpenAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1467-L1469" + "specLocation": "inference/_types/CommonTypes.ts#L1529-L1531" }, { "kind": "interface", @@ -171201,7 +171433,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1453-L1459" + "specLocation": "inference/_types/CommonTypes.ts#L1515-L1521" }, { "kind": "enum", @@ -171220,7 +171452,7 @@ "name": "OpenAITaskType", "namespace": "inference._types" }, - "specLocation": 
"inference/_types/CommonTypes.ts#L1461-L1465" + "specLocation": "inference/_types/CommonTypes.ts#L1523-L1527" }, { "kind": "interface", @@ -171275,7 +171507,7 @@ }, "properties": [ { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "name": "requests_per_minute", "required": false, "type": { @@ -171287,7 +171519,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L338-L364" + "specLocation": "inference/_types/Services.ts#L350-L377" }, { "kind": "interface", @@ -171435,7 +171667,7 @@ "name": "ServiceSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L334-L334", + "specLocation": "inference/_types/Services.ts#L346-L346", "type": { "kind": "user_defined_value" } @@ -171519,7 +171751,7 @@ "name": "TaskSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L336-L336", + "specLocation": "inference/_types/Services.ts#L348-L348", "type": { "kind": "user_defined_value" } @@ -171805,7 +172037,7 @@ } ], "name": { - "name": "TaskTypeMistral", + "name": "TaskTypeLlama", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L107-L111" @@ -171824,11 +172056,30 @@ } ], "name": { - "name": "TaskTypeOpenAI", + "name": "TaskTypeMistral", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L113-L117" }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "chat_completion" + }, + { + "name": "completion" + } + ], + "name": { + "name": "TaskTypeOpenAI", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/TaskType.ts#L119-L123" + }, { "kind": "enum", "members": [ @@ -171843,7 +172094,7 @@ "name": "TaskTypeVoyageAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L119-L122" + "specLocation": "inference/_types/TaskType.ts#L125-L128" }, { "kind": "enum", @@ -171862,7 +172113,7 @@ "name": "TaskTypeWatsonx", 
"namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L124-L128" + "specLocation": "inference/_types/TaskType.ts#L130-L134" }, { "kind": "interface", @@ -172108,7 +172359,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1471-L1502" + "specLocation": "inference/_types/CommonTypes.ts#L1533-L1564" }, { "kind": "enum", @@ -172121,7 +172372,7 @@ "name": "VoyageAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1535-L1537" + "specLocation": "inference/_types/CommonTypes.ts#L1597-L1599" }, { "kind": "interface", @@ -172181,7 +172432,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1504-L1528" + "specLocation": "inference/_types/CommonTypes.ts#L1566-L1590" }, { "kind": "enum", @@ -172197,7 +172448,7 @@ "name": "VoyageAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1530-L1533" + "specLocation": "inference/_types/CommonTypes.ts#L1592-L1595" }, { "kind": "interface", @@ -172285,7 +172536,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1539-L1577" + "specLocation": "inference/_types/CommonTypes.ts#L1601-L1639" }, { "kind": "enum", @@ -172298,7 +172549,7 @@ "name": "WatsonxServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1585-L1587" + "specLocation": "inference/_types/CommonTypes.ts#L1647-L1649" }, { "kind": "enum", @@ -172317,7 +172568,7 @@ "name": "WatsonxTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1579-L1583" + "specLocation": "inference/_types/CommonTypes.ts#L1641-L1645" }, { "kind": "request", @@ -173044,7 +173295,7 @@ } } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "examples": { "InferencePutExample1": { "alternatives": [ @@ -173129,7 +173380,7 @@ } } ], - "specLocation": "inference/put/PutRequest.ts#L26-L87" + "specLocation": "inference/put/PutRequest.ts#L26-L88" }, { "kind": "response", @@ -175903,6 +176154,144 @@ }, "specLocation": "inference/put_jinaai/PutJinaAiResponse.ts#L22-L25" }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config", + "name": "chunking_settings", + "required": false, 
+ "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `llama`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaServiceType", + "namespace": "inference._types" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `llama` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaServiceSettings", + "namespace": "inference._types" + } + } + } + ] + }, + "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.", + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "method_request": "PUT _inference/text_embedding/llama-text-embedding", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.", + "method_request": "PUT _inference/completion/llama-text-completion", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.", + "method_request": "PUT _inference/chat-completion/llama-text-chat-completion", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_llama" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaTaskType", + "namespace": "inference._types" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "llama_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [ + { + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "name": "timeout", + "required": false, + "serverDefault": "30s", + "type": { + "kind": "instance_of", + "type": { + "name": "Duration", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_llama/PutLlamaRequest.ts#L30-L79" + }, + { + "kind": "response", + "body": { + "kind": "value", + "codegenName": "endpoint_info", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfoLlama", + "namespace": "inference._types" + } + } + }, + "name": { + 
"name": "Response", + "namespace": "inference.put_llama" + }, + "specLocation": "inference/put_llama/PutLlamaResponse.ts#L22-L25" + }, { "kind": "request", "attachedBehaviors": [ diff --git a/output/typescript/types.ts b/output/typescript/types.ts index 3c360caa7e..75760e1481 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -14086,6 +14086,11 @@ export interface InferenceInferenceEndpointInfoJinaAi extends InferenceInference task_type: InferenceTaskTypeJinaAi } +export interface InferenceInferenceEndpointInfoLlama extends InferenceInferenceEndpoint { + inference_id: string + task_type: InferenceTaskTypeLlama +} + export interface InferenceInferenceEndpointInfoMistral extends InferenceInferenceEndpoint { inference_id: string task_type: InferenceTaskTypeMistral @@ -14136,6 +14141,22 @@ export type InferenceJinaAITaskType = 'rerank' | 'text_embedding' export type InferenceJinaAITextEmbeddingTask = 'classification' | 'clustering' | 'ingest' | 'search' +export interface InferenceLlamaServiceSettings { + url: string + model_id: string + api_key?: string + max_input_tokens?: integer + dimensions?: integer + similarity?: InferenceLlamaSimilarityType + rate_limit?: InferenceRateLimitSetting +} + +export type InferenceLlamaServiceType = 'llama' + +export type InferenceLlamaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm' + +export type InferenceLlamaTaskType = 'text_embedding' | 'completion' | 'chat_completion' + export interface InferenceMessage { content?: InferenceMessageContent role: string @@ -14242,6 +14263,8 @@ export type InferenceTaskTypeHuggingFace = 'chat_completion' | 'completion' | 'r export type InferenceTaskTypeJinaAi = 'text_embedding' | 'rerank' +export type InferenceTaskTypeLlama = 'text_embedding' | 'chat_completion' | 'completion' + export type InferenceTaskTypeMistral = 'text_embedding' | 'chat_completion' | 'completion' export type InferenceTaskTypeOpenAI = 'text_embedding' | 'chat_completion' | 'completion' @@ -14558,6 +14581,19 @@ export interface InferencePutJinaaiRequest extends RequestBase { export type InferencePutJinaaiResponse = InferenceInferenceEndpointInfoJinaAi +export interface InferencePutLlamaRequest extends RequestBase { + task_type: InferenceLlamaTaskType + llama_inference_id: Id + timeout?: Duration + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferenceLlamaServiceType + service_settings: InferenceLlamaServiceSettings + } +} + +export type InferencePutLlamaResponse = InferenceInferenceEndpointInfoLlama + export interface InferencePutMistralRequest extends RequestBase { task_type: InferenceMistralTaskType mistral_inference_id: Id diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv index 666c4f7659..125c0a2bf1 100644 --- a/specification/_doc_ids/table.csv +++ b/specification/_doc_ids/table.csv @@ -368,6 +368,7 @@ inference-api-put-googleaistudio,https://www.elastic.co/docs/api/doc/elasticsear inference-api-put-googlevertexai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-google-vertex-ai.html, inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-hugging-face,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-hugging-face.html, inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,, 
+inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,, inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html, inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html, inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,, @@ -397,6 +398,7 @@ knn-inner-hits,https://www.elastic.co/docs/solutions/search/vector/knn#nested-kn license-management,https://www.elastic.co/docs/deploy-manage/license/manage-your-license-in-self-managed-cluster,, list-analytics-collection,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-search-application-get-behavioral-analytics,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-analytics-collection.html, list-synonyms-sets,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-synonyms-get-synonyms-sets,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-synonyms-sets.html, +llama-api-models,https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/,, logstash-api-delete-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-delete-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-delete-pipeline.html, logstash-api-get-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-get-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-get-pipeline.html, logstash-api-put-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-put-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-put-pipeline.html, diff --git a/specification/_json_spec/inference.put_llama.json b/specification/_json_spec/inference.put_llama.json new file mode 100644 index 0000000000..5551f655cb --- /dev/null +++ b/specification/_json_spec/inference.put_llama.json @@ -0,0 +1,35 @@ +{ + "inference.put_llama": { + "documentation": { + "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-llama.html", + "description": "Configure a Llama inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{llama_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "string", + "description": "The task type" + }, + "llama_inference_id": { + "type": "string", + "description": "The inference ID" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings" + } + } +} diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts index 4941cb9210..9c0304b851 100644 --- a/specification/inference/_types/CommonTypes.ts +++ b/specification/inference/_types/CommonTypes.ts @@ -1367,6 +1367,68 @@ export enum JinaAITextEmbeddingTask { search } +export class LlamaServiceSettings { + /** + * The URL endpoint of the Llama stack endpoint. 
+ * URL must contain:
+ * * For `text_embedding` task - `/v1/inference/embeddings`.
+ * * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.
+ */
+ url: string
+ /**
+ * The name of the model to use for the inference task.
+ * Refer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.
+ * Service has been tested and confirmed to be working with the following models:
+ * * For `text_embedding` task - `all-MiniLM-L6-v2`.
+ * * For `completion` and `chat_completion` tasks - `llama3.2:3b`.
+ * @ext_doc_id llama-api-models
+ */
+ model_id: string
+ /**
+ * A valid API key for accessing the Llama stack endpoint. It is sent as part of the Bearer authentication header.
+ * This field is optional because Llama stack doesn't provide authentication by default.
+ *
+ * IMPORTANT: You need to provide the API key only once, during the inference model creation.
+ * The get inference endpoint API does not retrieve your API key.
+ * After creating the inference model, you cannot change the associated API key.
+ * If you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.
+ */
+ api_key?: string
+ /**
+ * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.
+ */
+ max_input_tokens?: integer
+ /**
+ * For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.
+ */
+ dimensions?: integer
+ /**
+ * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm.
+ */
+ similarity?: LlamaSimilarityType
+ /**
+ * This setting helps to minimize the number of rate limit errors returned from the Llama API.
+ * By default, the `llama` service sets the number of requests allowed per minute to 3000.
+ */
+ rate_limit?: RateLimitSetting
+}
+
+export enum LlamaTaskType {
+ text_embedding,
+ completion,
+ chat_completion
+}
+
+export enum LlamaServiceType {
+ llama
+}
+
+export enum LlamaSimilarityType {
+ cosine,
+ dot_product,
+ l2_norm
+}
+
 export class MistralServiceSettings {
 /**
 * A valid API key of your Mistral account.
diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts index 304fc477a3..7ce5d3d410 100644 --- a/specification/inference/_types/Services.ts +++ b/specification/inference/_types/Services.ts @@ -35,6 +35,7 @@ import { TaskTypeGoogleVertexAI, TaskTypeHuggingFace, TaskTypeJinaAi, + TaskTypeLlama, TaskTypeMistral, TaskTypeOpenAI, TaskTypeVoyageAI, @@ -229,6 +230,17 @@ export class InferenceEndpointInfoJinaAi extends InferenceEndpoint { task_type: TaskTypeJinaAi } +export class InferenceEndpointInfoLlama extends InferenceEndpoint { + /** + * The inference Id + */ + inference_id: string + /** + * The task type + */ + task_type: TaskTypeLlama +} + export class InferenceEndpointInfoMistral extends InferenceEndpoint { /** * The inference Id @@ -354,6 +366,7 @@ export class RateLimitSetting { * * `googlevertexai` service: `30000` * * `hugging_face` service: `3000` * * `jinaai` service: `2000` + * * `llama` service: `3000` * * `mistral` service: `240` * * `openai` service and task type `text_embedding`: `3000` * * `openai` service and task type `completion`: `500` diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts index 6daed0d281..eaba333a66 100644 --- a/specification/inference/_types/TaskType.ts +++ b/specification/inference/_types/TaskType.ts @@ -104,6 +104,12 @@ export enum TaskTypeHuggingFace { text_embedding } +export enum TaskTypeLlama { + text_embedding, + chat_completion, + completion +} + export enum TaskTypeMistral { text_embedding, chat_completion, diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts index 4554574e32..9c8094c589 100644 --- a/specification/inference/put/PutRequest.ts +++ b/specification/inference/put/PutRequest.ts @@ -43,6 +43,7 @@ import { TaskType } from '@inference/_types/TaskType' * * Google AI Studio (`completion`, `text_embedding`) * * Google Vertex AI (`rerank`, `text_embedding`) * * Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`) + * * Llama (`chat_completion`, `completion`, `text_embedding`) * * Mistral (`chat_completion`, `completion`, `text_embedding`) * * OpenAI (`chat_completion`, `completion`, `text_embedding`) * * VoyageAI (`text_embedding`, `rerank`) diff --git a/specification/inference/put_llama/PutLlamaRequest.ts b/specification/inference/put_llama/PutLlamaRequest.ts new file mode 100644 index 0000000000..966f83cc19 --- /dev/null +++ b/specification/inference/put_llama/PutLlamaRequest.ts @@ -0,0 +1,79 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { RequestBase } from '@_types/Base' +import { Id } from '@_types/common' +import { Duration } from '@_types/Time' +import { + LlamaServiceSettings, + LlamaServiceType, + LlamaTaskType +} from '@inference/_types/CommonTypes' +import { InferenceChunkingSettings } from '@inference/_types/Services' + +/** + * Create a Llama inference endpoint. + * + * Create an inference endpoint to perform an inference task with the `llama` service. + * @rest_spec_name inference.put_llama + * @availability stack since=9.2.0 stability=stable visibility=public + * @availability serverless stability=stable visibility=public + * @cluster_privileges manage_inference + * @doc_id inference-api-put-llama + */ +export interface Request extends RequestBase { + urls: [ + { + path: '/_inference/{task_type}/{llama_inference_id}' + methods: ['PUT'] + } + ] + path_parts: { + /** + * The type of the inference task that the model will perform. + */ + task_type: LlamaTaskType + /** + * The unique identifier of the inference endpoint. + */ + llama_inference_id: Id + } + query_parameters: { + /** + * Specifies the amount of time to wait for the inference endpoint to be created. + * @server_default 30s + */ + timeout?: Duration + } + body: { + /** + * The chunking configuration object. + * @ext_doc_id inference-chunking + */ + chunking_settings?: InferenceChunkingSettings + /** + * The type of service supported for the specified task type. In this case, `llama`. + */ + service: LlamaServiceType + /** + * Settings used to install the inference model. These settings are specific to the `llama` service. + */ + service_settings: LlamaServiceSettings + } +} diff --git a/specification/inference/put_llama/PutLlamaResponse.ts b/specification/inference/put_llama/PutLlamaResponse.ts new file mode 100644 index 0000000000..858e05875b --- /dev/null +++ b/specification/inference/put_llama/PutLlamaResponse.ts @@ -0,0 +1,25 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { InferenceEndpointInfoLlama } from '@inference/_types/Services' + +export class Response { + /** @codegen_name endpoint_info */ + body: InferenceEndpointInfoLlama +} diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml new file mode 100644 index 0000000000..d23940fce6 --- /dev/null +++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml @@ -0,0 +1,13 @@ +# summary: +description: Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task. 
+method_request: 'PUT _inference/text_embedding/llama-text-embedding'
+# type: "request"
+value: |-
+ {
+ "service": "llama",
+ "service_settings": {
+ "url": "http://localhost:8321/v1/inference/embeddings",
+ "api_key": "llama-api-key",
+ "model_id": "all-MiniLM-L6-v2"
+ }
+ }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
new file mode 100644
index 0000000000..bc7b2d39f5
--- /dev/null
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
@@ -0,0 +1,13 @@
+# summary:
+description: Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.
+method_request: 'PUT _inference/completion/llama-text-completion'
+# type: "request"
+value: |-
+ {
+ "service": "llama",
+ "service_settings": {
+ "url": "http://localhost:8321/v1/openai/v1/chat/completions",
+ "api_key": "llama-api-key",
+ "model_id": "llama3.2:3b"
+ }
+ }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
new file mode 100644
index 0000000000..476c729d84
--- /dev/null
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
@@ -0,0 +1,13 @@
+# summary:
+description: Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.
+method_request: 'PUT _inference/chat-completion/llama-text-chat-completion'
+# type: "request"
+value: |-
+ {
+ "service": "llama",
+ "service_settings": {
+ "url": "http://localhost:8321/v1/openai/v1/chat/completions",
+ "api_key": "llama-api-key",
+ "model_id": "llama3.2:3b"
+ }
+ }

From 659c6ca104bbd166e20241dfcdb06a10768bf203 Mon Sep 17 00:00:00 2001
From: Jan Kazlouski
Date: Tue, 22 Jul 2025 17:15:56 +0000
Subject: [PATCH 2/6] Fix typos

---
 output/openapi/elasticsearch-openapi.json            | 4 ++--
 output/openapi/elasticsearch-serverless-openapi.json | 4 ++--
 output/schema/schema.json                            | 8 ++++----
 .../examples/request/PutLlamaRequestExample2.yaml    | 4 ++--
 .../examples/request/PutLlamaRequestExample3.yaml    | 4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
index f75453f44c..4a4b3db7bd 100644
--- a/output/openapi/elasticsearch-openapi.json
+++ b/output/openapi/elasticsearch-openapi.json
@@ -22303,11 +22303,11 @@
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}"
 },
 "PutLlamaRequestExample2": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.",
+ "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 },
 "PutLlamaRequestExample3": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.",
+ "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 }
 }
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
index 5d4b6cd28d..9da3c48741 100644
--- a/output/openapi/elasticsearch-serverless-openapi.json
+++ b/output/openapi/elasticsearch-serverless-openapi.json
@@ -13098,11 +13098,11 @@
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}"
 },
 "PutLlamaRequestExample2": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.",
+ "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 },
 "PutLlamaRequestExample3": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.",
+ "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 }
 }
diff --git a/output/schema/schema.json b/output/schema/schema.json
index f940da504b..e319788a19 100644
--- a/output/schema/schema.json
+++ b/output/schema/schema.json
@@ -176210,13 +176210,13 @@
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}"
 },
 "PutLlamaRequestExample2": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.",
- "method_request": "PUT _inference/completion/llama-text-completion",
+ "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
+ "method_request": "PUT _inference/completion/llama-completion",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 },
 "PutLlamaRequestExample3": {
- "description": "Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.",
- "method_request": "PUT _inference/chat-completion/llama-text-chat-completion",
+ "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
+ "method_request": "PUT _inference/chat-completion/llama-chat-completion",
 "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"llama3.2:3b\" \n }\n}"
 }
 },
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
index bc7b2d39f5..1a8417eaa2 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
@@ -1,6 +1,6 @@
 # summary:
-description: Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `completion` task.
-method_request: 'PUT _inference/completion/llama-text-completion'
+description: Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.
+method_request: 'PUT _inference/completion/llama-completion'
 # type: "request"
 value: |-
 {
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
index 476c729d84..b7c510305a 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
@@ -1,6 +1,6 @@
 # summary:
-description: Run `PUT _inference/completion/llama-text-completion` to create a Llama inference endpoint that performs a `chat-completion` task.
-method_request: 'PUT _inference/chat-completion/llama-text-chat-completion'
+description: Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.
+method_request: 'PUT _inference/chat-completion/llama-chat-completion'
 # type: "request"
 value: |-
 {

From 23dd73fdacf474159d880f207dcd9229d1308cb5 Mon Sep 17 00:00:00 2001
From: Jan Kazlouski
Date: Wed, 23 Jul 2025 12:03:06 +0000
Subject: [PATCH 3/6] Fixed Typo

---
 specification/inference/_types/CommonTypes.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts
index bdafa4b942..71800e52f0 100644
--- a/specification/inference/_types/CommonTypes.ts
+++ b/specification/inference/_types/CommonTypes.ts
@@ -1387,7 +1387,7 @@ export class LlamaServiceSettings {
 url: string
 /**
 * The name of the model to use for the inference task.
- * Refer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.
+ * Refer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.
 * Service has been tested and confirmed to be working with the following models:
 * * For `text_embedding` task - `all-MiniLM-L6-v2`.
 * * For `completion` and `chat_completion` tasks - `llama3.2:3b`.
From 73fc8afbed031f90ec4dd984ab60b790e70dc4c3 Mon Sep 17 00:00:00 2001 From: Jan Kazlouski Date: Wed, 23 Jul 2025 12:11:16 +0000 Subject: [PATCH 4/6] Update json outputs --- output/openapi/elasticsearch-openapi.json | 4 ++-- output/openapi/elasticsearch-serverless-openapi.json | 4 ++-- output/schema/schema.json | 2 +- package-lock.json | 2 ++ 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 753a8eb413..47688dc63a 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -22334,7 +22334,7 @@ "x-state": "Generally available; Added in 9.2.0", "x-metaTags": [ { - "content": "elasticsearch, Machine Learning", + "content": "Elasticsearch, Machine Learning", "name": "product_name" } ] @@ -90111,7 +90111,7 @@ "externalDocs": { "url": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" }, - "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", "type": "string" }, "api_key": { diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index 162767d304..c0142c072f 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -13129,7 +13129,7 @@ "x-state": "Generally available", "x-metaTags": [ { - "content": "elasticsearch, Machine Learning", + "content": "Elasticsearch, Machine Learning", "name": "product_name" } ] @@ -57433,7 +57433,7 @@ "externalDocs": { "url": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" }, - "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", "type": "string" }, "api_key": { diff --git a/output/schema/schema.json b/output/schema/schema.json index 83747f6050..cae8cf13c3 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -171035,7 +171035,7 @@ } }, { - "description": "The name of the model to use for the inference 
task.\nRefer to the Llama downloading models documentation for different ways of getting list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", "extDocId": "llama-api-models", "extDocUrl": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/", "name": "model_id", diff --git a/package-lock.json b/package-lock.json index d72f33ac58..cd855fd21a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,9 +1,11 @@ { "name": "elasticsearch-specification", + "version": "overlay", "lockfileVersion": 3, "requires": true, "packages": { "": { + "version": "overlay", "dependencies": { "@redocly/cli": "^1.34.5", "@stoplight/spectral-cli": "^6.14.2" From 207638f21f9a9feb50e5ba835db1680b1ad3cbde Mon Sep 17 00:00:00 2001 From: Jan Kazlouski Date: Tue, 29 Jul 2025 22:18:56 +0000 Subject: [PATCH 5/6] Update specification --- output/openapi/elasticsearch-openapi.json | 274 ++++++- .../elasticsearch-serverless-openapi.json | 270 +++++- output/schema/schema.json | 770 +++++++++++++++--- output/typescript/types.ts | 5 + specification/inference/_types/CommonTypes.ts | 14 +- .../inference/put_llama/PutLlamaRequest.ts | 6 + .../request/PutLlamaRequestExample1.yaml | 3 +- 7 files changed, 1193 insertions(+), 149 deletions(-) diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index dd57f97ea3..698110cc99 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -10631,7 +10631,7 @@ { "in": "query", "name": "format", - "description": "A short version of the Accept header, for example `json` or `yaml`.", + "description": "A short version of the Accept header, e.g. json, yaml.\n\n`csv`, `tsv`, and `txt` formats will return results in a tabular format, excluding other metadata fields from the response.\n\nFor async requests, nothing will be returned if the async query doesn't finish within the timeout.\nThe query ID and running status are available in the `X-Elasticsearch-Async-Id` and `X-Elasticsearch-Async-Is-Running` HTTP headers of the response, respectively.", "deprecated": false, "schema": { "$ref": "#/components/schemas/esql._types.EsqlFormat" @@ -11048,7 +11048,7 @@ { "in": "query", "name": "format", - "description": "A short version of the Accept header, e.g. json, yaml.", + "description": "A short version of the Accept header, e.g. 
json, yaml.\n\n`csv`, `tsv`, and `txt` formats will return results in a tabular format, excluding other metadata fields from the response.", "deprecated": false, "schema": { "$ref": "#/components/schemas/esql._types.EsqlFormat" @@ -20573,7 +20573,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -20694,7 +20694,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -21026,6 +21026,96 @@ ] } }, + "/_inference/{task_type}/{amazonsagemaker_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an Amazon SageMaker inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `amazon_sagemaker` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-amazonsagemaker", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.TaskTypeAmazonSageMaker" + }, + "style": "simple" + }, + { + "in": "path", + "name": "amazonsagemaker_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference._types.AmazonSageMakerServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference._types.AmazonSageMakerServiceSettings" + }, + "task_settings": { + "$ref": "#/components/schemas/inference._types.AmazonSageMakerTaskSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoAmazonSageMaker" + } + } + } + } + }, + "x-state": "Generally available; Added in 9.1.0", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": 
"product_name" + } + ] + } + }, "/_inference/{task_type}/{anthropic_inference_id}": { "put": { "tags": [ @@ -22295,6 +22385,9 @@ }, "service_settings": { "$ref": "#/components/schemas/inference._types.LlamaServiceSettings" + }, + "task_settings": { + "$ref": "#/components/schemas/inference._types.LlamaTaskSettings" } }, "required": [ @@ -22305,7 +22398,7 @@ "examples": { "PutLlamaRequestExample1": { "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", - "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/embeddings\"\n \"dimensions\": 384,\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" }, "PutLlamaRequestExample2": { "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.", @@ -83475,9 +83568,11 @@ "type": "object", "properties": { "id": { + "description": "The ID of the async query, to be used in subsequent requests to check the status or retrieve results.\n\nAlso available in the `X-Elasticsearch-Async-Id` HTTP header.", "type": "string" }, "is_running": { + "description": "Indicates whether the async query is still running or has completed.\n\nAlso available in the `X-Elasticsearch-Async-Is-Running` HTTP header.", "type": "boolean" } }, @@ -88963,6 +89058,162 @@ "completion" ] }, + "inference._types.TaskTypeAmazonSageMaker": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion", + "sparse_embedding", + "rerank" + ] + }, + "inference._types.AmazonSageMakerServiceType": { + "type": "string", + "enum": [ + "amazon_sagemaker" + ] + }, + "inference._types.AmazonSageMakerServiceSettings": { + "type": "object", + "properties": { + "access_key": { + "description": "A valid AWS access key that has permissions to use Amazon SageMaker and access to models for invoking requests.", + "type": "string" + }, + "endpoint_name": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The name of the SageMaker endpoint.", + "type": "string" + }, + "api": { + "$ref": "#/components/schemas/inference._types.AmazonSageMakerApi" + }, + "region": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The region that your endpoint or Amazon Resource Name (ARN) is deployed in.\nThe list of available regions per model can be found in the Amazon SageMaker documentation.", + "type": "string" + }, + "secret_key": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html" + }, + "description": "A valid AWS secret key that is paired with the `access_key`.\nFor information about creating and managing access and secret keys, refer to the AWS documentation.", + "type": "string" + }, + "target_model": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The model ID when calling a multi-model endpoint.", + "type": "string" + }, + "target_container_hostname": { + "externalDocs": { + "url": 
"https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The container to directly invoke when calling a multi-container endpoint.", + "type": "string" + }, + "inference_component_name": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The inference component to directly invoke when calling a multi-component endpoint.", + "type": "string" + }, + "batch_size": { + "description": "The maximum number of inputs in each batch. This value is used by inference ingestion pipelines\nwhen processing semantic values. It correlates to the number of times the SageMaker endpoint is\ninvoked (one per batch of input).", + "default": 256.0, + "type": "number" + }, + "dimensions": { + "description": "The number of dimensions returned by the text embedding models. If this value is not provided, then\nit is guessed by making invoking the endpoint for the `text_embedding` task.", + "type": "number" + } + }, + "required": [ + "access_key", + "endpoint_name", + "api", + "region", + "secret_key" + ] + }, + "inference._types.AmazonSageMakerApi": { + "type": "string", + "enum": [ + "openai", + "elastic" + ] + }, + "inference._types.AmazonSageMakerTaskSettings": { + "type": "object", + "properties": { + "custom_attributes": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The AWS custom attributes passed verbatim through to the model running in the SageMaker Endpoint.\nValues will be returned in the `X-elastic-sagemaker-custom-attributes` header.", + "type": "string" + }, + "enable_explanations": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The optional JMESPath expression used to override the EnableExplanations provided during endpoint creation.", + "type": "string" + }, + "inference_id": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The capture data ID when enabled in the endpoint.", + "type": "string" + }, + "session_id": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The stateful session identifier for a new or existing session.\nNew sessions will be returned in the `X-elastic-sagemaker-new-session-id` header.\nClosed sessions will be returned in the `X-elastic-sagemaker-closed-session-id` header.", + "type": "string" + }, + "target_variant": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "Specifies the variant when running with multi-variant Endpoints.", + "type": "string" + } + } + }, + "inference._types.InferenceEndpointInfoAmazonSageMaker": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeAmazonSageMaker" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, "inference._types.AnthropicTaskType": { "type": "string", "enum": [ @@ -90104,7 +90355,7 @@ "type": "object", "properties": { "url": { - "description": "The URL 
endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/openai/v1/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", "type": "string" }, "model_id": { @@ -90123,7 +90374,7 @@ "type": "number" }, "dimensions": { - "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.", + "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings must have.\nIt is supported only in `text-embedding-3` and later models. If it is not set by user, it defaults to the model returned dimensions.\nIf model returns embeddings with a different number of dimensions, error is returned.", "type": "number" }, "similarity": { @@ -90146,6 +90397,15 @@ "l2_norm" ] }, + "inference._types.LlamaTaskSettings": { + "type": "object", + "properties": { + "user": { + "description": "For a `completion` or `text_embedding` task, specify the user issuing the request.\nThis information can be used for abuse detection.", + "type": "string" + } + } + }, "inference._types.InferenceEndpointInfoLlama": { "allOf": [ { diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index ac322f686d..b9afb949cb 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -6246,7 +6246,7 @@ { "in": "query", "name": "format", - "description": "A short version of the Accept header, e.g. json, yaml.", + "description": "A short version of the Accept header, e.g. json, yaml.\n\n`csv`, `tsv`, and `txt` formats will return results in a tabular format, excluding other metadata fields from the response.", "deprecated": false, "schema": { "$ref": "#/components/schemas/esql._types.EsqlFormat" @@ -11368,7 +11368,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
      "operationId": "inference-put",
      "parameters": [
        {
@@ -11489,7 +11489,7 @@
        "inference"
      ],
      "summary": "Create an inference endpoint",
-      "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
      "operationId": "inference-put-1",
      "parameters": [
        {
@@ -11821,6 +11821,96 @@
          ]
        }
      }
    },
+    "/_inference/{task_type}/{amazonsagemaker_inference_id}": {
+      "put": {
+        "tags": [
+          "inference"
+        ],
+        "summary": "Create an Amazon SageMaker inference endpoint",
+        "description": "Create an inference endpoint to perform an inference task with the `amazon_sagemaker` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n",
+        "operationId": "inference-put-amazonsagemaker",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "task_type",
+            "description": "The type of the inference task that the model will perform.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/inference._types.TaskTypeAmazonSageMaker"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "path",
+            "name": "amazonsagemaker_inference_id",
+            "description": "The unique identifier of the inference endpoint.",
+            "required": true,
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Id"
+            },
+            "style": "simple"
+          },
+          {
+            "in": "query",
+            "name": "timeout",
+            "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+            "deprecated": false,
+            "schema": {
+              "$ref": "#/components/schemas/_types.Duration"
+            },
+            "style": "form"
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "properties": {
+                  "chunking_settings": {
+                    "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings"
+                  },
+                  "service": {
+                    "$ref": "#/components/schemas/inference._types.AmazonSageMakerServiceType"
+                  },
+                  "service_settings": {
+                    "$ref": "#/components/schemas/inference._types.AmazonSageMakerServiceSettings"
+                  },
+                  "task_settings": {
+                    "$ref": "#/components/schemas/inference._types.AmazonSageMakerTaskSettings"
+                  }
+                },
+                "required": [
+                  "service",
+                  "service_settings"
+                ]
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoAmazonSageMaker"
+                }
+              }
+            }
+          }
+        },
+        "x-state": "Generally available",
+        "x-metaTags": [
+          {
+            "content": "Elasticsearch, Machine Learning",
+            "name": "product_name"
+          }
+        ]
+      }
+    },
    "/_inference/{task_type}/{anthropic_inference_id}": {
      "put": {
        "tags": [
@@ -13090,6 +13180,9 @@
              },
              "service_settings": {
                "$ref": "#/components/schemas/inference._types.LlamaServiceSettings"
+              },
+              "task_settings": {
+                "$ref": "#/components/schemas/inference._types.LlamaTaskSettings"
              }
            },
            "required": [
@@ -13100,7 +13193,7 @@
          "examples": {
            "PutLlamaRequestExample1": {
              "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.",
-              "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"all-MiniLM-L6-v2\" \n    }\n}"
+              "value": "{\n    \"service\": \"llama\",\n    \"service_settings\": {\n        \"url\": \"http://localhost:8321/v1/openai/v1/embeddings\",\n        \"dimensions\": 384,\n        \"api_key\": \"llama-api-key\",\n        \"model_id\": \"all-MiniLM-L6-v2\"\n    }\n}"
            },
            "PutLlamaRequestExample2": {
              "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
@@ -56285,6 +56378,162 @@
          "completion"
        ]
      },
+      "inference._types.TaskTypeAmazonSageMaker": {
+        "type": "string",
+        "enum": [
+          "text_embedding",
+          "completion",
+          "chat_completion",
+          "sparse_embedding",
+          "rerank"
+        ]
+      },
+      "inference._types.AmazonSageMakerServiceType": {
+        "type": "string",
+        "enum": [
+          "amazon_sagemaker"
+        ]
+      },
+      "inference._types.AmazonSageMakerServiceSettings": {
+        "type": "object",
+        "properties": {
+          "access_key": {
+            "description": "A valid AWS access key that has permissions to use Amazon SageMaker and access to models for invoking requests.",
+            "type": "string"
+          },
+          "endpoint_name": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html"
+            },
+            "description": "The name of the SageMaker endpoint.",
+            "type": "string"
+          },
+          "api": {
+            "$ref": "#/components/schemas/inference._types.AmazonSageMakerApi"
+          },
+          "region": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html"
+            },
+            "description": "The region that your endpoint or Amazon Resource Name (ARN) is deployed in.\nThe list of available regions per model can be found in the Amazon SageMaker documentation.",
+            "type": "string"
+          },
+          "secret_key": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html"
+            },
+            "description": "A valid AWS secret key that is paired with the `access_key`.\nFor information about creating and managing access and secret keys, refer to the AWS documentation.",
+            "type": "string"
+          },
+          "target_model": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html"
+            },
+            "description": "The model ID when calling a multi-model endpoint.",
+            "type": "string"
+          },
+          "target_container_hostname": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html"
+            },
+            "description": "The container to directly invoke when calling a multi-container endpoint.",
+            "type": "string"
+          },
+          "inference_component_name": {
+            "externalDocs": {
+              "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html"
+            },
+            "description": "The inference component to directly invoke when calling a multi-component endpoint.",
+            "type": "string"
+          },
+          
"batch_size": { + "description": "The maximum number of inputs in each batch. This value is used by inference ingestion pipelines\nwhen processing semantic values. It correlates to the number of times the SageMaker endpoint is\ninvoked (one per batch of input).", + "default": 256.0, + "type": "number" + }, + "dimensions": { + "description": "The number of dimensions returned by the text embedding models. If this value is not provided, then\nit is guessed by making invoking the endpoint for the `text_embedding` task.", + "type": "number" + } + }, + "required": [ + "access_key", + "endpoint_name", + "api", + "region", + "secret_key" + ] + }, + "inference._types.AmazonSageMakerApi": { + "type": "string", + "enum": [ + "openai", + "elastic" + ] + }, + "inference._types.AmazonSageMakerTaskSettings": { + "type": "object", + "properties": { + "custom_attributes": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The AWS custom attributes passed verbatim through to the model running in the SageMaker Endpoint.\nValues will be returned in the `X-elastic-sagemaker-custom-attributes` header.", + "type": "string" + }, + "enable_explanations": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The optional JMESPath expression used to override the EnableExplanations provided during endpoint creation.", + "type": "string" + }, + "inference_id": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The capture data ID when enabled in the endpoint.", + "type": "string" + }, + "session_id": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "The stateful session identifier for a new or existing session.\nNew sessions will be returned in the `X-elastic-sagemaker-new-session-id` header.\nClosed sessions will be returned in the `X-elastic-sagemaker-closed-session-id` header.", + "type": "string" + }, + "target_variant": { + "externalDocs": { + "url": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html" + }, + "description": "Specifies the variant when running with multi-variant Endpoints.", + "type": "string" + } + } + }, + "inference._types.InferenceEndpointInfoAmazonSageMaker": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeAmazonSageMaker" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, "inference._types.AnthropicTaskType": { "type": "string", "enum": [ @@ -57426,7 +57675,7 @@ "type": "object", "properties": { "url": { - "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/openai/v1/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", "type": "string" }, "model_id": { @@ -57445,7 +57694,7 @@ "type": 
"number" }, "dimensions": { - "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.", + "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings must have.\nIt is supported only in `text-embedding-3` and later models. If it is not set by user, it defaults to the model returned dimensions.\nIf model returns embeddings with a different number of dimensions, error is returned.", "type": "number" }, "similarity": { @@ -57468,6 +57717,15 @@ "l2_norm" ] }, + "inference._types.LlamaTaskSettings": { + "type": "object", + "properties": { + "user": { + "description": "For a `completion` or `text_embedding` task, specify the user issuing the request.\nThis information can be used for abuse detection.", + "type": "string" + } + } + }, "inference._types.InferenceEndpointInfoLlama": { "allOf": [ { diff --git a/output/schema/schema.json b/output/schema/schema.json index c255db198d..3070c20e57 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9920,7 +9920,7 @@ "visibility": "public" } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)",
      "docId": "inference-api-put",
      "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put",
      "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/put-inference-api.html",
@@ -10054,20 +10054,38 @@
    },
    {
      "availability": {
+        "serverless": {
+          "stability": "stable",
+          "visibility": "public"
+        },
        "stack": {
+          "since": "9.1.0",
          "stability": "stable",
          "visibility": "public"
        }
      },
-      "description": "Configure a Amazon SageMaker inference endpoint",
-      "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-amazon-sagemaker.html",
+      "description": "Create an Amazon SageMaker inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `amazon_sagemaker` service.",
+      "docId": "inference-api-put-amazonsagemaker",
+      "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-amazonsagemaker",
+      "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-amazon-sagemaker.html",
      "name": "inference.put_amazonsagemaker",
-      "request": null,
+      "privileges": {
+        "cluster": [
+          "manage_inference"
+        ]
+      },
+      "request": {
+        "name": "Request",
+        "namespace": "inference.put_amazonsagemaker"
+      },
      "requestBodyRequired": false,
      "requestMediaType": [
        "application/json"
      ],
-      "response": null,
+      "response": {
+        "name": "Response",
+        "namespace": "inference.put_amazonsagemaker"
+      },
      "responseMediaType": [
        "application/json"
      ],
@@ -137535,6 +137553,7 @@
      },
      "properties": [
        {
+          "description": "The ID of the async query, to be used in subsequent requests to check the status or retrieve results.\n\nAlso available in the `X-Elasticsearch-Async-Id` HTTP header.",
          "name": "id",
          "required": false,
          "type": {
@@ -137546,6 +137565,7 @@
          }
        },
        {
+          "description": "Indicates whether the async query is still running or has completed.\n\nAlso available in the `X-Elasticsearch-Async-Is-Running` HTTP header.",
          "name": "is_running",
          "required": true,
          "type": {
@@ -137557,7 +137577,7 @@
          }
        }
      ],
-      "specLocation": "esql/_types/EsqlResult.ts#L45-L48"
+      "specLocation": "esql/_types/EsqlResult.ts#L45-L58"
    },
    {
      "kind": "interface",
@@ -137634,7 +137654,7 @@
        }
      }
    ],
-    "specLocation": "esql/_types/EsqlResult.ts#L65-L71"
+    "specLocation": 
"esql/_types/EsqlResult.ts#L75-L81" }, { "kind": "interface", @@ -137732,7 +137752,7 @@ } } ], - "specLocation": "esql/_types/EsqlResult.ts#L55-L63" + "specLocation": "esql/_types/EsqlResult.ts#L65-L73" }, { "kind": "enum", @@ -137757,7 +137777,7 @@ "name": "EsqlClusterStatus", "namespace": "esql._types" }, - "specLocation": "esql/_types/EsqlResult.ts#L73-L79" + "specLocation": "esql/_types/EsqlResult.ts#L83-L89" }, { "kind": "interface", @@ -137789,7 +137809,7 @@ } } ], - "specLocation": "esql/_types/EsqlResult.ts#L50-L53" + "specLocation": "esql/_types/EsqlResult.ts#L60-L63" }, { "kind": "enum", @@ -137995,7 +138015,7 @@ } } ], - "specLocation": "esql/_types/EsqlResult.ts#L88-L93" + "specLocation": "esql/_types/EsqlResult.ts#L98-L103" }, { "kind": "interface", @@ -138049,7 +138069,7 @@ } } ], - "specLocation": "esql/_types/EsqlResult.ts#L81-L86" + "specLocation": "esql/_types/EsqlResult.ts#L91-L96" }, { "kind": "interface", @@ -138492,7 +138512,7 @@ } }, { - "description": "A short version of the Accept header, for example `json` or `yaml`.", + "description": "A short version of the Accept header, e.g. json, yaml.\n\n`csv`, `tsv`, and `txt` formats will return results in a tabular format, excluding other metadata fields from the response.\n\nFor async requests, nothing will be returned if the async query doesn't finish within the timeout.\nThe query ID and running status are available in the `X-Elasticsearch-Async-Id` and `X-Elasticsearch-Async-Is-Running` HTTP headers of the response, respectively.", "name": "format", "required": false, "type": { @@ -138504,7 +138524,7 @@ } } ], - "specLocation": "esql/async_query/AsyncQueryRequest.ts#L28-L133" + "specLocation": "esql/async_query/AsyncQueryRequest.ts#L28-L138" }, { "kind": "response", @@ -139267,7 +139287,7 @@ "path": [], "query": [ { - "description": "A short version of the Accept header, e.g. json, yaml.", + "description": "A short version of the Accept header, e.g. 
json, yaml.\n\n`csv`, `tsv`, and `txt` formats will return results in a tabular format, excluding other metadata fields from the response.", "name": "format", "required": false, "type": { @@ -139317,7 +139337,7 @@ } } ], - "specLocation": "esql/query/QueryRequest.ts#L27-L113" + "specLocation": "esql/query/QueryRequest.ts#L27-L115" }, { "kind": "response", @@ -167947,6 +167967,258 @@ }, "specLocation": "inference/_types/CommonTypes.ts#L436-L439" }, + { + "kind": "enum", + "members": [ + { + "name": "openai" + }, + { + "name": "elastic" + } + ], + "name": { + "name": "AmazonSageMakerApi", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L501-L504" + }, + { + "kind": "interface", + "name": { + "name": "AmazonSageMakerServiceSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "A valid AWS access key that has permissions to use Amazon SageMaker and access to models for invoking requests.", + "name": "access_key", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the SageMaker endpoint.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "endpoint_name", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The API format to use when calling SageMaker.\nElasticsearch will convert the POST _inference request to this data format when invoking the SageMaker endpoint.", + "name": "api", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "AmazonSageMakerApi", + "namespace": "inference._types" + } + } + }, + { + "description": "The region that your endpoint or Amazon Resource Name (ARN) is deployed in.\nThe list of available regions per model can be found in the Amazon SageMaker documentation.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "region", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "A valid AWS secret key that is paired with the `access_key`.\nFor information about creating and managing access and secret keys, refer to the AWS documentation.", + "extDocId": "amazonsagemaker-secret-keys", + "extDocUrl": "https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html", + "name": "secret_key", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The model ID when calling a multi-model endpoint.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "target_model", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The container to directly invoke when calling a multi-container endpoint.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "target_container_hostname", + "required": false, + "type": { + "kind": "instance_of", + "type": { + 
"name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The inference component to directly invoke when calling a multi-component endpoint.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "inference_component_name", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The maximum number of inputs in each batch. This value is used by inference ingestion pipelines\nwhen processing semantic values. It correlates to the number of times the SageMaker endpoint is\ninvoked (one per batch of input).", + "name": "batch_size", + "required": false, + "serverDefault": 256, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The number of dimensions returned by the text embedding models. If this value is not provided, then\nit is guessed by making invoking the endpoint for the `text_embedding` task.", + "name": "dimensions", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L445-L499" + }, + { + "kind": "enum", + "members": [ + { + "name": "amazon_sagemaker" + } + ], + "name": { + "name": "AmazonSageMakerServiceType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L584-L586" + }, + { + "kind": "interface", + "name": { + "name": "AmazonSageMakerTaskSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The AWS custom attributes passed verbatim through to the model running in the SageMaker Endpoint.\nValues will be returned in the `X-elastic-sagemaker-custom-attributes` header.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "custom_attributes", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The optional JMESPath expression used to override the EnableExplanations provided during endpoint creation.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "enable_explanations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The capture data ID when enabled in the endpoint.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "inference_id", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The stateful session identifier for a new or existing session.\nNew sessions will be returned in the `X-elastic-sagemaker-new-session-id` header.\nClosed sessions will be returned in the `X-elastic-sagemaker-closed-session-id` header.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "session_id", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": 
"string", + "namespace": "_builtins" + } + } + }, + { + "description": "Specifies the variant when running with multi-variant Endpoints.", + "extDocId": "amazonsagemaker-invoke", + "extDocUrl": "https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html", + "name": "target_variant", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L535-L564" + }, { "kind": "interface", "name": { @@ -167992,7 +168264,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L445-L461" + "specLocation": "inference/_types/CommonTypes.ts#L588-L604" }, { "kind": "enum", @@ -168005,7 +168277,7 @@ "name": "AnthropicServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L494-L496" + "specLocation": "inference/_types/CommonTypes.ts#L637-L639" }, { "kind": "interface", @@ -168065,7 +168337,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L463-L488" + "specLocation": "inference/_types/CommonTypes.ts#L606-L631" }, { "kind": "enum", @@ -168078,7 +168350,7 @@ "name": "AnthropicTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L490-L492" + "specLocation": "inference/_types/CommonTypes.ts#L633-L635" }, { "kind": "interface", @@ -168152,7 +168424,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L498-L540" + "specLocation": "inference/_types/CommonTypes.ts#L641-L683" }, { "kind": "enum", @@ -168165,7 +168437,7 @@ "name": "AzureAiStudioServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L587-L589" + "specLocation": "inference/_types/CommonTypes.ts#L730-L732" }, { "kind": "interface", @@ -168260,7 +168532,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L542-L579" + "specLocation": "inference/_types/CommonTypes.ts#L685-L722" }, { "kind": "enum", @@ -168279,7 +168551,7 @@ "name": "AzureAiStudioTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L581-L585" + "specLocation": "inference/_types/CommonTypes.ts#L724-L728" }, { "kind": "interface", @@ -168371,7 +168643,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L591-L636" + "specLocation": "inference/_types/CommonTypes.ts#L734-L779" }, { "kind": "enum", @@ -168384,7 +168656,7 @@ "name": "AzureOpenAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L651-L653" + "specLocation": "inference/_types/CommonTypes.ts#L794-L796" }, { "kind": "interface", @@ -168406,7 +168678,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L638-L644" + "specLocation": "inference/_types/CommonTypes.ts#L781-L787" }, { "kind": "enum", @@ -168422,7 +168694,7 @@ "name": "AzureOpenAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L646-L649" + "specLocation": "inference/_types/CommonTypes.ts#L789-L792" }, { "kind": "enum", @@ -168447,7 +168719,7 @@ "name": "CohereEmbeddingType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L708-L714" + "specLocation": "inference/_types/CommonTypes.ts#L851-L857" }, { "kind": "enum", @@ -168469,7 +168741,7 @@ "name": "CohereInputType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L716-L721" + "specLocation": "inference/_types/CommonTypes.ts#L859-L864" }, { "kind": "interface", @@ 
-168542,7 +168814,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L655-L696" + "specLocation": "inference/_types/CommonTypes.ts#L798-L839" }, { "kind": "enum", @@ -168555,7 +168827,7 @@ "name": "CohereServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L704-L706" + "specLocation": "inference/_types/CommonTypes.ts#L847-L849" }, { "kind": "enum", @@ -168574,7 +168846,7 @@ "name": "CohereSimilarityType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L723-L727" + "specLocation": "inference/_types/CommonTypes.ts#L866-L870" }, { "kind": "interface", @@ -168632,7 +168904,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L735-L767" + "specLocation": "inference/_types/CommonTypes.ts#L878-L910" }, { "kind": "enum", @@ -168651,7 +168923,7 @@ "name": "CohereTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L698-L702" + "specLocation": "inference/_types/CommonTypes.ts#L841-L845" }, { "kind": "enum", @@ -168670,7 +168942,7 @@ "name": "CohereTruncateType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L729-L733" + "specLocation": "inference/_types/CommonTypes.ts#L872-L876" }, { "kind": "interface", @@ -168953,7 +169225,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L839-L850" + "specLocation": "inference/_types/CommonTypes.ts#L982-L993" }, { "kind": "interface", @@ -168971,7 +169243,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L852-L990" + "specLocation": "inference/_types/CommonTypes.ts#L995-L1133" }, { "kind": "interface", @@ -169049,7 +169321,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L769-L837" + "specLocation": "inference/_types/CommonTypes.ts#L912-L980" }, { "kind": "enum", @@ -169062,7 +169334,7 @@ "name": "CustomServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L999-L1001" + "specLocation": "inference/_types/CommonTypes.ts#L1142-L1144" }, { "kind": "interface", @@ -169080,7 +169352,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1003-L1017" + "specLocation": "inference/_types/CommonTypes.ts#L1146-L1160" }, { "kind": "enum", @@ -169102,7 +169374,7 @@ "name": "CustomTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L992-L997" + "specLocation": "inference/_types/CommonTypes.ts#L1135-L1140" }, { "kind": "interface", @@ -169150,7 +169422,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1031-L1053" + "specLocation": "inference/_types/CommonTypes.ts#L1174-L1196" }, { "kind": "enum", @@ -169163,7 +169435,7 @@ "name": "DeepSeekServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1055-L1057" + "specLocation": "inference/_types/CommonTypes.ts#L1198-L1200" }, { "kind": "interface", @@ -169304,7 +169576,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1080-L1114" + "specLocation": "inference/_types/CommonTypes.ts#L1223-L1257" }, { "kind": "enum", @@ -169317,7 +169589,7 @@ "name": "ElasticsearchServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1130-L1132" + "specLocation": "inference/_types/CommonTypes.ts#L1273-L1275" }, { "kind": "interface", @@ -169340,7 +169612,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1116-L1122" + "specLocation": "inference/_types/CommonTypes.ts#L1259-L1265" }, { "kind": 
"enum", @@ -169359,7 +169631,7 @@ "name": "ElasticsearchTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1124-L1128" + "specLocation": "inference/_types/CommonTypes.ts#L1267-L1271" }, { "kind": "interface", @@ -169405,7 +169677,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1134-L1160" + "specLocation": "inference/_types/CommonTypes.ts#L1277-L1303" }, { "kind": "enum", @@ -169418,7 +169690,7 @@ "name": "ElserServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1166-L1168" + "specLocation": "inference/_types/CommonTypes.ts#L1309-L1311" }, { "kind": "enum", @@ -169431,7 +169703,7 @@ "name": "ElserTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1162-L1164" + "specLocation": "inference/_types/CommonTypes.ts#L1305-L1307" }, { "kind": "enum", @@ -169444,7 +169716,7 @@ "name": "GoogleAiServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1193-L1195" + "specLocation": "inference/_types/CommonTypes.ts#L1336-L1338" }, { "kind": "interface", @@ -169492,7 +169764,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1170-L1186" + "specLocation": "inference/_types/CommonTypes.ts#L1313-L1329" }, { "kind": "enum", @@ -169508,7 +169780,7 @@ "name": "GoogleAiStudioTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1188-L1191" + "specLocation": "inference/_types/CommonTypes.ts#L1331-L1334" }, { "kind": "interface", @@ -169582,7 +169854,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1197-L1223" + "specLocation": "inference/_types/CommonTypes.ts#L1340-L1366" }, { "kind": "enum", @@ -169595,7 +169867,7 @@ "name": "GoogleVertexAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1243-L1245" + "specLocation": "inference/_types/CommonTypes.ts#L1386-L1388" }, { "kind": "interface", @@ -169629,7 +169901,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1225-L1234" + "specLocation": "inference/_types/CommonTypes.ts#L1368-L1377" }, { "kind": "enum", @@ -169651,7 +169923,7 @@ "name": "GoogleVertexAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1236-L1241" + "specLocation": "inference/_types/CommonTypes.ts#L1379-L1384" }, { "kind": "interface", @@ -169713,7 +169985,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1247-L1279" + "specLocation": "inference/_types/CommonTypes.ts#L1390-L1422" }, { "kind": "enum", @@ -169726,7 +169998,7 @@ "name": "HuggingFaceServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1300-L1302" + "specLocation": "inference/_types/CommonTypes.ts#L1443-L1445" }, { "kind": "interface", @@ -169760,7 +170032,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1281-L1291" + "specLocation": "inference/_types/CommonTypes.ts#L1424-L1434" }, { "kind": "enum", @@ -169782,7 +170054,7 @@ "name": "HuggingFaceTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1293-L1298" + "specLocation": "inference/_types/CommonTypes.ts#L1436-L1441" }, { "kind": "interface", @@ -169874,7 +170146,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L288-L344" + "specLocation": "inference/_types/Services.ts#L300-L356" }, { "kind": "interface", @@ -169933,7 +170205,7 @@ } } ], - "specLocation": 
"inference/_types/Services.ts#L45-L65" + "specLocation": "inference/_types/Services.ts#L46-L66" }, { "kind": "interface", @@ -169974,7 +170246,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L67-L79" + "specLocation": "inference/_types/Services.ts#L68-L80" }, { "kind": "interface", @@ -170014,7 +170286,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L80-L89" + "specLocation": "inference/_types/Services.ts#L81-L90" }, { "kind": "interface", @@ -170054,7 +170326,47 @@ } } ], - "specLocation": "inference/_types/Services.ts#L91-L100" + "specLocation": "inference/_types/Services.ts#L92-L101" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "InferenceEndpoint", + "namespace": "inference._types" + } + }, + "name": { + "name": "InferenceEndpointInfoAmazonSageMaker", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The inference Id", + "name": "inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The task type", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "TaskTypeAmazonSageMaker", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/Services.ts#L103-L112" }, { "kind": "interface", @@ -170094,7 +170406,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L102-L111" + "specLocation": "inference/_types/Services.ts#L114-L123" }, { "kind": "interface", @@ -170134,7 +170446,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L113-L122" + "specLocation": "inference/_types/Services.ts#L125-L134" }, { "kind": "interface", @@ -170174,7 +170486,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L124-L133" + "specLocation": "inference/_types/Services.ts#L136-L145" }, { "kind": "interface", @@ -170214,7 +170526,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L135-L144" + "specLocation": "inference/_types/Services.ts#L147-L156" }, { "kind": "interface", @@ -170254,7 +170566,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L146-L155" + "specLocation": "inference/_types/Services.ts#L158-L167" }, { "kind": "interface", @@ -170294,7 +170606,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L156-L165" + "specLocation": "inference/_types/Services.ts#L168-L177" }, { "kind": "interface", @@ -170334,7 +170646,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L178-L187" + "specLocation": "inference/_types/Services.ts#L190-L199" }, { "kind": "interface", @@ -170374,7 +170686,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L167-L176" + "specLocation": "inference/_types/Services.ts#L179-L188" }, { "kind": "interface", @@ -170414,7 +170726,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L189-L198" + "specLocation": "inference/_types/Services.ts#L201-L210" }, { "kind": "interface", @@ -170454,7 +170766,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L200-L209" + "specLocation": "inference/_types/Services.ts#L212-L221" }, { "kind": "interface", @@ -170494,7 +170806,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L211-L220" + "specLocation": "inference/_types/Services.ts#L223-L232" }, { "kind": "interface", @@ -170534,7 +170846,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L222-L231" + "specLocation": "inference/_types/Services.ts#L234-L243" }, { "kind": "interface", @@ -170574,7 +170886,7 @@ } } ], - "specLocation": 
"inference/_types/Services.ts#L233-L242" + "specLocation": "inference/_types/Services.ts#L245-L254" }, { "kind": "interface", @@ -170614,7 +170926,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L244-L253" + "specLocation": "inference/_types/Services.ts#L256-L265" }, { "kind": "interface", @@ -170654,7 +170966,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L255-L264" + "specLocation": "inference/_types/Services.ts#L267-L276" }, { "kind": "interface", @@ -170694,7 +171006,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L266-L275" + "specLocation": "inference/_types/Services.ts#L278-L287" }, { "kind": "interface", @@ -170734,7 +171046,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L277-L286" + "specLocation": "inference/_types/Services.ts#L289-L298" }, { "kind": "interface", @@ -170894,7 +171206,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1304-L1333" + "specLocation": "inference/_types/CommonTypes.ts#L1447-L1476" }, { "kind": "enum", @@ -170907,7 +171219,7 @@ "name": "JinaAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1363-L1365" + "specLocation": "inference/_types/CommonTypes.ts#L1506-L1508" }, { "kind": "enum", @@ -170926,7 +171238,7 @@ "name": "JinaAISimilarityType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1367-L1371" + "specLocation": "inference/_types/CommonTypes.ts#L1510-L1514" }, { "kind": "interface", @@ -170972,7 +171284,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1335-L1356" + "specLocation": "inference/_types/CommonTypes.ts#L1478-L1499" }, { "kind": "enum", @@ -170988,7 +171300,7 @@ "name": "JinaAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1358-L1361" + "specLocation": "inference/_types/CommonTypes.ts#L1501-L1504" }, { "kind": "enum", @@ -171010,7 +171322,7 @@ "name": "JinaAITextEmbeddingTask", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1373-L1378" + "specLocation": "inference/_types/CommonTypes.ts#L1516-L1521" }, { "kind": "interface", @@ -171020,7 +171332,7 @@ }, "properties": [ { - "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/openai/v1/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", "name": "url", "required": true, "type": { @@ -171070,7 +171382,7 @@ } }, { - "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.", + "description": "For a `text_embedding` task, the number of dimensions the resulting output embeddings must have.\nIt is supported only in `text-embedding-3` and later models. 
If it is not set by the user, it defaults to the dimensions returned by the model.\nIf the model returns embeddings with a different number of dimensions, an error is returned.",
          "name": "dimensions",
          "required": false,
          "type": {
@@ -171106,7 +171418,7 @@
        }
      }
    ],
-    "specLocation": "inference/_types/CommonTypes.ts#L1380-L1424"
+    "specLocation": "inference/_types/CommonTypes.ts#L1523-L1569"
  },
  {
    "kind": "enum",
@@ -171119,7 +171431,7 @@
      "name": "LlamaServiceType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1432-L1434"
+    "specLocation": "inference/_types/CommonTypes.ts#L1585-L1587"
  },
  {
    "kind": "enum",
@@ -171138,7 +171450,29 @@
      "name": "LlamaSimilarityType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1436-L1440"
+    "specLocation": "inference/_types/CommonTypes.ts#L1589-L1593"
+  },
+  {
+    "kind": "interface",
+    "name": {
+      "name": "LlamaTaskSettings",
+      "namespace": "inference._types"
+    },
+    "properties": [
+      {
+        "description": "For a `completion` or `text_embedding` task, specify the user issuing the request.\nThis information can be used for abuse detection.",
+        "name": "user",
+        "required": false,
+        "type": {
+          "kind": "instance_of",
+          "type": {
+            "name": "string",
+            "namespace": "_builtins"
+          }
+        }
+      }
+    ],
+    "specLocation": "inference/_types/CommonTypes.ts#L1571-L1577"
  },
  {
    "kind": "enum",
@@ -171157,7 +171491,7 @@
      "name": "LlamaTaskType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1426-L1430"
+    "specLocation": "inference/_types/CommonTypes.ts#L1579-L1583"
  },
  {
    "kind": "interface",
@@ -171315,7 +171649,7 @@
      }
    }
  ],
-    "specLocation": "inference/_types/CommonTypes.ts#L1442-L1469"
+    "specLocation": "inference/_types/CommonTypes.ts#L1595-L1622"
  },
  {
    "kind": "enum",
@@ -171328,7 +171662,7 @@
      "name": "MistralServiceType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1477-L1479"
+    "specLocation": "inference/_types/CommonTypes.ts#L1630-L1632"
  },
  {
    "kind": "enum",
@@ -171347,7 +171681,7 @@
      "name": "MistralTaskType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1471-L1475"
+    "specLocation": "inference/_types/CommonTypes.ts#L1624-L1628"
  },
  {
    "kind": "interface",
@@ -171434,7 +171768,7 @@
      }
    }
  ],
-    "specLocation": "inference/_types/CommonTypes.ts#L1481-L1523"
+    "specLocation": "inference/_types/CommonTypes.ts#L1634-L1676"
  },
  {
    "kind": "enum",
@@ -171447,7 +171781,7 @@
      "name": "OpenAIServiceType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1539-L1541"
+    "specLocation": "inference/_types/CommonTypes.ts#L1692-L1694"
  },
  {
    "kind": "interface",
@@ -171469,7 +171803,7 @@
      }
    }
  ],
-    "specLocation": "inference/_types/CommonTypes.ts#L1525-L1531"
+    "specLocation": "inference/_types/CommonTypes.ts#L1678-L1684"
  },
  {
    "kind": "enum",
@@ -171488,7 +171822,7 @@
      "name": "OpenAITaskType",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/CommonTypes.ts#L1533-L1537"
+    "specLocation": "inference/_types/CommonTypes.ts#L1686-L1690"
  },
  {
    "kind": "interface",
@@ -171555,7 +171889,7 @@
      }
    }
  ],
-    "specLocation": "inference/_types/Services.ts#L350-L377"
+    "specLocation": "inference/_types/Services.ts#L362-L389"
  },
  {
    "kind": "interface",
@@ -171703,7 +172037,7 @@
      "name": "ServiceSettings",
      "namespace": "inference._types"
    },
-    "specLocation": "inference/_types/Services.ts#L346-L346",
+    "specLocation": "inference/_types/Services.ts#L358-L358",
    "type": {
      "kind": 
"user_defined_value" } @@ -171787,7 +172121,7 @@ "name": "TaskSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L348-L348", + "specLocation": "inference/_types/Services.ts#L360-L360", "type": { "kind": "user_defined_value" } @@ -171855,6 +172189,31 @@ }, "specLocation": "inference/_types/TaskType.ts#L43-L46" }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "completion" + }, + { + "name": "chat_completion" + }, + { + "name": "sparse_embedding" + }, + { + "name": "rerank" + } + ], + "name": { + "name": "TaskTypeAmazonSageMaker", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/TaskType.ts#L48-L54" + }, { "kind": "enum", "members": [ @@ -171866,7 +172225,7 @@ "name": "TaskTypeAnthropic", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L48-L50" + "specLocation": "inference/_types/TaskType.ts#L56-L58" }, { "kind": "enum", @@ -171885,7 +172244,7 @@ "name": "TaskTypeAzureAIStudio", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L52-L56" + "specLocation": "inference/_types/TaskType.ts#L60-L64" }, { "kind": "enum", @@ -171901,7 +172260,7 @@ "name": "TaskTypeAzureOpenAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L58-L61" + "specLocation": "inference/_types/TaskType.ts#L66-L69" }, { "kind": "enum", @@ -171920,7 +172279,7 @@ "name": "TaskTypeCohere", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L63-L67" + "specLocation": "inference/_types/TaskType.ts#L71-L75" }, { "kind": "enum", @@ -171942,7 +172301,7 @@ "name": "TaskTypeCustom", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L69-L74" + "specLocation": "inference/_types/TaskType.ts#L77-L82" }, { "kind": "enum", @@ -171958,7 +172317,7 @@ "name": "TaskTypeDeepSeek", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L76-L79" + "specLocation": "inference/_types/TaskType.ts#L84-L87" }, { "kind": "enum", @@ -171971,7 +172330,7 @@ "name": "TaskTypeELSER", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L87-L89" + "specLocation": "inference/_types/TaskType.ts#L95-L97" }, { "kind": "enum", @@ -171990,7 +172349,7 @@ "name": "TaskTypeElasticsearch", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L81-L85" + "specLocation": "inference/_types/TaskType.ts#L89-L93" }, { "kind": "enum", @@ -172006,7 +172365,7 @@ "name": "TaskTypeGoogleAIStudio", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L91-L94" + "specLocation": "inference/_types/TaskType.ts#L99-L102" }, { "kind": "enum", @@ -172022,7 +172381,7 @@ "name": "TaskTypeGoogleVertexAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L96-L99" + "specLocation": "inference/_types/TaskType.ts#L104-L107" }, { "kind": "enum", @@ -172044,7 +172403,7 @@ "name": "TaskTypeHuggingFace", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L101-L106" + "specLocation": "inference/_types/TaskType.ts#L109-L114" }, { "kind": "enum", @@ -172079,7 +172438,7 @@ "name": "TaskTypeLlama", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L108-L112" + "specLocation": "inference/_types/TaskType.ts#L116-L120" }, { "kind": "enum", @@ -172098,7 +172457,7 @@ "name": "TaskTypeMistral", "namespace": "inference._types" 
}, - "specLocation": "inference/_types/TaskType.ts#L114-L118" + "specLocation": "inference/_types/TaskType.ts#L122-L126" }, { "kind": "enum", @@ -172117,7 +172476,7 @@ "name": "TaskTypeOpenAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L120-L124" + "specLocation": "inference/_types/TaskType.ts#L128-L132" }, { "kind": "enum", @@ -172133,7 +172492,7 @@ "name": "TaskTypeVoyageAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L126-L129" + "specLocation": "inference/_types/TaskType.ts#L134-L137" }, { "kind": "enum", @@ -172152,7 +172511,7 @@ "name": "TaskTypeWatsonx", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L131-L135" + "specLocation": "inference/_types/TaskType.ts#L139-L143" }, { "kind": "interface", @@ -172398,7 +172757,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1543-L1574" + "specLocation": "inference/_types/CommonTypes.ts#L1696-L1727" }, { "kind": "enum", @@ -172411,7 +172770,7 @@ "name": "VoyageAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1607-L1609" + "specLocation": "inference/_types/CommonTypes.ts#L1760-L1762" }, { "kind": "interface", @@ -172471,7 +172830,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1576-L1600" + "specLocation": "inference/_types/CommonTypes.ts#L1729-L1753" }, { "kind": "enum", @@ -172487,7 +172846,7 @@ "name": "VoyageAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1602-L1605" + "specLocation": "inference/_types/CommonTypes.ts#L1755-L1758" }, { "kind": "interface", @@ -172575,7 +172934,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1611-L1649" + "specLocation": "inference/_types/CommonTypes.ts#L1764-L1802" }, { "kind": "enum", @@ -172588,7 +172947,7 @@ "name": "WatsonxServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1657-L1659" + "specLocation": "inference/_types/CommonTypes.ts#L1810-L1812" }, { "kind": "enum", @@ -172607,7 +172966,7 @@ "name": "WatsonxTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1651-L1655" + "specLocation": "inference/_types/CommonTypes.ts#L1804-L1808" }, { "kind": "request", @@ -173334,7 +173693,7 @@ } } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "examples": { "InferencePutExample1": { "alternatives": [ @@ -173419,7 +173778,7 @@ } } ], - "specLocation": "inference/put/PutRequest.ts#L26-L88" + "specLocation": "inference/put/PutRequest.ts#L26-L89" }, { "kind": "response", @@ -173902,6 +174261,139 @@ }, "specLocation": "inference/put_amazonbedrock/PutAmazonBedrockResponse.ts#L22-L25" }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. 
In this case, `amazon_sagemaker`.",
+          "name": "service",
+          "required": true,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "AmazonSageMakerServiceType",
+              "namespace": "inference._types"
+            }
+          }
+        },
+        {
+          "description": "Settings used to install the inference model.\nThese settings are specific to the `amazon_sagemaker` service and `service_settings.api` you specified.",
+          "name": "service_settings",
+          "required": true,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "AmazonSageMakerServiceSettings",
+              "namespace": "inference._types"
+            }
+          }
+        },
+        {
+          "description": "Settings to configure the inference task.\nThese settings are specific to the task type and `service_settings.api` you specified.",
+          "name": "task_settings",
+          "required": false,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "AmazonSageMakerTaskSettings",
+              "namespace": "inference._types"
+            }
+          }
+        }
+      ]
+    },
+    "description": "Create an Amazon SageMaker inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `amazon_sagemaker` service.",
+    "inherits": {
+      "type": {
+        "name": "RequestBase",
+        "namespace": "_types"
+      }
+    },
+    "name": {
+      "name": "Request",
+      "namespace": "inference.put_amazonsagemaker"
+    },
+    "path": [
+      {
+        "description": "The type of the inference task that the model will perform.",
+        "name": "task_type",
+        "required": true,
+        "type": {
+          "kind": "instance_of",
+          "type": {
+            "name": "TaskTypeAmazonSageMaker",
+            "namespace": "inference._types"
+          }
+        }
+      },
+      {
+        "description": "The unique identifier of the inference endpoint.",
+        "name": "amazonsagemaker_inference_id",
+        "required": true,
+        "type": {
+          "kind": "instance_of",
+          "type": {
+            "name": "Id",
+            "namespace": "_types"
+          }
+        }
+      }
+    ],
+    "query": [
+      {
+        "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+        "name": "timeout",
+        "required": false,
+        "serverDefault": "30s",
+        "type": {
+          "kind": "instance_of",
+          "type": {
+            "name": "Duration",
+            "namespace": "_types"
+          }
+        }
+      }
+    ],
+    "specLocation": "inference/put_amazonsagemaker/PutAmazonSageMakerRequest.ts#L31-L86"
+  },
+  {
+    "kind": "response",
+    "body": {
+      "kind": "value",
+      "codegenName": "endpoint_info",
+      "value": {
+        "kind": "instance_of",
+        "type": {
+          "name": "InferenceEndpointInfoAmazonSageMaker",
+          "namespace": "inference._types"
+        }
+      }
+    },
+    "name": {
+      "name": "Response",
+      "namespace": "inference.put_amazonsagemaker"
+    },
+    "specLocation": "inference/put_amazonsagemaker/PutAmazonSageMakerResponse.ts#L22-L25"
+  },
     {
       "kind": "request",
       "attachedBehaviors": [
@@ -176266,6 +176758,18 @@
               "namespace": "inference._types"
             }
           }
+        },
+        {
+          "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.",
+          "name": "task_settings",
+          "required": false,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "LlamaTaskSettings",
+              "namespace": "inference._types"
+            }
+          }
+        }
       ]
     },
@@ -176274,7 +176778,7 @@
         "PutLlamaRequestExample1": {
           "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.",
           "method_request": "PUT _inference/text_embedding/llama-text-embedding",
-          "value": "{\n  \"service\": \"llama\",\n  \"service_settings\": {\n      \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n      \"api_key\": \"llama-api-key\",\n      \"model_id\": \"all-MiniLM-L6-v2\" \n    }\n}"
+          "value": "{\n  \"service\": \"llama\",\n  \"service_settings\": {\n      \"url\": \"http://localhost:8321/v1/openai/v1/embeddings\",\n      \"dimensions\": 384,\n      \"api_key\": \"llama-api-key\",\n      \"model_id\": \"all-MiniLM-L6-v2\" \n    }\n}"
         },
         "PutLlamaRequestExample2": {
           "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
@@ -176338,7 +176842,7 @@
           }
         }
       ],
-      "specLocation": "inference/put_llama/PutLlamaRequest.ts#L30-L79"
+      "specLocation": "inference/put_llama/PutLlamaRequest.ts#L31-L85"
     },
     {
       "kind": "response",
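To make the new `amazon_sagemaker` request shape concrete, here is a minimal TypeScript sketch of a call this schema describes. It is not part of the patch: the Elasticsearch host, the inference ID, and the empty `service_settings` payload are illustrative assumptions, since the concrete `AmazonSageMakerServiceSettings` fields are defined elsewhere in the spec.

```ts
// Hypothetical usage sketch (not part of this patch): create an Amazon SageMaker
// inference endpoint using the request shape defined above.
const taskType = 'text_embedding' // any member of TaskTypeAmazonSageMaker
const inferenceId = 'my-sagemaker-endpoint' // the amazonsagemaker_inference_id path parameter

async function putAmazonSageMakerEndpoint(): Promise<number> {
  const response = await fetch(
    // timeout is an optional query parameter; the server default is 30s
    `http://localhost:9200/_inference/${taskType}/${inferenceId}?timeout=30s`,
    {
      method: 'PUT',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({
        service: 'amazon_sagemaker', // AmazonSageMakerServiceType
        service_settings: {
          /* fields of AmazonSageMakerServiceSettings, defined elsewhere in the spec */
        }
        // task_settings and chunking_settings are optional per the schema above
      })
    }
  )
  return response.status // 200 on success
}
```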
\"url\": \"http://localhost:8321/v1/openai/v1/embeddings\"\n \"dimensions\": 384,\n \"api_key\": \"llama-api-key\",\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" }, "PutLlamaRequestExample2": { "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.", @@ -176338,7 +176842,7 @@ } } ], - "specLocation": "inference/put_llama/PutLlamaRequest.ts#L30-L79" + "specLocation": "inference/put_llama/PutLlamaRequest.ts#L31-L85" }, { "kind": "response", diff --git a/output/typescript/types.ts b/output/typescript/types.ts index 1f8f756996..cd992bcfa3 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -14188,6 +14188,10 @@ export type InferenceLlamaServiceType = 'llama' export type InferenceLlamaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm' +export interface InferenceLlamaTaskSettings { + user?: string +} + export type InferenceLlamaTaskType = 'text_embedding' | 'completion' | 'chat_completion' export interface InferenceMessage { @@ -14638,6 +14642,7 @@ export interface InferencePutLlamaRequest extends RequestBase { chunking_settings?: InferenceInferenceChunkingSettings service: InferenceLlamaServiceType service_settings: InferenceLlamaServiceSettings + task_settings?: InferenceLlamaTaskSettings } } diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts index 3ad06c0ca8..403da866e4 100644 --- a/specification/inference/_types/CommonTypes.ts +++ b/specification/inference/_types/CommonTypes.ts @@ -1524,7 +1524,7 @@ export class LlamaServiceSettings { /** * The URL endpoint of the Llama stack endpoint. * URL must contain: - * * For `text_embedding` task - `/v1/inference/embeddings`. + * * For `text_embedding` task - `/v1/openai/v1/embeddings`. * * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`. */ url: string @@ -1552,7 +1552,9 @@ export class LlamaServiceSettings { */ max_input_tokens?: integer /** - * For a `text_embedding` task, the number of dimensions the resulting output embeddings should have. + * For a `text_embedding` task, the number of dimensions the resulting output embeddings must have. + * It is supported only in `text-embedding-3` and later models. If it is not set by user, it defaults to the model returned dimensions. + * If model returns embeddings with a different number of dimensions, error is returned. */ dimensions?: integer /** @@ -1566,6 +1568,14 @@ export class LlamaServiceSettings { rate_limit?: RateLimitSetting } +export class LlamaTaskSettings { + /** + * For a `completion` or `text_embedding` task, specify the user issuing the request. + * This information can be used for abuse detection. + */ + user?: string +} + export enum LlamaTaskType { text_embedding, completion, diff --git a/specification/inference/put_llama/PutLlamaRequest.ts b/specification/inference/put_llama/PutLlamaRequest.ts index 966f83cc19..2a41fb67b3 100644 --- a/specification/inference/put_llama/PutLlamaRequest.ts +++ b/specification/inference/put_llama/PutLlamaRequest.ts @@ -23,6 +23,7 @@ import { Duration } from '@_types/Time' import { LlamaServiceSettings, LlamaServiceType, + LlamaTaskSettings, LlamaTaskType } from '@inference/_types/CommonTypes' import { InferenceChunkingSettings } from '@inference/_types/Services' @@ -75,5 +76,10 @@ export interface Request extends RequestBase { * Settings used to install the inference model. These settings are specific to the `llama` service. 
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
index d23940fce6..102a6a605d 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
@@ -6,7 +6,8 @@ value: |-
   {
     "service": "llama",
     "service_settings": {
-      "url": "http://localhost:8321/v1/inference/embeddings"
+      "url": "http://localhost:8321/v1/openai/v1/embeddings",
+      "dimensions": 384,
       "api_key": "llama-api-key",
       "model_id": "all-MiniLM-L6-v2"
     }

From c16b9d06cdc39e3c3897536545f0ca94fdd97188 Mon Sep 17 00:00:00 2001
From: Jan Kazlouski
Date: Mon, 4 Aug 2025 15:25:44 +0000
Subject: [PATCH 6/6] Update llama specification

---
 output/openapi/elasticsearch-openapi.json     | 190 +++++++-
 .../elasticsearch-serverless-openapi.json     | 190 +++++++-
 output/schema/schema.json                     | 461 ++++++++++++++++--
 output/typescript/types.ts                    |   7 -
 specification/inference/_types/CommonTypes.ts |  26 +-
 .../inference/put_llama/PutLlamaRequest.ts    |   6 -
 .../request/PutLlamaRequestExample1.yaml      |   3 +-
 .../request/PutLlamaRequestExample2.yaml      |   1 -
 .../request/PutLlamaRequestExample3.yaml      |   1 -
 9 files changed, 789 insertions(+), 96 deletions(-)

diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
index d414976e5e..9387277b74 100644
--- a/output/openapi/elasticsearch-openapi.json
+++ b/output/openapi/elasticsearch-openapi.json
@@ -20573,7 +20573,7 @@
         "inference"
       ],
       "summary": "Create an inference endpoint",
-      "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -20694,7 +20694,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -22423,6 +22423,107 @@ ] } }, + "/_inference/{task_type}/{llama_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create a Llama inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-llama", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.LlamaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "llama_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference._types.LlamaServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference._types.LlamaServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n \"dimensions\": 384,\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-completion` to 
create a Llama inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"model_id\": \"llama3.2:3b\" \n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"model_id\": \"llama3.2:3b\" \n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama" + } + } + } + } + }, + "x-state": "Generally available; Added in 9.2.0", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{mistral_inference_id}": { "put": { "tags": [ @@ -88877,7 +88978,7 @@ "type": "object", "properties": { "requests_per_minute": { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "type": "number" } } @@ -90392,6 +90493,89 @@ "rerank" ] }, + "inference._types.LlamaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion" + ] + }, + "inference._types.LlamaServiceType": { + "type": "string", + "enum": [ + "llama" + ] + }, + "inference._types.LlamaServiceSettings": { + "type": "object", + "properties": { + "url": { + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": 
"https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" + }, + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "type": "string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "$ref": "#/components/schemas/inference._types.LlamaSimilarityType" + }, + "rate_limit": { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + }, + "required": [ + "url", + "model_id" + ] + }, + "inference._types.LlamaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.InferenceEndpointInfoLlama": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeLlama" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeLlama": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion" + ] + }, "inference._types.MistralTaskType": { "type": "string", "enum": [ diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index fd422ee6b7..72548ef8c3 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -11368,7 +11368,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -11489,7 +11489,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -13218,6 +13218,107 @@ ] } }, + "/_inference/{task_type}/{llama_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create a Llama inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-llama", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.LlamaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "llama_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference._types.LlamaServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference._types.LlamaServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\"\n \"dimensions\": 384,\n \"model_id\": \"all-MiniLM-L6-v2\" \n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-completion` to 
create a Llama inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"model_id\": \"llama3.2:3b\" \n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\"\n \"model_id\": \"llama3.2:3b\" \n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama" + } + } + } + } + }, + "x-state": "Generally available", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{mistral_inference_id}": { "put": { "tags": [ @@ -56197,7 +56298,7 @@ "type": "object", "properties": { "requests_per_minute": { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "type": "number" } } @@ -57712,6 +57813,89 @@ "rerank" ] }, + "inference._types.LlamaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion" + ] + }, + "inference._types.LlamaServiceType": { + "type": "string", + "enum": [ + "llama" + ] + }, + "inference._types.LlamaServiceSettings": { + "type": "object", + "properties": { + "url": { + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": 
"https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" + }, + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "type": "string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "$ref": "#/components/schemas/inference._types.LlamaSimilarityType" + }, + "rate_limit": { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + }, + "required": [ + "url", + "model_id" + ] + }, + "inference._types.LlamaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.InferenceEndpointInfoLlama": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeLlama" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeLlama": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion" + ] + }, "inference._types.MistralTaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema.json b/output/schema/schema.json index 4af88fb511..d4ba4d467b 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9920,7 +9920,7 @@ "visibility": "public" } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "docId": "inference-api-put", "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put", "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/put-inference-api.html", @@ -10698,6 +10698,51 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "9.2.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.", + "docId": "inference-api-put-llama", + "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama", + "name": "inference.put_llama", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_llama" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_llama" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{llama_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -170225,7 +170270,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L301-L357" + "specLocation": "inference/_types/Services.ts#L313-L369" }, { "kind": "interface", @@ -170284,7 +170329,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L46-L66" + "specLocation": "inference/_types/Services.ts#L47-L67" }, { "kind": "interface", @@ -170325,7 +170370,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L68-L80" + "specLocation": "inference/_types/Services.ts#L69-L81" }, { "kind": "interface", @@ -170365,7 +170410,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L82-L91" + "specLocation": "inference/_types/Services.ts#L83-L92" }, { "kind": "interface", @@ -170405,7 +170450,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L93-L102" + "specLocation": "inference/_types/Services.ts#L94-L103" }, { "kind": "interface", @@ -170445,7 +170490,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L104-L113" + "specLocation": 
"inference/_types/Services.ts#L105-L114" }, { "kind": "interface", @@ -170485,7 +170530,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L115-L124" + "specLocation": "inference/_types/Services.ts#L116-L125" }, { "kind": "interface", @@ -170525,7 +170570,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L126-L135" + "specLocation": "inference/_types/Services.ts#L127-L136" }, { "kind": "interface", @@ -170565,7 +170610,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L137-L146" + "specLocation": "inference/_types/Services.ts#L138-L147" }, { "kind": "interface", @@ -170605,7 +170650,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L148-L157" + "specLocation": "inference/_types/Services.ts#L149-L158" }, { "kind": "interface", @@ -170645,7 +170690,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L159-L168" + "specLocation": "inference/_types/Services.ts#L160-L169" }, { "kind": "interface", @@ -170685,7 +170730,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L170-L179" + "specLocation": "inference/_types/Services.ts#L171-L180" }, { "kind": "interface", @@ -170725,7 +170770,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L180-L189" + "specLocation": "inference/_types/Services.ts#L181-L190" }, { "kind": "interface", @@ -170765,7 +170810,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L202-L211" + "specLocation": "inference/_types/Services.ts#L203-L212" }, { "kind": "interface", @@ -170805,7 +170850,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L191-L200" + "specLocation": "inference/_types/Services.ts#L192-L201" }, { "kind": "interface", @@ -170845,7 +170890,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L213-L222" + "specLocation": "inference/_types/Services.ts#L214-L223" }, { "kind": "interface", @@ -170885,7 +170930,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L224-L233" + "specLocation": "inference/_types/Services.ts#L225-L234" }, { "kind": "interface", @@ -170925,7 +170970,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L235-L244" + "specLocation": "inference/_types/Services.ts#L236-L245" }, { "kind": "interface", @@ -170965,7 +171010,47 @@ } } ], - "specLocation": "inference/_types/Services.ts#L246-L255" + "specLocation": "inference/_types/Services.ts#L247-L256" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "InferenceEndpoint", + "namespace": "inference._types" + } + }, + "name": { + "name": "InferenceEndpointInfoLlama", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The inference Id", + "name": "inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The task type", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "TaskTypeLlama", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/Services.ts#L258-L267" }, { "kind": "interface", @@ -171005,7 +171090,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L257-L266" + "specLocation": "inference/_types/Services.ts#L269-L278" }, { "kind": "interface", @@ -171045,7 +171130,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L268-L277" + "specLocation": "inference/_types/Services.ts#L280-L289" }, { "kind": "interface", @@ -171085,7 +171170,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L279-L288" + "specLocation": 
"inference/_types/Services.ts#L291-L300" }, { "kind": "interface", @@ -171125,7 +171210,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L290-L299" + "specLocation": "inference/_types/Services.ts#L302-L311" }, { "kind": "interface", @@ -171403,6 +171488,129 @@ }, "specLocation": "inference/_types/CommonTypes.ts#L1552-L1557" }, + { + "kind": "interface", + "name": { + "name": "LlamaServiceSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "name": "url", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "extDocId": "llama-api-models", + "extDocUrl": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "name": "max_input_tokens", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "For a `text_embedding` task, the similarity measure. 
One of cosine, dot_product, l2_norm.", + "name": "similarity", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + } + } + }, + { + "description": "This setting helps to minimize the number of rate limit errors returned from the Llama API.\nBy default, the `llama` service sets the number of requests allowed per minute to 3000.", + "name": "rate_limit", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "RateLimitSetting", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L1559-L1589" + }, + { + "kind": "enum", + "members": [ + { + "name": "llama" + } + ], + "name": { + "name": "LlamaServiceType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1597-L1599" + }, + { + "kind": "enum", + "members": [ + { + "name": "cosine" + }, + { + "name": "dot_product" + }, + { + "name": "l2_norm" + } + ], + "name": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1601-L1605" + }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "completion" + }, + { + "name": "chat_completion" + } + ], + "name": { + "name": "LlamaTaskType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1591-L1595" + }, { "kind": "interface", "description": "An object representing part of the conversation.", @@ -171559,7 +171767,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1559-L1586" + "specLocation": "inference/_types/CommonTypes.ts#L1607-L1634" }, { "kind": "enum", @@ -171572,7 +171780,7 @@ "name": "MistralServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1594-L1596" + "specLocation": "inference/_types/CommonTypes.ts#L1642-L1644" }, { "kind": "enum", @@ -171591,7 +171799,7 @@ "name": "MistralTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1588-L1592" + "specLocation": "inference/_types/CommonTypes.ts#L1636-L1640" }, { "kind": "interface", @@ -171678,7 +171886,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1598-L1640" + "specLocation": "inference/_types/CommonTypes.ts#L1646-L1688" }, { "kind": "enum", @@ -171691,7 +171899,7 @@ "name": "OpenAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1656-L1658" + "specLocation": "inference/_types/CommonTypes.ts#L1704-L1706" }, { "kind": "interface", @@ -171713,7 +171921,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1642-L1648" + "specLocation": "inference/_types/CommonTypes.ts#L1690-L1696" }, { "kind": "enum", @@ -171732,7 +171940,7 @@ "name": "OpenAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1650-L1654" + "specLocation": "inference/_types/CommonTypes.ts#L1698-L1702" }, { "kind": "interface", @@ -171787,7 +171995,7 @@ }, "properties": [ { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type 
`chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "name": "requests_per_minute", "required": false, "type": { @@ -171799,7 +172007,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L363-L389" + "specLocation": "inference/_types/Services.ts#L375-L402" }, { "kind": "interface", @@ -171947,7 +172155,7 @@ "name": "ServiceSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L359-L359", + "specLocation": "inference/_types/Services.ts#L371-L371", "type": { "kind": "user_defined_value" } @@ -172031,7 +172239,7 @@ "name": "TaskSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L361-L361", + "specLocation": "inference/_types/Services.ts#L373-L373", "type": { "kind": "user_defined_value" } @@ -172361,7 +172569,7 @@ } ], "name": { - "name": "TaskTypeMistral", + "name": "TaskTypeLlama", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L121-L125" @@ -172380,11 +172588,30 @@ } ], "name": { - "name": "TaskTypeOpenAI", + "name": "TaskTypeMistral", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L127-L131" }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "chat_completion" + }, + { + "name": "completion" + } + ], + "name": { + "name": "TaskTypeOpenAI", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/TaskType.ts#L133-L137" + }, { "kind": "enum", "members": [ @@ -172399,7 +172626,7 @@ "name": "TaskTypeVoyageAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L133-L136" + "specLocation": "inference/_types/TaskType.ts#L139-L142" }, { "kind": "enum", @@ -172418,7 +172645,7 @@ "name": "TaskTypeWatsonx", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L138-L142" + "specLocation": "inference/_types/TaskType.ts#L144-L148" }, { "kind": "interface", @@ -172664,7 +172891,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1660-L1691" + "specLocation": "inference/_types/CommonTypes.ts#L1708-L1739" }, { "kind": "enum", @@ -172677,7 +172904,7 @@ "name": "VoyageAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1724-L1726" + "specLocation": "inference/_types/CommonTypes.ts#L1772-L1774" }, { "kind": "interface", @@ -172737,7 +172964,7 @@ } } ], 
- "specLocation": "inference/_types/CommonTypes.ts#L1693-L1717" + "specLocation": "inference/_types/CommonTypes.ts#L1741-L1765" }, { "kind": "enum", @@ -172753,7 +172980,7 @@ "name": "VoyageAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1719-L1722" + "specLocation": "inference/_types/CommonTypes.ts#L1767-L1770" }, { "kind": "interface", @@ -172841,7 +173068,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1728-L1766" + "specLocation": "inference/_types/CommonTypes.ts#L1776-L1814" }, { "kind": "enum", @@ -172854,7 +173081,7 @@ "name": "WatsonxServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1774-L1776" + "specLocation": "inference/_types/CommonTypes.ts#L1822-L1824" }, { "kind": "enum", @@ -172873,7 +173100,7 @@ "name": "WatsonxTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1768-L1772" + "specLocation": "inference/_types/CommonTypes.ts#L1816-L1820" }, { "kind": "request", @@ -173600,7 +173827,7 @@ } } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning 
trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)",
      "examples": {
        "InferencePutExample1": {
          "alternatives": [
@@ -173685,7 +173912,7 @@
            }
          }
        ],
-      "specLocation": "inference/put/PutRequest.ts#L26-L89"
+      "specLocation": "inference/put/PutRequest.ts#L26-L90"
     },
     {
       "kind": "response",
@@ -176739,6 +176966,144 @@
       },
       "specLocation": "inference/put_jinaai/PutJinaAiResponse.ts#L22-L25"
     },
+    {
+      "kind": "request",
+      "attachedBehaviors": [
+        "CommonQueryParameters"
+      ],
+      "body": {
+        "kind": "properties",
+        "properties": [
+          {
+            "description": "The chunking configuration object.",
+            "extDocId": "inference-chunking",
+            "extDocUrl": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config",
+            "name": "chunking_settings",
+            "required": false,
+            "type": {
+              "kind": "instance_of",
+              "type": {
+                "name": "InferenceChunkingSettings",
+                "namespace": "inference._types"
+              }
+            }
+          },
+          {
+            "description": "The type of service supported for the specified task type. In this case, `llama`.",
+            "name": "service",
+            "required": true,
+            "type": {
+              "kind": "instance_of",
+              "type": {
+                "name": "LlamaServiceType",
+                "namespace": "inference._types"
+              }
+            }
+          },
+          {
+            "description": "Settings used to install the inference model. 
These settings are specific to the `llama` service.",
+            "name": "service_settings",
+            "required": true,
+            "type": {
+              "kind": "instance_of",
+              "type": {
+                "name": "LlamaServiceSettings",
+                "namespace": "inference._types"
+              }
+            }
+          }
+        ]
+      },
+      "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.",
+      "examples": {
+        "PutLlamaRequestExample1": {
+          "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.",
+          "method_request": "PUT _inference/text_embedding/llama-text-embedding",
+          "value": "{\n  \"service\": \"llama\",\n  \"service_settings\": {\n    \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n    \"model_id\": \"all-MiniLM-L6-v2\"\n  }\n}"
+        },
+        "PutLlamaRequestExample2": {
+          "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.",
+          "method_request": "PUT _inference/completion/llama-completion",
+          "value": "{\n  \"service\": \"llama\",\n  \"service_settings\": {\n    \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n    \"model_id\": \"llama3.2:3b\"\n  }\n}"
+        },
+        "PutLlamaRequestExample3": {
+          "description": "Run `PUT _inference/chat_completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.",
+          "method_request": "PUT _inference/chat_completion/llama-chat-completion",
+          "value": "{\n  \"service\": \"llama\",\n  \"service_settings\": {\n    \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n    \"model_id\": \"llama3.2:3b\"\n  }\n}"
+        }
+      },
+      "inherits": {
+        "type": {
+          "name": "RequestBase",
+          "namespace": "_types"
+        }
+      },
+      "name": {
+        "name": "Request",
+        "namespace": "inference.put_llama"
+      },
+      "path": [
+        {
+          "description": "The type of the inference task that the model will perform.",
+          "name": "task_type",
+          "required": true,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "LlamaTaskType",
+              "namespace": "inference._types"
+            }
+          }
+        },
+        {
+          "description": "The unique identifier of the inference endpoint.",
+          "name": "llama_inference_id",
+          "required": true,
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "Id",
+              "namespace": "_types"
+            }
+          }
+        }
+      ],
+      "query": [
+        {
+          "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+          "name": "timeout",
+          "required": false,
+          "serverDefault": "30s",
+          "type": {
+            "kind": "instance_of",
+            "type": {
+              "name": "Duration",
+              "namespace": "_types"
+            }
+          }
+        }
+      ],
+      "specLocation": "inference/put_llama/PutLlamaRequest.ts#L30-L79"
+    },
+    {
+      "kind": "response",
+      "body": {
+        "kind": "value",
+        "codegenName": "endpoint_info",
+        "value": {
+          "kind": "instance_of",
+          "type": {
+            "name": "InferenceEndpointInfoLlama",
+            "namespace": "inference._types"
+          }
+        }
+      },
+      "name": {
+        "name": "Response",
+        "namespace": "inference.put_llama"
+      },
+      "specLocation": "inference/put_llama/PutLlamaResponse.ts#L22-L25"
+    },
     {
       "kind": "request",
       "attachedBehaviors": [
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
index c7db5f24b4..9d45d3b76e 100644
--- a/output/typescript/types.ts
+++ b/output/typescript/types.ts
@@ -14192,9 +14192,7 @@ export type InferenceJinaAITextEmbeddingTask = 'classification' | 'clustering' |
 export interface InferenceLlamaServiceSettings {
   url: string
   model_id: string
-  api_key?: string 
max_input_tokens?: integer - dimensions?: integer similarity?: InferenceLlamaSimilarityType rate_limit?: InferenceRateLimitSetting } @@ -14203,10 +14201,6 @@ export type InferenceLlamaServiceType = 'llama' export type InferenceLlamaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm' -export interface InferenceLlamaTaskSettings { - user?: string -} - export type InferenceLlamaTaskType = 'text_embedding' | 'completion' | 'chat_completion' export interface InferenceMessage { @@ -14671,7 +14665,6 @@ export interface InferencePutLlamaRequest extends RequestBase { chunking_settings?: InferenceInferenceChunkingSettings service: InferenceLlamaServiceType service_settings: InferenceLlamaServiceSettings - task_settings?: InferenceLlamaTaskSettings } } diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts index e171ea1ee0..e960e06353 100644 --- a/specification/inference/_types/CommonTypes.ts +++ b/specification/inference/_types/CommonTypes.ts @@ -1560,7 +1560,7 @@ export class LlamaServiceSettings { /** * The URL endpoint of the Llama stack endpoint. * URL must contain: - * * For `text_embedding` task - `/v1/openai/v1/embeddings`. + * * For `text_embedding` task - `/v1/inference/embeddings`. * * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`. */ url: string @@ -1573,26 +1573,10 @@ export class LlamaServiceSettings { * @ext_doc_id llama-api-models */ model_id: string - /** - * A valid API key for accessing Llama stack endpoint that is going to be sent as part of Bearer authentication header. - * This field is optional because Llama stack doesn't provide authentication by default. - * - * IMPORTANT: You need to provide the API key only once, during the inference model creation. - * The get inference endpoint API does not retrieve your API key. - * After creating the inference model, you cannot change the associated API key. - * If you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key. - */ - api_key?: string /** * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs. */ max_input_tokens?: integer - /** - * For a `text_embedding` task, the number of dimensions the resulting output embeddings must have. - * It is supported only in `text-embedding-3` and later models. If it is not set by user, it defaults to the model returned dimensions. - * If model returns embeddings with a different number of dimensions, error is returned. - */ - dimensions?: integer /** * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm. */ @@ -1604,14 +1588,6 @@ export class LlamaServiceSettings { rate_limit?: RateLimitSetting } -export class LlamaTaskSettings { - /** - * For a `completion` or `text_embedding` task, specify the user issuing the request. - * This information can be used for abuse detection. 
- */
-  user?: string
-}
-
 export enum LlamaTaskType {
   text_embedding,
   completion,
diff --git a/specification/inference/put_llama/PutLlamaRequest.ts b/specification/inference/put_llama/PutLlamaRequest.ts
index 2a41fb67b3..966f83cc19 100644
--- a/specification/inference/put_llama/PutLlamaRequest.ts
+++ b/specification/inference/put_llama/PutLlamaRequest.ts
@@ -23,7 +23,6 @@ import { Duration } from '@_types/Time'
 import {
   LlamaServiceSettings,
   LlamaServiceType,
-  LlamaTaskSettings,
   LlamaTaskType
 } from '@inference/_types/CommonTypes'
 import { InferenceChunkingSettings } from '@inference/_types/Services'
@@ -76,10 +75,5 @@ export interface Request extends RequestBase {
    * Settings used to install the inference model. These settings are specific to the `llama` service.
    */
   service_settings: LlamaServiceSettings
-  /**
-   * Settings to configure the inference task.
-   * These settings are specific to the task type you specified.
-   */
-  task_settings?: LlamaTaskSettings
 }
}
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
index 102a6a605d..709663248e 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
@@ -6,9 +6,7 @@ value: |-
   {
     "service": "llama",
     "service_settings": {
-      "url": "http://localhost:8321/v1/openai/v1/embeddings"
-      "dimensions": 384,
-      "api_key": "llama-api-key",
+      "url": "http://localhost:8321/v1/inference/embeddings",
       "model_id": "all-MiniLM-L6-v2" 
     }
   }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
index 1a8417eaa2..6ec845bf59 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
@@ -7,7 +7,6 @@ value: |-
   "service": "llama",
   "service_settings": {
-    "url": "http://localhost:8321/v1/openai/v1/chat/completions"
-    "api_key": "llama-api-key",
+    "url": "http://localhost:8321/v1/openai/v1/chat/completions",
     "model_id": "llama3.2:3b"
   }
 }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
index b7c510305a..40124abab7 100644
--- a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
+++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
@@ -7,7 +7,6 @@ value: |-
   "service": "llama",
   "service_settings": {
-    "url": "http://localhost:8321/v1/openai/v1/chat/completions"
-    "api_key": "llama-api-key",
+    "url": "http://localhost:8321/v1/openai/v1/chat/completions",
     "model_id": "llama3.2:3b"
   }
 }
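
For reviewers who want to sanity-check the new surface end to end, here is a minimal TypeScript sketch of what a client built against this spec would send. It is an illustration only: `ES_URL`, `ES_API_KEY`, the endpoint ids, and the `putLlamaEndpoint` helper are assumptions of the sketch, not part of this patch; the field shapes mirror `LlamaServiceSettings` after the removals above (`api_key`, `dimensions`, and `task_settings` are gone).

```ts
// Sketch only: exercises PUT _inference/{task_type}/{inference_id} for the
// `llama` service as specified in this patch. ES_URL, ES_API_KEY, and the
// endpoint ids are hypothetical placeholders.

type LlamaTaskType = 'text_embedding' | 'completion' | 'chat_completion'
type LlamaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm'

// Mirrors LlamaServiceSettings after this patch; note there is no api_key,
// dimensions, or task_settings field any more.
interface LlamaServiceSettings {
  url: string
  model_id: string
  max_input_tokens?: number
  similarity?: LlamaSimilarityType
  rate_limit?: { requests_per_minute?: number }
}

const ES_URL = 'http://localhost:9200' // assumed cluster address
const HEADERS = {
  'Content-Type': 'application/json',
  Authorization: `ApiKey ${process.env.ES_API_KEY ?? ''}`, // assumed auth scheme
}

async function putLlamaEndpoint(
  taskType: LlamaTaskType,
  inferenceId: string,
  serviceSettings: LlamaServiceSettings,
  timeout = '30s' // server default per the spec's `timeout` query parameter
): Promise<unknown> {
  const res = await fetch(
    `${ES_URL}/_inference/${taskType}/${encodeURIComponent(inferenceId)}?timeout=${timeout}`,
    {
      method: 'PUT',
      headers: HEADERS,
      // Body shape per PutLlamaRequest: `service` plus `service_settings` only.
      body: JSON.stringify({ service: 'llama', service_settings: serviceSettings }),
    }
  )
  if (!res.ok) throw new Error(`PUT failed: ${res.status} ${await res.text()}`)
  return res.json()
}

// The three request examples, using the per-task URL paths the spec requires:
// /v1/inference/embeddings for text_embedding, and
// /v1/openai/v1/chat/completions for completion and chat_completion.
await putLlamaEndpoint('text_embedding', 'llama-text-embedding', {
  url: 'http://localhost:8321/v1/inference/embeddings',
  model_id: 'all-MiniLM-L6-v2',
})
await putLlamaEndpoint('completion', 'llama-completion', {
  url: 'http://localhost:8321/v1/openai/v1/chat/completions',
  model_id: 'llama3.2:3b',
})
await putLlamaEndpoint('chat_completion', 'llama-chat-completion', {
  url: 'http://localhost:8321/v1/openai/v1/chat/completions',
  model_id: 'llama3.2:3b',
  rate_limit: { requests_per_minute: 3000 }, // explicit llama default per RateLimitSetting
})
```

Top-level `await` assumes an ES module running on a modern Node (18+), where global `fetch` is available.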