diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index d414976e5e..9387277b74 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -20573,7 +20573,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -20694,7 +20694,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -22423,6 +22423,107 @@ ] } }, + "/_inference/{task_type}/{llama_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create a Llama inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-llama", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.LlamaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "llama_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference._types.LlamaServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference._types.LlamaServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"dimensions\": 384,\n \"model_id\": \"all-MiniLM-L6-v2\"\n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama" + } + } + } + } + }, + "x-state": "Generally available; Added in 9.2.0", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{mistral_inference_id}": { "put": { "tags": [ @@ -88877,7 +88978,7 @@ "type": "object", "properties": { "requests_per_minute": { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "type": "number" } } @@ -90392,6 +90493,89 @@ "rerank" ] }, + "inference._types.LlamaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion" + ] + }, + "inference._types.LlamaServiceType": { + "type": "string", + "enum": [ + "llama" + ] + }, + "inference._types.LlamaServiceSettings": { + "type": "object", + "properties": { + "url": { + "description": "The URL of the Llama stack endpoint.\nThe URL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": 
"https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" + }, + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "type": "string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "$ref": "#/components/schemas/inference._types.LlamaSimilarityType" + }, + "rate_limit": { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + }, + "required": [ + "url", + "model_id" + ] + }, + "inference._types.LlamaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.InferenceEndpointInfoLlama": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeLlama" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeLlama": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion" + ] + }, "inference._types.MistralTaskType": { "type": "string", "enum": [ diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index fd422ee6b7..72548ef8c3 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -11368,7 +11368,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -11489,7 +11489,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -13218,6 +13218,107 @@ ] } }, + "/_inference/{task_type}/{llama_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create a Llama inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `llama` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-llama", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.LlamaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "llama_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference._types.LlamaServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference._types.LlamaServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"dimensions\": 384,\n \"model_id\": \"all-MiniLM-L6-v2\"\n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoLlama" + } + } + } + } + }, + "x-state": "Generally available", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{mistral_inference_id}": { "put": { "tags": [ @@ -56197,7 +56298,7 @@ "type": "object", "properties": { "requests_per_minute": { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "type": "number" } } @@ -57712,6 +57813,89 @@ "rerank" ] }, + "inference._types.LlamaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion" + ] + }, + "inference._types.LlamaServiceType": { + "type": "string", + "enum": [ + "llama" + ] + }, + "inference._types.LlamaServiceSettings": { + "type": "object", + "properties": { + "url": { + "description": "The URL of the Llama stack endpoint.\nThe URL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": 
"https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/" + }, + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "type": "string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "$ref": "#/components/schemas/inference._types.LlamaSimilarityType" + }, + "rate_limit": { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + }, + "required": [ + "url", + "model_id" + ] + }, + "inference._types.LlamaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.InferenceEndpointInfoLlama": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference Id", + "type": "string" + }, + "task_type": { + "$ref": "#/components/schemas/inference._types.TaskTypeLlama" + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeLlama": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion" + ] + }, "inference._types.MistralTaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema.json b/output/schema/schema.json index 4af88fb511..d4ba4d467b 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9920,7 +9920,7 @@ "visibility": "public" } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "docId": "inference-api-put", "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put", "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/put-inference-api.html", @@ -10698,6 +10698,51 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "9.2.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.", + "docId": "inference-api-put-llama", + "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama", + "name": "inference.put_llama", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_llama" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_llama" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{llama_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -170225,7 +170270,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L301-L357" + "specLocation": "inference/_types/Services.ts#L313-L369" }, { "kind": "interface", @@ -170284,7 +170329,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L46-L66" + "specLocation": "inference/_types/Services.ts#L47-L67" }, { "kind": "interface", @@ -170325,7 +170370,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L68-L80" + "specLocation": "inference/_types/Services.ts#L69-L81" }, { "kind": "interface", @@ -170365,7 +170410,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L82-L91" + "specLocation": "inference/_types/Services.ts#L83-L92" }, { "kind": "interface", @@ -170405,7 +170450,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L93-L102" + "specLocation": "inference/_types/Services.ts#L94-L103" }, { "kind": "interface", @@ -170445,7 +170490,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L104-L113" + "specLocation": 
"inference/_types/Services.ts#L105-L114" }, { "kind": "interface", @@ -170485,7 +170530,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L115-L124" + "specLocation": "inference/_types/Services.ts#L116-L125" }, { "kind": "interface", @@ -170525,7 +170570,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L126-L135" + "specLocation": "inference/_types/Services.ts#L127-L136" }, { "kind": "interface", @@ -170565,7 +170610,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L137-L146" + "specLocation": "inference/_types/Services.ts#L138-L147" }, { "kind": "interface", @@ -170605,7 +170650,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L148-L157" + "specLocation": "inference/_types/Services.ts#L149-L158" }, { "kind": "interface", @@ -170645,7 +170690,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L159-L168" + "specLocation": "inference/_types/Services.ts#L160-L169" }, { "kind": "interface", @@ -170685,7 +170730,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L170-L179" + "specLocation": "inference/_types/Services.ts#L171-L180" }, { "kind": "interface", @@ -170725,7 +170770,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L180-L189" + "specLocation": "inference/_types/Services.ts#L181-L190" }, { "kind": "interface", @@ -170765,7 +170810,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L202-L211" + "specLocation": "inference/_types/Services.ts#L203-L212" }, { "kind": "interface", @@ -170805,7 +170850,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L191-L200" + "specLocation": "inference/_types/Services.ts#L192-L201" }, { "kind": "interface", @@ -170845,7 +170890,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L213-L222" + "specLocation": "inference/_types/Services.ts#L214-L223" }, { "kind": "interface", @@ -170885,7 +170930,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L224-L233" + "specLocation": "inference/_types/Services.ts#L225-L234" }, { "kind": "interface", @@ -170925,7 +170970,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L235-L244" + "specLocation": "inference/_types/Services.ts#L236-L245" }, { "kind": "interface", @@ -170965,7 +171010,47 @@ } } ], - "specLocation": "inference/_types/Services.ts#L246-L255" + "specLocation": "inference/_types/Services.ts#L247-L256" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "InferenceEndpoint", + "namespace": "inference._types" + } + }, + "name": { + "name": "InferenceEndpointInfoLlama", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The inference Id", + "name": "inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The task type", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "TaskTypeLlama", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/Services.ts#L258-L267" }, { "kind": "interface", @@ -171005,7 +171090,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L257-L266" + "specLocation": "inference/_types/Services.ts#L269-L278" }, { "kind": "interface", @@ -171045,7 +171130,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L268-L277" + "specLocation": "inference/_types/Services.ts#L280-L289" }, { "kind": "interface", @@ -171085,7 +171170,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L279-L288" + "specLocation": 
"inference/_types/Services.ts#L291-L300" }, { "kind": "interface", @@ -171125,7 +171210,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L290-L299" + "specLocation": "inference/_types/Services.ts#L302-L311" }, { "kind": "interface", @@ -171403,6 +171488,129 @@ }, "specLocation": "inference/_types/CommonTypes.ts#L1552-L1557" }, + { + "kind": "interface", + "name": { + "name": "LlamaServiceSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The URL endpoint of the Llama stack endpoint.\nURL must contain:\n* For `text_embedding` task - `/v1/inference/embeddings`.\n* For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.", + "name": "url", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nRefer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.\nService has been tested and confirmed to be working with the following models:\n* For `text_embedding` task - `all-MiniLM-L6-v2`.\n* For `completion` and `chat_completion` tasks - `llama3.2:3b`.", + "extDocId": "llama-api-models", + "extDocUrl": "https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html/", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "name": "max_input_tokens", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "For a `text_embedding` task, the similarity measure. 
One of cosine, dot_product, l2_norm.", + "name": "similarity", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + } + } + }, + { + "description": "This setting helps to minimize the number of rate limit errors returned from the Llama API.\nBy default, the `llama` service sets the number of requests allowed per minute to 3000.", + "name": "rate_limit", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "RateLimitSetting", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L1559-L1589" + }, + { + "kind": "enum", + "members": [ + { + "name": "llama" + } + ], + "name": { + "name": "LlamaServiceType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1597-L1599" + }, + { + "kind": "enum", + "members": [ + { + "name": "cosine" + }, + { + "name": "dot_product" + }, + { + "name": "l2_norm" + } + ], + "name": { + "name": "LlamaSimilarityType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1601-L1605" + }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "completion" + }, + { + "name": "chat_completion" + } + ], + "name": { + "name": "LlamaTaskType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1591-L1595" + }, { "kind": "interface", "description": "An object representing part of the conversation.", @@ -171559,7 +171767,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1559-L1586" + "specLocation": "inference/_types/CommonTypes.ts#L1607-L1634" }, { "kind": "enum", @@ -171572,7 +171780,7 @@ "name": "MistralServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1594-L1596" + "specLocation": "inference/_types/CommonTypes.ts#L1642-L1644" }, { "kind": "enum", @@ -171591,7 +171799,7 @@ "name": "MistralTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1588-L1592" + "specLocation": "inference/_types/CommonTypes.ts#L1636-L1640" }, { "kind": "interface", @@ -171678,7 +171886,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1598-L1640" + "specLocation": "inference/_types/CommonTypes.ts#L1646-L1688" }, { "kind": "enum", @@ -171691,7 +171899,7 @@ "name": "OpenAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1656-L1658" + "specLocation": "inference/_types/CommonTypes.ts#L1704-L1706" }, { "kind": "interface", @@ -171713,7 +171921,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1642-L1648" + "specLocation": "inference/_types/CommonTypes.ts#L1690-L1696" }, { "kind": "enum", @@ -171732,7 +171940,7 @@ "name": "OpenAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1650-L1654" + "specLocation": "inference/_types/CommonTypes.ts#L1698-L1702" }, { "kind": "interface", @@ -171787,7 +171995,7 @@ }, "properties": [ { - "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type 
`chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", + "description": "The number of requests allowed per minute.\nBy default, the number of requests allowed per minute is set by each service as follows:\n\n* `alibabacloud-ai-search` service: `1000`\n* `anthropic` service: `50`\n* `azureaistudio` service: `240`\n* `azureopenai` service and task type `text_embedding`: `1440`\n* `azureopenai` service and task type `completion`: `120`\n* `cohere` service: `10000`\n* `elastic` service and task type `chat_completion`: `240`\n* `googleaistudio` service: `360`\n* `googlevertexai` service: `30000`\n* `hugging_face` service: `3000`\n* `jinaai` service: `2000`\n* `llama` service: `3000`\n* `mistral` service: `240`\n* `openai` service and task type `text_embedding`: `3000`\n* `openai` service and task type `completion`: `500`\n* `voyageai` service: `2000`\n* `watsonxai` service: `120`", "name": "requests_per_minute", "required": false, "type": { @@ -171799,7 +172007,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L363-L389" + "specLocation": "inference/_types/Services.ts#L375-L402" }, { "kind": "interface", @@ -171947,7 +172155,7 @@ "name": "ServiceSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L359-L359", + "specLocation": "inference/_types/Services.ts#L371-L371", "type": { "kind": "user_defined_value" } @@ -172031,7 +172239,7 @@ "name": "TaskSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L361-L361", + "specLocation": "inference/_types/Services.ts#L373-L373", "type": { "kind": "user_defined_value" } @@ -172361,7 +172569,7 @@ } ], "name": { - "name": "TaskTypeMistral", + "name": "TaskTypeLlama", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L121-L125" @@ -172380,11 +172588,30 @@ } ], "name": { - "name": "TaskTypeOpenAI", + "name": "TaskTypeMistral", "namespace": "inference._types" }, "specLocation": "inference/_types/TaskType.ts#L127-L131" }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "chat_completion" + }, + { + "name": "completion" + } + ], + "name": { + "name": "TaskTypeOpenAI", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/TaskType.ts#L133-L137" + }, { "kind": "enum", "members": [ @@ -172399,7 +172626,7 @@ "name": "TaskTypeVoyageAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L133-L136" + "specLocation": "inference/_types/TaskType.ts#L139-L142" }, { "kind": "enum", @@ -172418,7 +172645,7 @@ "name": "TaskTypeWatsonx", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L138-L142" + "specLocation": "inference/_types/TaskType.ts#L144-L148" }, { "kind": "interface", @@ -172664,7 +172891,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1660-L1691" + "specLocation": "inference/_types/CommonTypes.ts#L1708-L1739" }, { "kind": "enum", @@ -172677,7 +172904,7 @@ "name": "VoyageAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1724-L1726" + "specLocation": "inference/_types/CommonTypes.ts#L1772-L1774" }, { "kind": "interface", @@ -172737,7 +172964,7 @@ } } ], 
- "specLocation": "inference/_types/CommonTypes.ts#L1693-L1717" + "specLocation": "inference/_types/CommonTypes.ts#L1741-L1765" }, { "kind": "enum", @@ -172753,7 +172980,7 @@ "name": "VoyageAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1719-L1722" + "specLocation": "inference/_types/CommonTypes.ts#L1767-L1770" }, { "kind": "interface", @@ -172841,7 +173068,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1728-L1766" + "specLocation": "inference/_types/CommonTypes.ts#L1776-L1814" }, { "kind": "enum", @@ -172854,7 +173081,7 @@ "name": "WatsonxServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1774-L1776" + "specLocation": "inference/_types/CommonTypes.ts#L1822-L1824" }, { "kind": "enum", @@ -172873,7 +173100,7 @@ "name": "WatsonxTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1768-L1772" + "specLocation": "inference/_types/CommonTypes.ts#L1816-L1820" }, { "kind": "request", @@ -173600,7 +173827,7 @@ } } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, 'rerank', `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning 
trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`completion`, `chat_completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`rerank`, `text_embedding`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* VoyageAI (`text_embedding`, `rerank`)\n* Watsonx inference integration (`text_embedding`)\n* JinaAI (`text_embedding`, `rerank`)", "examples": { "InferencePutExample1": { "alternatives": [ @@ -173685,7 +173912,7 @@ } } ], - "specLocation": "inference/put/PutRequest.ts#L26-L89" + "specLocation": "inference/put/PutRequest.ts#L26-L90" }, { "kind": "response", @@ -176739,6 +176966,144 @@ }, "specLocation": "inference/put_jinaai/PutJinaAiResponse.ts#L22-L25" }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `llama`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaServiceType", + "namespace": "inference._types" + } + } + }, + { + "description": "Settings used to install the inference model. 
These settings are specific to the `llama` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaServiceSettings", + "namespace": "inference._types" + } + } + } + ] + }, + "description": "Create a Llama inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `llama` service.", + "examples": { + "PutLlamaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.", + "method_request": "PUT _inference/text_embedding/llama-text-embedding", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/inference/embeddings\",\n \"dimensions\": 384,\n \"model_id\": \"all-MiniLM-L6-v2\"\n }\n}" + }, + "PutLlamaRequestExample2": { + "description": "Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.", + "method_request": "PUT _inference/completion/llama-completion", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + }, + "PutLlamaRequestExample3": { + "description": "Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.", + "method_request": "PUT _inference/chat-completion/llama-chat-completion", + "value": "{\n \"service\": \"llama\",\n \"service_settings\": {\n \"url\": \"http://localhost:8321/v1/openai/v1/chat/completions\",\n \"model_id\": \"llama3.2:3b\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_llama" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "LlamaTaskType", + "namespace": "inference._types" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "llama_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [ + { + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "name": "timeout", + "required": false, + "serverDefault": "30s", + "type": { + "kind": "instance_of", + "type": { + "name": "Duration", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_llama/PutLlamaRequest.ts#L30-L79" }, { "kind": "response", "body": { "kind": "value", "codegenName": "endpoint_info", "value": { "kind": "instance_of", "type": { "name": "InferenceEndpointInfoLlama", "namespace": "inference._types" } } }, "name": { "name": "Response", "namespace": "inference.put_llama" }, "specLocation": "inference/put_llama/PutLlamaResponse.ts#L22-L25" }, { "kind": "request", "attachedBehaviors": [ diff --git a/output/typescript/types.ts b/output/typescript/types.ts index ecae7392da..9d45d3b76e 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -14134,6 +14134,11 @@ export interface InferenceInferenceEndpointInfoJinaAi extends InferenceInference task_type: InferenceTaskTypeJinaAi } +export interface InferenceInferenceEndpointInfoLlama extends 
InferenceInferenceEndpoint { + inference_id: string + task_type: InferenceTaskTypeLlama +} + export interface InferenceInferenceEndpointInfoMistral extends InferenceInferenceEndpoint { inference_id: string task_type: InferenceTaskTypeMistral @@ -14184,6 +14189,20 @@ export type InferenceJinaAITaskType = 'rerank' | 'text_embedding' export type InferenceJinaAITextEmbeddingTask = 'classification' | 'clustering' | 'ingest' | 'search' +export interface InferenceLlamaServiceSettings { + url: string + model_id: string + max_input_tokens?: integer + similarity?: InferenceLlamaSimilarityType + rate_limit?: InferenceRateLimitSetting +} + +export type InferenceLlamaServiceType = 'llama' + +export type InferenceLlamaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm' + +export type InferenceLlamaTaskType = 'text_embedding' | 'completion' | 'chat_completion' + export interface InferenceMessage { content?: InferenceMessageContent role: string @@ -14294,6 +14313,8 @@ export type InferenceTaskTypeHuggingFace = 'chat_completion' | 'completion' | 'r export type InferenceTaskTypeJinaAi = 'text_embedding' | 'rerank' +export type InferenceTaskTypeLlama = 'text_embedding' | 'chat_completion' | 'completion' + export type InferenceTaskTypeMistral = 'text_embedding' | 'chat_completion' | 'completion' export type InferenceTaskTypeOpenAI = 'text_embedding' | 'chat_completion' | 'completion' @@ -14636,6 +14657,19 @@ export interface InferencePutJinaaiRequest extends RequestBase { export type InferencePutJinaaiResponse = InferenceInferenceEndpointInfoJinaAi +export interface InferencePutLlamaRequest extends RequestBase { + task_type: InferenceLlamaTaskType + llama_inference_id: Id + timeout?: Duration + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferenceLlamaServiceType + service_settings: InferenceLlamaServiceSettings + } +} + +export type InferencePutLlamaResponse = InferenceInferenceEndpointInfoLlama + export interface InferencePutMistralRequest extends RequestBase { task_type: InferenceMistralTaskType mistral_inference_id: Id diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv index d41dea4903..6e4281cd88 100644 --- a/specification/_doc_ids/table.csv +++ b/specification/_doc_ids/table.csv @@ -374,6 +374,7 @@ inference-api-put-googleaistudio,https://www.elastic.co/docs/api/doc/elasticsear inference-api-put-googlevertexai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-google-vertex-ai.html, inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-hugging-face,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-hugging-face.html, inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,, +inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,, inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html, inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html, 
inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,, @@ -403,6 +404,7 @@ knn-inner-hits,https://www.elastic.co/docs/solutions/search/vector/knn#nested-kn license-management,https://www.elastic.co/docs/deploy-manage/license/manage-your-license-in-self-managed-cluster,, list-analytics-collection,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-search-application-get-behavioral-analytics,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-analytics-collection.html, list-synonyms-sets,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-synonyms-get-synonyms-sets,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-synonyms-sets.html, +llama-api-models,https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html,, logstash-api-delete-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-delete-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-delete-pipeline.html, logstash-api-get-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-get-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-get-pipeline.html, logstash-api-put-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-put-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-put-pipeline.html, diff --git a/specification/_json_spec/inference.put_llama.json b/specification/_json_spec/inference.put_llama.json new file mode 100644 index 0000000000..5551f655cb --- /dev/null +++ b/specification/_json_spec/inference.put_llama.json @@ -0,0 +1,35 @@ +{ + "inference.put_llama": { + "documentation": { + "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-llama.html", + "description": "Configure a Llama inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{llama_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "string", + "description": "The task type" + }, + "llama_inference_id": { + "type": "string", + "description": "The inference ID" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings" + } + } +} diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts index 2bbe2ac3b8..e960e06353 100644 --- a/specification/inference/_types/CommonTypes.ts +++ b/specification/inference/_types/CommonTypes.ts @@ -1556,6 +1556,54 @@ export enum JinaAITextEmbeddingTask { search } +export class LlamaServiceSettings { + /** + * The URL of the Llama stack endpoint. + * The URL must contain: + * * For the `text_embedding` task - `/v1/inference/embeddings`. + * * For the `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`. + */ + url: string + /** + * The name of the model to use for the inference task. + * Refer to the Llama model download documentation for ways to list the available models and download them. + * The service has been tested and confirmed to work with the following models: + * * For the `text_embedding` task - `all-MiniLM-L6-v2`. + * * For the `completion` and `chat_completion` tasks - `llama3.2:3b`. + * @ext_doc_id llama-api-models + */ + model_id: string + /** + * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs. + */ + max_input_tokens?: integer + /** + * For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, or `l2_norm`. + */ + similarity?: LlamaSimilarityType + /** + * This setting helps to minimize the number of rate limit errors returned from the Llama API. + * By default, the `llama` service sets the number of requests allowed per minute to 3000. + */ + rate_limit?: RateLimitSetting +} + +export enum LlamaTaskType { + text_embedding, + completion, + chat_completion +} + +export enum LlamaServiceType { + llama +} + +export enum LlamaSimilarityType { + cosine, + dot_product, + l2_norm +} + export class MistralServiceSettings { /** * A valid API key of your Mistral account. diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts index 5dd1d2ea4a..20a95fed97 100644 --- a/specification/inference/_types/Services.ts +++ b/specification/inference/_types/Services.ts @@ -37,6 +37,7 @@ import { TaskTypeGoogleVertexAI, TaskTypeHuggingFace, TaskTypeJinaAi, + TaskTypeLlama, TaskTypeMistral, TaskTypeOpenAI, TaskTypeVoyageAI, @@ -254,6 +255,17 @@ export class InferenceEndpointInfoJinaAi extends InferenceEndpoint { task_type: TaskTypeJinaAi } +export class InferenceEndpointInfoLlama extends InferenceEndpoint { + /** + * The inference Id + */ + inference_id: string + /** + * The task type + */ + task_type: TaskTypeLlama +} + export class InferenceEndpointInfoMistral extends InferenceEndpoint { /** * The inference Id @@ -379,6 +391,7 @@ export class RateLimitSetting { * * `googlevertexai` service: `30000` * * `hugging_face` service: `3000` * * `jinaai` service: `2000` + * * `llama` service: `3000` * * `mistral` service: `240` * * `openai` service and task type `text_embedding`: `3000` * * `openai` service and task type `completion`: `500` diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts index a46bff5638..5e76973a74 100644 --- a/specification/inference/_types/TaskType.ts +++ b/specification/inference/_types/TaskType.ts @@ -118,6 +118,12 @@ export enum TaskTypeHuggingFace { text_embedding } +export enum TaskTypeLlama { + text_embedding, + chat_completion, + completion +} + export enum TaskTypeMistral { text_embedding, chat_completion, diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts index 3ed581fd6c..fbb53135cf 100644 --- a/specification/inference/put/PutRequest.ts +++ b/specification/inference/put/PutRequest.ts @@ -45,6 +45,7 @@ import { TaskType } from '@inference/_types/TaskType' * * Google AI Studio (`completion`, `text_embedding`) * * Google Vertex AI (`rerank`, `text_embedding`) * * Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`) + * * Llama (`chat_completion`, `completion`, `text_embedding`) * * Mistral (`chat_completion`, `completion`, `text_embedding`) * * OpenAI (`chat_completion`, `completion`, `text_embedding`) * * VoyageAI (`text_embedding`, `rerank`) diff --git a/specification/inference/put_llama/PutLlamaRequest.ts b/specification/inference/put_llama/PutLlamaRequest.ts new file mode 100644 index 0000000000..966f83cc19 --- /dev/null +++ b/specification/inference/put_llama/PutLlamaRequest.ts @@ -0,0 +1,79 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { RequestBase } from '@_types/Base' +import { Id } from '@_types/common' +import { Duration } from '@_types/Time' +import { + LlamaServiceSettings, + LlamaServiceType, + LlamaTaskType +} from '@inference/_types/CommonTypes' +import { InferenceChunkingSettings } from '@inference/_types/Services' + +/** + * Create a Llama inference endpoint. + * + * Create an inference endpoint to perform an inference task with the `llama` service. + * @rest_spec_name inference.put_llama + * @availability stack since=9.2.0 stability=stable visibility=public + * @availability serverless stability=stable visibility=public + * @cluster_privileges manage_inference + * @doc_id inference-api-put-llama + */ +export interface Request extends RequestBase { + urls: [ + { + path: '/_inference/{task_type}/{llama_inference_id}' + methods: ['PUT'] + } + ] + path_parts: { + /** + * The type of the inference task that the model will perform. + */ + task_type: LlamaTaskType + /** + * The unique identifier of the inference endpoint. + */ + llama_inference_id: Id + } + query_parameters: { + /** + * Specifies the amount of time to wait for the inference endpoint to be created. + * @server_default 30s + */ + timeout?: Duration + } + body: { + /** + * The chunking configuration object. + * @ext_doc_id inference-chunking + */ + chunking_settings?: InferenceChunkingSettings + /** + * The type of service supported for the specified task type. In this case, `llama`. + */ + service: LlamaServiceType + /** + * Settings used to install the inference model. These settings are specific to the `llama` service. + */ + service_settings: LlamaServiceSettings + } +} diff --git a/specification/inference/put_llama/PutLlamaResponse.ts b/specification/inference/put_llama/PutLlamaResponse.ts new file mode 100644 index 0000000000..858e05875b --- /dev/null +++ b/specification/inference/put_llama/PutLlamaResponse.ts @@ -0,0 +1,25 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { InferenceEndpointInfoLlama } from '@inference/_types/Services' + +export class Response { + /** @codegen_name endpoint_info */ + body: InferenceEndpointInfoLlama +} diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml new file mode 100644 index 0000000000..709663248e --- /dev/null +++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml @@ -0,0 +1,13 @@ +# summary: +description: Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task. +method_request: 'PUT _inference/text_embedding/llama-text-embedding' +# type: "request" +value: |- + { + "service": "llama", + "service_settings": { + "url": "http://localhost:8321/v1/inference/embeddings", + "dimensions": 384, + "model_id": "all-MiniLM-L6-v2" + } + } diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml new file mode 100644 index 0000000000..6ec845bf59 --- /dev/null +++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml @@ -0,0 +1,12 @@ +# summary: +description: Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task. +method_request: 'PUT _inference/completion/llama-completion' +# type: "request" +value: |- + { + "service": "llama", + "service_settings": { + "url": "http://localhost:8321/v1/openai/v1/chat/completions", + "model_id": "llama3.2:3b" + } + } diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml new file mode 100644 index 0000000000..40124abab7 --- /dev/null +++ b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml @@ -0,0 +1,12 @@ +# summary: +description: Run `PUT _inference/chat_completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task. +method_request: 'PUT _inference/chat_completion/llama-chat-completion' +# type: "request" +value: |- + { + "service": "llama", + "service_settings": { + "url": "http://localhost:8321/v1/openai/v1/chat/completions", + "model_id": "llama3.2:3b" + } + }
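As a usage sketch (not part of the spec files above): once one of these endpoints is created, it is invoked through the generic perform-inference API. Assuming the `llama-completion` endpoint from PutLlamaRequestExample2 exists and a Llama stack server is reachable at `localhost:8321`, a request along the following lines returns a completion; the `input` text is illustrative only.

POST _inference/completion/llama-completion
{
  "input": "What is an inference endpoint?"
}

For the `chat_completion` task type, the endpoint is instead called through the streaming chat completion API (`POST _inference/chat_completion/{inference_id}/_stream`) with an OpenAI-style `messages` body, which matches the `/v1/openai/v1/chat/completions` URL the service settings require for that task.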