Update inference API specification to include new Llama Service (#5020)

Jan-Kazlouski-elastic · web-flow · commit 68ebcbe1997d · 2025-08-04T18:35:30.000+03:00
* Update inference API specification to include new Llama Service

* Fix typos

* Fixed Typo

* Update json outputs

* Update specification

* Update llama specification
diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
diff --git a/output/schema/schema.json b/output/schema/schema.json
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv
@@ -374,6 +374,7 @@ inference-api-put-googleaistudio,https://www.elastic.co/docs/api/doc/elasticsear
 inference-api-put-googlevertexai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-google-vertex-ai.html,
 inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-hugging-face,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-hugging-face.html,
 inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,,
+inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,,
 inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html,
 inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html,
 inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,,
@@ -403,6 +404,7 @@ knn-inner-hits,https://www.elastic.co/docs/solutions/search/vector/knn#nested-kn
 license-management,https://www.elastic.co/docs/deploy-manage/license/manage-your-license-in-self-managed-cluster,,
 list-analytics-collection,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-search-application-get-behavioral-analytics,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-analytics-collection.html,
 list-synonyms-sets,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-synonyms-get-synonyms-sets,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/list-synonyms-sets.html,
+llama-api-models,https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html,,
 logstash-api-delete-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-delete-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-delete-pipeline.html,
 logstash-api-get-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-get-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-get-pipeline.html,
 logstash-api-put-pipeline,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-logstash-put-pipeline,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/logstash-api-put-pipeline.html,
diff --git a/specification/_json_spec/inference.put_llama.json b/specification/_json_spec/inference.put_llama.json
@@ -0,0 +1,35 @@
+{
+  "inference.put_llama": {
+    "documentation": {
+      "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-llama.html",
+      "description": "Configure a Llama inference endpoint"
+    },
+    "stability": "stable",
+    "visibility": "public",
+    "headers": {
+      "accept": ["application/json"],
+      "content_type": ["application/json"]
+    },
+    "url": {
+      "paths": [
+        {
+          "path": "/_inference/{task_type}/{llama_inference_id}",
+          "methods": ["PUT"],
+          "parts": {
+            "task_type": {
+              "type": "string",
+              "description": "The task type"
+            },
+            "llama_inference_id": {
+              "type": "string",
+              "description": "The inference ID"
+            }
+          }
+        }
+      ]
+    },
+    "body": {
+      "description": "The inference endpoint's task and service settings"
+    }
+  }
+}
diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts
@@ -1556,6 +1556,54 @@ export enum JinaAITextEmbeddingTask {
   search
 }
 
+export class LlamaServiceSettings {
+  /**
+   * The URL of the Llama Stack endpoint.
+   * URL must contain:
+   * * For `text_embedding` task - `/v1/inference/embeddings`.
+   * * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.
+   */
+  url: string
+  /**
+   * The name of the model to use for the inference task.
+   * Refer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.
+   * Service has been tested and confirmed to be working with the following models:
+   * * For `text_embedding` task - `all-MiniLM-L6-v2`.
+   * * For `completion` and `chat_completion` tasks - `llama3.2:3b`.
+   * @ext_doc_id llama-api-models
+   */
+  model_id: string
+  /**
+   * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.
+   */
+  max_input_tokens?: integer
+  /**
+   * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm.
+   */
+  similarity?: LlamaSimilarityType
+  /**
+   * This setting helps to minimize the number of rate limit errors returned from the Llama API.
+   * By default, the `llama` service sets the number of requests allowed per minute to 3000.
+   */
+  rate_limit?: RateLimitSetting
+}
+
+export enum LlamaTaskType {
+  text_embedding,
+  completion,
+  chat_completion
+}
+
+export enum LlamaServiceType {
+  llama
+}
+
+export enum LlamaSimilarityType {
+  cosine,
+  dot_product,
+  l2_norm
+}
+
 export class MistralServiceSettings {
   /**
    * A valid API key of your Mistral account.
diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts
@@ -37,6 +37,7 @@ import {
   TaskTypeGoogleVertexAI,
   TaskTypeHuggingFace,
   TaskTypeJinaAi,
+  TaskTypeLlama,
   TaskTypeMistral,
   TaskTypeOpenAI,
   TaskTypeVoyageAI,
@@ -254,6 +255,17 @@ export class InferenceEndpointInfoJinaAi extends InferenceEndpoint {
   task_type: TaskTypeJinaAi
 }
 
+export class InferenceEndpointInfoLlama extends InferenceEndpoint {
+  /**
+   * The inference Id
+   */
+  inference_id: string
+  /**
+   * The task type
+   */
+  task_type: TaskTypeLlama
+}
+
 export class InferenceEndpointInfoMistral extends InferenceEndpoint {
   /**
    * The inference Id
@@ -379,6 +391,7 @@ export class RateLimitSetting {
    * * `googlevertexai` service: `30000`
    * * `hugging_face` service: `3000`
    * * `jinaai` service: `2000`
+   * * `llama` service: `3000`
    * * `mistral` service: `240`
    * * `openai` service and task type `text_embedding`: `3000`
    * * `openai` service and task type `completion`: `500`
diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts
@@ -118,6 +118,12 @@ export enum TaskTypeHuggingFace {
   text_embedding
 }
 
+export enum TaskTypeLlama {
+  text_embedding,
+  chat_completion,
+  completion
+}
+
 export enum TaskTypeMistral {
   text_embedding,
   chat_completion,
diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts
@@ -45,6 +45,7 @@ import { TaskType } from '@inference/_types/TaskType'
  * * Google AI Studio (`completion`, `text_embedding`)
  * * Google Vertex AI (`rerank`, `text_embedding`)
  * * Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)
+ * * Llama (`chat_completion`, `completion`, `text_embedding`)
  * * Mistral (`chat_completion`, `completion`, `text_embedding`)
  * * OpenAI (`chat_completion`, `completion`, `text_embedding`)
  * * VoyageAI (`text_embedding`, `rerank`)
diff --git a/specification/inference/put_llama/PutLlamaRequest.ts b/specification/inference/put_llama/PutLlamaRequest.ts
@@ -0,0 +1,79 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { RequestBase } from '@_types/Base'
+import { Id } from '@_types/common'
+import { Duration } from '@_types/Time'
+import {
+  LlamaServiceSettings,
+  LlamaServiceType,
+  LlamaTaskType
+} from '@inference/_types/CommonTypes'
+import { InferenceChunkingSettings } from '@inference/_types/Services'
+
+/**
+ * Create a Llama inference endpoint.
+ *
+ * Create an inference endpoint to perform an inference task with the `llama` service.
+ * @rest_spec_name inference.put_llama
+ * @availability stack since=9.2.0 stability=stable visibility=public
+ * @availability serverless stability=stable visibility=public
+ * @cluster_privileges manage_inference
+ * @doc_id inference-api-put-llama
+ */
+export interface Request extends RequestBase {
+  urls: [
+    {
+      path: '/_inference/{task_type}/{llama_inference_id}'
+      methods: ['PUT']
+    }
+  ]
+  path_parts: {
+    /**
+     * The type of the inference task that the model will perform.
+     */
+    task_type: LlamaTaskType
+    /**
+     * The unique identifier of the inference endpoint.
+     */
+    llama_inference_id: Id
+  }
+  query_parameters: {
+    /**
+     * Specifies the amount of time to wait for the inference endpoint to be created.
+     * @server_default 30s
+     */
+    timeout?: Duration
+  }
+  body: {
+    /**
+     * The chunking configuration object.
+     * @ext_doc_id inference-chunking
+     */
+    chunking_settings?: InferenceChunkingSettings
+    /**
+     * The type of service supported for the specified task type. In this case, `llama`.
+     */
+    service: LlamaServiceType
+    /**
+     * Settings used to install the inference model. These settings are specific to the `llama` service.
+     */
+    service_settings: LlamaServiceSettings
+  }
+}
diff --git a/specification/inference/put_llama/PutLlamaResponse.ts b/specification/inference/put_llama/PutLlamaResponse.ts
@@ -0,0 +1,25 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { InferenceEndpointInfoLlama } from '@inference/_types/Services'
+
+export class Response {
+  /** @codegen_name endpoint_info */
+  body: InferenceEndpointInfoLlama
+}
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample1.yaml
@@ -0,0 +1,13 @@
+# summary:
+description: Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.
+method_request: 'PUT _inference/text_embedding/llama-text-embedding'
+# type: "request"
+value: |-
+  {
+    "service": "llama",
+    "service_settings": {
+      "url": "http://localhost:8321/v1/inference/embeddings",
+      "dimensions": 384,
+      "model_id": "all-MiniLM-L6-v2" 
+    }
+  }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample2.yaml
@@ -0,0 +1,12 @@
+# summary:
+description: Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.
+method_request: 'PUT _inference/completion/llama-completion'
+# type: "request"
+value: |-
+  {
+    "service": "llama",
+    "service_settings": {
+      "url": "http://localhost:8321/v1/openai/v1/chat/completions",
+      "model_id": "llama3.2:3b" 
+    }
+  }
diff --git a/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml b/specification/inference/put_llama/examples/request/PutLlamaRequestExample3.yaml
@@ -0,0 +1,12 @@
+# summary:
+description: Run `PUT _inference/chat_completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.
+method_request: 'PUT _inference/chat_completion/llama-chat-completion'
+# type: "request"
+value: |-
+  {
+    "service": "llama",
+    "service_settings": {
+      "url": "http://localhost:8321/v1/openai/v1/chat/completions",
+      "model_id": "llama3.2:3b" 
+    }
+  }