openapi: 3.1.0
# https://github.com/davidmigloz/langchain_dart/blob/main/packages/ollama_dart/oas/ollama-curated.yaml

# General metadata for this API description.
info:
  title: Ollama API
  version: 0.1.9
  description: API Spec for Ollama API. Please see https://github.com/jmorganca/ollama/blob/main/docs/api.md for more details.

#servers:
#  - url: http://localhost:11434/api
#    description: Ollama server URL

# Tag groups referenced by the `tags` lists of the operations under `paths`.
tags:
  - name: Completions
    description: Given a prompt, the model will generate a completion.
  - name: Chat
    description: Given a list of messages comprising a conversation, the model will return a response.
  - name: Embeddings
    description: Get a vector representation of a given input.
  - name: Models
    description: List and describe the various models available.

paths:
  # Prompt completion. The 200 response is declared as newline-delimited JSON.
  /generate:
    post:
      summary: Generate a response for a given prompt with a provided model.
      description: The final response object will include statistics and additional data from the request.
      operationId: generateCompletion
      tags: [Completions]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateCompletionRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/x-ndjson:
              schema:
                $ref: '#/components/schemas/GenerateCompletionResponse'
  # Chat completion. The 200 response is declared as newline-delimited JSON.
  /chat:
    post:
      summary: Generate the next message in a chat with a provided model.
      description: This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
      operationId: generateChatCompletion
      tags: [Chat]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateChatCompletionRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/x-ndjson:
              schema:
                $ref: '#/components/schemas/GenerateChatCompletionResponse'
  # Embedding generation. Request and response are both plain JSON documents.
  /embeddings:
    post:
      summary: Generate embeddings from a model.
      operationId: generateEmbedding
      tags: [Embeddings]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateEmbeddingRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GenerateEmbeddingResponse'
  # Model creation from a Modelfile. Status updates are declared as newline-delimited JSON.
  /create:
    post:
      summary: Create a model from a Modelfile.
      description: It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation should also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using Create a Blob and the value to the path indicated in the response.
      operationId: createModel
      tags: [Models]
      requestBody:
        description: Create a new model from a Modelfile.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateModelRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/x-ndjson:
              schema:
                $ref: '#/components/schemas/CreateModelResponse'
  # Listing of locally available models; takes no request body.
  /tags:
    get:
      summary: List models that are available locally.
      operationId: listModels
      tags: [Models]
      responses:
        '200':
          description: Successful operation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelsResponse'
  # Model detail lookup; the model is identified in the JSON request body.
  /show:
    post:
      summary: Show details about a model including modelfile, template, parameters, license, and system prompt.
      operationId: showModelInfo
      tags: [Models]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ModelInfoRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelInfo'
  # Model copy; the 200 response declares no body.
  /copy:
    post:
      summary: Creates a model with another name from an existing model.
      operationId: copyModel
      tags: [Models]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CopyModelRequest'
      responses:
        '200':
          description: Successful operation.
  # Model deletion; uses the DELETE verb with a JSON request body naming the model.
  /delete:
    delete:
      summary: Delete a model and its data.
      operationId: deleteModel
      tags: [Models]
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DeleteModelRequest'
      responses:
        '200':
          description: Successful operation.
  # Model download. PullModelResponse is documented as a series of progress
  # objects, so the response is declared as newline-delimited JSON, matching
  # the other streaming operations in this file (/generate, /chat, /create).
  /pull:
    post:
      operationId: pullModel
      tags:
        - Models
      summary: Download a model from the ollama library.
      description: Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/PullModelRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/x-ndjson:
              schema:
                $ref: '#/components/schemas/PullModelResponse'
  # Model upload. The request exposes a `stream` flag and the response carries
  # progress statuses, so the response is declared as newline-delimited JSON,
  # matching the other streaming operations in this file (/generate, /chat, /create).
  /push:
    post:
      operationId: pushModel
      tags:
        - Models
      summary: Upload a model to a model library.
      description: Requires registering for ollama.ai and adding a public key first.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/PushModelRequest'
      responses:
        '200':
          description: Successful operation.
          content:
            application/x-ndjson:
              schema:
                $ref: '#/components/schemas/PushModelResponse'
  "/blobs/{digest}":
    head:
      operationId: checkBlob
      tags:
        - Models
      summary: Check to see if a blob exists on the Ollama server which is useful when creating models.
      parameters:
        - in: path
          name: digest
          schema:
            type: string
          required: true
          description: the SHA256 digest of the blob
          example: sha256:c8edda1f17edd2f1b60253b773d837bda7b9d249a61245931a4d7c9a8d350250
      responses:
        '200':
          description: Blob exists on the server
        '404':
          description: Blob was not found
    post:
      operationId: createBlob
      tags:
        - Models
      summary: Create a blob from a file. Returns the server file path.
      parameters:
        - in: path
          name: digest
          schema:
            type: string
          required: true
          description: the SHA256 digest of the blob
          example: sha256:c8edda1f17edd2f1b60253b773d837bda7b9d249a61245931a4d7c9a8d350250
      requestBody:
        content:
          application/octet-stream:
            schema:
              type: string
              format: binary
      responses:
        '201':
          description: Blob was successfully created

components:
  schemas:
    # Request body of the generate operation. `model` and `prompt` are required.
    # This schema also defines the YAML anchors `model_name`, `stream` and
    # `keep_alive`, which other schemas in this file reuse through aliases;
    # keep the anchors here (or move them earlier) if this schema is edited.
    GenerateCompletionRequest:
      type: object
      description: Request class for the generate endpoint.
      properties:
        model:
          type: string
          description: &model_name |
            The model name.

            Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
          example: llama2:7b
        prompt:
          type: string
          description: The prompt to generate a response.
          example: Why is the sky blue?
        images:
          type: array
          description: (optional) a list of Base64-encoded images to include in the message (for multimodal models such as llava)
          items:
            type: string
            contentEncoding: base64
            description: Base64-encoded image (for multimodal models such as llava)
            example: iVBORw0KGgoAAAANSUhEUgAAAAkAAAANCAIAAAD0YtNRAAAABnRSTlMA/AD+APzoM1ogAAAAWklEQVR4AWP48+8PLkR7uUdzcMvtU8EhdykHKAciEXL3pvw5FQIURaBDJkARoDhY3zEXiCgCHbNBmAlUiyaBkENoxZSDWnOtBmoAQu7TnT+3WuDOA7KBIkAGAGwiNeqjusp/AAAAAElFTkSuQmCC
        system:
          type: string
          description: The system prompt to use (overrides what is defined in the Modelfile).
        template:
          type: string
          description: The full prompt or prompt template (overrides what is defined in the Modelfile).
        context:
          type: array
          description: The context parameter returned from a previous request to [generateCompletion], this can be used to keep a short conversational memory.
          items:
            type: integer
        options:
          $ref: '#/components/schemas/RequestOptions'
        format:
          $ref: '#/components/schemas/ResponseFormat'
        raw:
          type: boolean
          description: |
            If `true` no formatting will be applied to the prompt and no context will be returned.

            You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
        stream:
          type: boolean
          description: &stream |
            If `false` the response will be returned as a single response object, otherwise the response will be streamed as a series of objects.
          default: false
        keep_alive:
          type: integer
          description: &keep_alive |
            How long (in minutes) to keep the model loaded in memory.

            - If set to a positive duration (e.g. 20), the model will stay loaded for the provided duration.
            - If set to a negative duration (e.g. -1), the model will stay loaded indefinitely.
            - If set to 0, the model will be unloaded immediately once finished.
            - If not set, the model will stay loaded for 5 minutes by default
      required:
        - model
        - prompt
    # Optional model parameters referenced as `options` by the generate, chat
    # and embedding request schemas. No property is required.
    RequestOptions:
      type: object
      description: Additional model parameters listed in the documentation for the Modelfile such as `temperature`.
      properties:
        # Token-generation and sampling parameters.
        num_keep:
          type: integer
          description: |
            Number of tokens to keep from the prompt.
        seed:
          type: integer
          description: |
            Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0)
        num_predict:
          type: integer
          description: |
            Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)
        top_k:
          type: integer
          description: |
            Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
        top_p:
          type: number
          format: float
          description: |
            Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
        tfs_z:
          type: number
          format: float
          description: |
            Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1)
        typical_p:
          type: number
          format: float
          description: |
            Typical p is used to reduce the impact of less probable tokens from the output.
        repeat_last_n:
          type: integer
          description: |
            Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
        temperature:
          type: number
          format: float
          description: |
            The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)
        repeat_penalty:
          type: number
          format: float
          description: |
            Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
        presence_penalty:
          type: number
          format: float
          description: |
            Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
        frequency_penalty:
          type: number
          format: float
          description: |
            Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
        mirostat:
          type: integer
          description: |
            Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
        mirostat_tau:
          type: number
          format: float
          description: |
            Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)
        mirostat_eta:
          type: number
          format: float
          description: |
            Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)
        penalize_newline:
          type: boolean
          description: |
            Penalize newlines in the output. (Default: false)
        stop:
          type: array
          description: Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
          items:
            type: string
        # Model loading and runtime settings (NUMA, context size, GPU offload,
        # memory mapping/locking, RoPE frequency, thread count).
        numa:
          type: boolean
          description: |
            Enable NUMA support. (Default: false)
        num_ctx:
          type: integer
          description: |
            Sets the size of the context window used to generate the next token.
        num_batch:
          type: integer
          description: |
            Sets the number of batches to use for generation. (Default: 1)
        num_gqa:
          type: integer
          description: |
            The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for `llama2:70b`.
        num_gpu:
          type: integer
          description: |
            The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable.
        main_gpu:
          type: integer
          description: |
            The GPU to use for the main model. Default is 0.
        low_vram:
          type: boolean
          description: |
            Enable low VRAM mode. (Default: false)
        f16_kv:
          type: boolean
          description: |
            Enable f16 key/value. (Default: false)
        logits_all:
          type: boolean
          description: |
            Enable logits all. (Default: false)
        vocab_only:
          type: boolean
          description: |
            Enable vocab only. (Default: false)
        use_mmap:
          type: boolean
          description: |
            Enable mmap. (Default: false)
        use_mlock:
          type: boolean
          description: |
            Enable mlock. (Default: false)
        embedding_only:
          type: boolean
          description: |
            Enable embedding only. (Default: false)
        rope_frequency_base:
          type: number
          format: float
          description: |
            The base of the rope frequency scale. (Default: 1.0)
        rope_frequency_scale:
          type: number
          format: float
          description: |
            The scale of the rope frequency. (Default: 1.0)
        num_thread:
          type: integer
          description: |
            Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores).
    # String enum for the `format` request field; `json` is the only value declared.
    ResponseFormat:
      type: string
      description: |
        The format to return a response in. Currently the only accepted value is json.

        Enable JSON mode by setting the format parameter to json. This will structure the response as valid JSON.

        Note: it's important to instruct the model to use JSON in the prompt. Otherwise, the model may generate large amounts of whitespace.
      enum:
        - json
    # Response object of the generate operation. No property is marked required.
    GenerateCompletionResponse:
      type: object
      description: The response class for the generate endpoint.
      properties:
        model:
          type: string
          description: *model_name
          example: llama2:7b
        created_at:
          type: string
          format: date-time
          description: Date on which a model was created.
          # Quoted so YAML loaders with implicit timestamp typing keep it a
          # string, as required by `type: string`.
          example: '2023-08-04T19:22:45.499127Z'
        response:
          type: string
          description: The response for a given prompt with a provided model.
          example: The sky appears blue because of a phenomenon called Rayleigh scattering.
        done:
          type: boolean
          description: Whether the response has completed.
          example: true
        context:
          type: array
          description: |
            An encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory.
          items:
            type: integer
          example: [ 1, 2, 3 ]
        total_duration:
          type: integer
          description: Time spent generating the response.
          example: 5589157167
        load_duration:
          type: integer
          description: Time spent in nanoseconds loading the model.
          example: 3013701500
        prompt_eval_count:
          type: integer
          description: Number of tokens in the prompt.
          example: 46
        prompt_eval_duration:
          type: integer
          description: Time spent in nanoseconds evaluating the prompt.
          example: 1160282000
        eval_count:
          type: integer
          description: Number of tokens in the response.
          example: 113
        eval_duration:
          type: integer
          description: Time in nanoseconds spent generating the response.
          example: 1325948000
    # Request body of the chat operation. `model` and `messages` are mandatory.
    GenerateChatCompletionRequest:
      description: Request class for the chat endpoint.
      type: object
      required: [model, messages]
      properties:
        model:
          description: *model_name
          type: string
          example: llama2:7b
        messages:
          description: The messages of the chat, this can be used to keep a chat memory
          type: array
          items:
            $ref: '#/components/schemas/Message'
        format:
          $ref: '#/components/schemas/ResponseFormat'
        options:
          $ref: '#/components/schemas/RequestOptions'
        stream:
          description: *stream
          type: boolean
          default: false
        keep_alive:
          description: *keep_alive
          type: integer
    # Response object of the chat operation. No property is marked required.
    GenerateChatCompletionResponse:
      type: object
      description: The response class for the chat endpoint.
      properties:
        message:
          $ref: '#/components/schemas/Message'
        model:
          type: string
          description: *model_name
          example: llama2:7b
        created_at:
          type: string
          format: date-time
          description: Date on which a model was created.
          # Quoted so YAML loaders with implicit timestamp typing keep it a
          # string, as required by `type: string`.
          example: '2023-08-04T19:22:45.499127Z'
        done:
          type: boolean
          description: Whether the response has completed.
          example: true
        total_duration:
          type: integer
          description: Time spent generating the response.
          example: 5589157167
        load_duration:
          type: integer
          description: Time spent in nanoseconds loading the model.
          example: 3013701500
        prompt_eval_count:
          type: integer
          description: Number of tokens in the prompt.
          example: 46
        prompt_eval_duration:
          type: integer
          description: Time spent in nanoseconds evaluating the prompt.
          example: 1160282000
        eval_count:
          type: integer
          description: Number of tokens in the response.
          example: 113
        eval_duration:
          type: integer
          description: Time in nanoseconds spent generating the response.
          example: 1325948000
    # A single chat message. `role` and `content` are required; `images` is optional.
    Message:
      type: object
      description: A message in the chat endpoint
      properties:
        role:
          type: string
          description: The role of the message
          enum: [ "system", "user", "assistant" ]
        content:
          type: string
          description: The content of the message
          example: Why is the sky blue?
        images:
          type: array
          description: (optional) a list of Base64-encoded images to include in the message (for multimodal models such as llava)
          items:
            type: string
            # Declared to match the `images` items of GenerateCompletionRequest,
            # which carry the same Base64 payloads.
            contentEncoding: base64
            description: Base64-encoded image (for multimodal models such as llava)
            example: iVBORw0KGgoAAAANSUhEUgAAAAkAAAANCAIAAAD0YtNRAAAABnRSTlMA/AD+APzoM1ogAAAAWklEQVR4AWP48+8PLkR7uUdzcMvtU8EhdykHKAciEXL3pvw5FQIURaBDJkARoDhY3zEXiCgCHbNBmAlUiyaBkENoxZSDWnOtBmoAQu7TnT+3WuDOA7KBIkAGAGwiNeqjusp/AAAAAElFTkSuQmCC
      required:
        - role
        - content
    # Request body of the embeddings operation. `model` and `prompt` are mandatory.
    GenerateEmbeddingRequest:
      type: object
      description: Generate embeddings from a model.
      required: [model, prompt]
      properties:
        model:
          description: *model_name
          type: string
          example: llama2:7b
        prompt:
          description: Text to generate embeddings for.
          type: string
          example: 'Here is an article about llamas...'
        options:
          $ref: '#/components/schemas/RequestOptions'
    # Response object of the embeddings operation.
    GenerateEmbeddingResponse:
      type: object
      description: Returns the embedding information.
      properties:
        embedding:
          type: array
          description: The embedding for the prompt.
          items:
            type: number
            format: double
          # Shortened sample. Every element must be a number to satisfy `items`,
          # so no textual ellipsis placeholder is used.
          example: [ 0.5670403838157654, 0.009260174818336964 ]
    # Request body of the create operation. `name` and `modelfile` are required.
    CreateModelRequest:
      type: object
      description: Create model request object.
      properties:
        name:
          type: string
          description: *model_name
          example: mario
        modelfile:
          type: string
          description: The contents of the Modelfile.
          # Double-quoted so that `\n` is an actual line break between the two
          # Modelfile instructions rather than a literal backslash followed by `n`.
          example: "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
        stream:
          type: boolean
          description: *stream
          default: false
      required:
        - name
        - modelfile
    # Status object returned by the create operation.
    CreateModelResponse:
      type: object
      description: Response object for creating a model. When finished, `status` is `success`.
      properties:
        status:
          $ref: '#/components/schemas/CreateModelStatus'
    # Status values declared for CreateModelResponse.status.
    CreateModelStatus:
      description: Status creating the model
      type: string
      enum:
        - creating system layer
        - parsing modelfile
        - success
    # Wrapper object holding the array of locally available models.
    ModelsResponse:
      type: object
      description: Response class for the list models endpoint.
      properties:
        models:
          description: List of models available locally.
          type: array
          items:
            $ref: '#/components/schemas/Model'
    # One entry of ModelsResponse.models. No property is marked required.
    Model:
      type: object
      description: A model available locally.
      properties:
        name:
          type: string
          description: *model_name
          example: llama2:7b
        modified_at:
          type: string
          format: date-time
          description: Model modification date.
          # Quoted so YAML loaders with implicit timestamp typing keep it a
          # string, as required by `type: string`.
          example: '2023-08-02T17:02:23.713454393-07:00'
        size:
          type: integer
          description: Size of the model on disk.
          example: 7323310500
    # Request body of the show operation. `name` is mandatory.
    ModelInfoRequest:
      type: object
      description: Request class for the show model info endpoint.
      required: [name]
      properties:
        name:
          description: *model_name
          type: string
          example: llama2:7b
    # Response object of the show operation. All properties are plain strings.
    # The multi-line examples are double-quoted so that the `\n` and `\"`
    # escapes are interpreted; single-quoted YAML scalars would keep the
    # backslashes verbatim.
    ModelInfo:
      description: Details about a model including modelfile, template, parameters, license, and system prompt.
      type: object
      properties:
        license:
          type: string
          description: The model's license.
          example: <contents of license block>
        modelfile:
          type: string
          description: The modelfile associated with the model.
          example: "Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n"
        parameters:
          type: string
          description: The model parameters.
          example: "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>"
        template:
          type: string
          description: The prompt template for the model.
          example: "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST]"
    # Request body of the copy operation. Both model names are mandatory.
    CopyModelRequest:
      type: object
      description: Request class for copying a model.
      required: [source, destination]
      properties:
        source:
          description: Name of the model to copy.
          type: string
          example: llama2:7b
        destination:
          description: Name of the new model.
          type: string
          example: llama2-backup
    # Request body of the delete operation. `name` is mandatory.
    DeleteModelRequest:
      type: object
      description: Request class for deleting a model.
      required: [name]
      properties:
        name:
          description: *model_name
          type: string
          example: llama2:13b
    # Request body of the pull operation. Only `name` is mandatory.
    PullModelRequest:
      type: object
      description: Request class for pulling a model.
      required: [name]
      properties:
        name:
          description: *model_name
          type: string
          example: llama2:7b
        insecure:
          type: boolean
          default: false
          description: |
            Allow insecure connections to the library.

            Only use this if you are pulling from your own library during development.
        stream:
          description: *stream
          type: boolean
          default: false
    # Progress object emitted by the pull operation. No property is marked required.
    PullModelResponse:
      type: object
      description: |
        Response class for pulling a model.

        The first object is the manifest. Then there is a series of downloading responses. Until any of the download is completed, the `completed` key may not be included.

        The number of files to be downloaded depends on the number of layers specified in the manifest.
      properties:
        status:
          $ref: '#/components/schemas/PullModelStatus'
        digest:
          description: The model's digest.
          type: string
          example: 'sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711a'
        total:
          description: Total size of the model.
          type: integer
          example: 2142590208
        completed:
          description: Total bytes transferred.
          type: integer
          example: 2142590208
    # Status values declared for PullModelResponse.status.
    PullModelStatus:
      description: Status pulling the model.
      type: string
      example: pulling manifest
      enum:
        - pulling manifest
        - downloading digestname
        - verifying sha256 digest
        - writing manifest
        - removing any unused layers
        - success
    # Request body of the push operation. Only `name` is mandatory.
    PushModelRequest:
      type: object
      description: Request class for pushing a model.
      required: [name]
      properties:
        name:
          description: The name of the model to push in the form of <namespace>/<model>:<tag>.
          type: string
          example: 'mattw/pygmalion:latest'
        insecure:
          type: boolean
          default: false
          description: |
            Allow insecure connections to the library.

            Only use this if you are pushing to your library during development.
        stream:
          description: *stream
          type: boolean
          default: false
    # Progress object emitted by the push operation. No property is marked required.
    PushModelResponse:
      description: Response class for pushing a model.
      type: object
      properties:
        status:
          $ref: '#/components/schemas/PushModelStatus'
        digest:
          description: the model's digest
          type: string
          example: 'sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711a'
        total:
          description: total size of the model
          type: integer
          example: 2142590208
    # Status values declared for PushModelResponse.status.
    PushModelStatus:
      description: Status pushing the model.
      type: string
      enum:
        - retrieving manifest
        - starting upload
        - pushing manifest
        - success