---
name: api
description: Direct REST API operations for Ollama using the requests library. Covers all /api/* endpoints for model management, text generation, chat completion, embeddings, and streaming responses.
---
# Ollama REST API

## Overview

The Ollama REST API provides direct HTTP access to all Ollama functionality. Use the `requests` library for maximum control over API interactions.

**Default endpoint:** `http://localhost:11434` (or `http://ollama:11434` in containers)
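A quick way to confirm the server is reachable before making API calls: the server root serves a plain-text status banner. A minimal sketch (adjust the host for your setup):

```python
import requests

# The root path returns "Ollama is running" when the daemon is up.
r = requests.get("http://localhost:11434", timeout=5)
print(r.status_code, r.text)
```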
## Quick Reference

| Endpoint | Method | Purpose |
|---|---|---|
| /api/tags | GET | List available models |
| /api/show | POST | Show model details |
| /api/ps | GET | List running models |
| /api/generate | POST | Generate text |
| /api/chat | POST | Chat completion |
| /api/embed | POST | Generate embeddings |
| /api/copy | POST | Copy a model |
| /api/delete | DELETE | Delete a model |
## Setup

```python
import os
import json

import requests

OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
```
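If you make many calls in a row, a `requests.Session` is an optional refinement that reuses the underlying TCP connection:

```python
# Optional: a Session reuses the connection across repeated calls.
session = requests.Session()
response = session.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
```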
## List Models

```python
response = requests.get(f"{OLLAMA_HOST}/api/tags")
models = response.json()

for model in models.get("models", []):
    size_gb = model.get("size", 0) / (1024**3)
    print(f" - {model['name']} ({size_gb:.2f} GB)")
```
## Show Model Details

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/show",
    json={"model": "llama3.2:latest"}
)
model_info = response.json()
details = model_info.get("details", {})

print(f"Family: {details.get('family', 'N/A')}")
print(f"Parameter Size: {details.get('parameter_size', 'N/A')}")
print(f"Quantization: {details.get('quantization_level', 'N/A')}")
```
## List Running Models

```python
response = requests.get(f"{OLLAMA_HOST}/api/ps")
running = response.json()

for model in running.get("models", []):
    name = model.get("name", "Unknown")
    size = model.get("size", 0) / (1024**3)
    vram = model.get("size_vram", 0) / (1024**3)
    print(f" - {name}: {size:.2f} GB (VRAM: {vram:.2f} GB)")
```
## Generate Text

### Non-Streaming

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={
        "model": "llama3.2:latest",
        "prompt": "Why is the sky blue?",
        "stream": False
    }
)
result = response.json()
print(result["response"])
```
### Streaming

With `"stream": true`, the server returns newline-delimited JSON objects. Pass `stream=True` to `requests` as well so the body is consumed incrementally:

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={
        "model": "llama3.2:latest",
        "prompt": "Count from 1 to 5.",
        "stream": True
    },
    stream=True
)

for line in response.iter_lines():
    if line:
        chunk = json.loads(line)
        print(chunk.get("response", ""), end="", flush=True)
        if chunk.get("done"):
            break
```
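The same loop can accumulate the chunks into a full string; the final chunk (`done: true`) also carries the timing metrics described under Response Metrics below. A self-contained sketch:

```python
# Accumulate streamed chunks and keep the final one, which carries
# the timing metrics once "done" is true.
full_text, final_chunk = "", None
with requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={"model": "llama3.2:latest", "prompt": "Count from 1 to 5.", "stream": True},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        full_text += chunk.get("response", "")
        if chunk.get("done"):
            final_chunk = chunk
            break

print(full_text)
```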
## Chat Completion

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/chat",
    json={
        "model": "llama3.2:latest",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is Python?"}
        ],
        "stream": False
    }
)
result = response.json()
print(result["message"]["content"])
```
## Generate Embeddings

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/embed",
    json={
        "model": "llama3.2:latest",
        "input": "Ollama makes running LLMs locally easy."
    }
)
result = response.json()
embeddings = result.get("embeddings", [[]])[0]
print(f"Dimensions: {len(embeddings)}")
```
## Copy Model

```python
response = requests.post(
    f"{OLLAMA_HOST}/api/copy",
    json={
        "source": "llama3.2:latest",
        "destination": "llama3.2-backup:latest"
    }
)
if response.status_code == 200:
    print("Copy successful!")
```
## Delete Model

```python
response = requests.delete(
    f"{OLLAMA_HOST}/api/delete",
    json={"model": "llama3.2-backup:latest"}
)
if response.status_code == 200:
    print("Delete successful!")
```
## Error Handling

```python
try:
    response = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        json={"model": "nonexistent", "prompt": "Hello"},
        timeout=30
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
    else:
        result = response.json()
        if "error" in result:
            print(f"API Error: {result['error']}")
except requests.exceptions.ConnectionError:
    print("Cannot connect to Ollama. Ensure the server is running at OLLAMA_HOST.")
except requests.exceptions.Timeout:
    print("Request timed out")
```
## Connection Health Check

```python
def check_ollama_health(model="llama3.2:latest"):
    """Check if the Ollama server is running and the model is available."""
    try:
        response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
        if response.status_code == 200:
            models = response.json()
            model_names = [m.get("name", "") for m in models.get("models", [])]
            return True, model in model_names
        return False, False
    except requests.exceptions.RequestException:
        return False, False

server_ok, model_ok = check_ollama_health()
```
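A typical startup guard built on the helper:

```python
if not server_ok:
    raise SystemExit(f"Ollama is not reachable at {OLLAMA_HOST}")
if not model_ok:
    print("Server is up, but the model has not been pulled yet")
```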
## Response Metrics

When `stream` is false (or in the final streaming chunk), the generate and chat endpoints return timing metrics; all durations are in nanoseconds:

```python
result = response.json()
print(f"Prompt eval count: {result.get('prompt_eval_count', 'N/A')}")
print(f"Prompt eval duration: {result.get('prompt_eval_duration', 0) / 1e9:.3f}s")
print(f"Eval count (tokens): {result.get('eval_count', 'N/A')}")
print(f"Eval duration: {result.get('eval_duration', 0) / 1e9:.3f}s")
print(f"Total duration: {result.get('total_duration', 0) / 1e9:.3f}s")

if result.get('eval_count') and result.get('eval_duration'):
    tokens_per_sec = result['eval_count'] / (result['eval_duration'] / 1e9)
    print(f"Tokens/second: {tokens_per_sec:.1f}")
```
## When to Use This Skill
Use when:
- You need direct control over HTTP requests
- Debugging API interactions
- Building custom integrations
- Working with streaming responses
- Checking raw API responses
## Cross-References

- `bazzite-ai-ollama:python` - Higher-level Python library
- `bazzite-ai-ollama:openai` - OpenAI-compatible interface