Files
abot/llm/local_llm_client.py
2026-01-15 17:01:02 -08:00

155 lines
4.5 KiB
Python

import logging
import requests
import os
from typing import List, Dict, Any
import urllib3
# Disable SSL warnings (only for development with self-signed certs!)
# NOTE: this silences InsecureRequestWarning globally for the whole process,
# matching the verify=False usage in the request calls below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
# Get LLM configuration from environment
# Base URL of the Ollama-compatible server; endpoint paths are appended below.
LLM_API_URL = os.getenv("LLM_API_URL", "http://api.chat.pathcore.org")
LLM_API_KEY = os.getenv("LLM_API_KEY") # Optional API key
LLM_MODEL = os.getenv("LLM_MODEL", "llama3") # Default Ollama model
def chat_completion(
    messages: List[Dict[str, Any]],
    model: str | None = None,
    temperature: float = 0.7,
    max_tokens: int = 1000,
    tools: List[Dict[str, Any]] | None = None,
) -> Dict[str, Any]:
    """
    Call Ollama API for chat completion.

    Args:
        messages: List of message dicts with 'role' and 'content'
        model: Model name to use (defaults to LLM_MODEL from env)
        temperature: Sampling temperature (0.0 to 2.0)
        max_tokens: Maximum tokens to generate (mapped to Ollama's
            'num_predict' option)
        tools: Optional tool/function definitions, passed through to the
            API for function calling

    Returns:
        Dict with 'content' (response text), 'tool_calls' (list, possibly
        empty) and 'raw' (the full decoded API response)

    Raises:
        Exception: If the API call fails (timeout, connection, HTTP error,
            or an unparsable response)
    """
    # Single source of truth for the timeout so the error messages below
    # cannot drift out of sync with the actual request setting.
    request_timeout = 120  # seconds; Ollama can be slow on first request (model load)
    # Use provided model or fall back to env variable
    model = model or LLM_MODEL
    # Ollama chat endpoint (OpenAI-style message dicts are accepted as-is)
    url = f"{LLM_API_URL}/api/chat"
    logger.info("Calling Ollama API at %s with model: %s", url, model)
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        }
    }
    if tools:
        payload["tools"] = tools
    headers = {
        "Content-Type": "application/json"
    }
    # Add API key if configured (Ollama doesn't usually need this, but just in case)
    if LLM_API_KEY:
        headers["Authorization"] = f"Bearer {LLM_API_KEY}"
    try:
        # SECURITY: verify=False disables TLS certificate validation; only
        # acceptable for development against self-signed certs (see module top).
        resp = requests.post(
            url,
            json=payload,
            headers=headers,
            timeout=request_timeout,
            verify=False
        )
        # Raise exception for HTTP errors
        resp.raise_for_status()
        data = resp.json()
        logger.debug("Ollama API response: %s", data)
        # Extract the assistant's response from Ollama format
        # Ollama returns: {"message": {"role": "assistant", "content": "..."}}
        message = data.get("message", {})
        content = message.get("content", "")
        tool_calls = message.get("tool_calls", [])
        logger.info(
            "Ollama response received (content=%d chars, tool_calls=%d)",
            len(content), len(tool_calls)
        )
        return {
            "content": content,
            "tool_calls": tool_calls,
            "raw": data
        }
    except requests.exceptions.Timeout as e:
        logger.error("Ollama API request timed out after %d seconds", request_timeout)
        raise Exception("LLM request timed out. The model might be loading for the first time.") from e
    except requests.exceptions.ConnectionError as e:
        logger.error("Cannot connect to Ollama API: %s", e)
        raise Exception("Cannot connect to Ollama server. Is it running?") from e
    except requests.exceptions.HTTPError as e:
        logger.error("HTTP error from Ollama API: %s", e)
        # HTTPError.response can be None; guard so we report the real error
        # instead of an AttributeError from this handler.
        if e.response is not None and e.response.status_code == 404:
            raise Exception(f"Model '{model}' not found. Try: ollama pull {model}") from e
        raise Exception(f"Ollama API error: {e}") from e
    except ValueError as e:
        # resp.json() raises ValueError on a non-JSON body
        logger.error("Invalid response from Ollama API: %s", e)
        raise
    except Exception as e:
        logger.error("Unexpected error calling Ollama API: %s", e, exc_info=True)
        raise
def list_models() -> List[str]:
    """
    List available Ollama models via the /api/tags endpoint.

    Best-effort: any failure (network, HTTP, bad JSON) is logged and an
    empty list is returned instead of raising.

    Returns:
        List of model names (empty on error or when no models are installed)
    """
    url = f"{LLM_API_URL}/api/tags"
    try:
        # SECURITY: verify=False disables TLS validation (dev-only; see module top).
        resp = requests.get(url, timeout=10, verify=False)
        resp.raise_for_status()
        data = resp.json()
        # /api/tags returns {"models": [{"name": ...}, ...]}; a missing key
        # yields [] rather than a KeyError.
        models = [model["name"] for model in data.get("models", [])]
        logger.info("Available models: %s", models)
        return models
    except Exception as e:
        # Deliberate broad catch: callers rely on [] instead of an exception.
        logger.error("Error listing models: %s", e)
        return []
# Make functions available for import
# Explicit public API: only these two names are exported via `from ... import *`;
# the module-level config constants stay internal.
__all__ = ['chat_completion', 'list_models']