Source code for paperrag.llm

"""LLM module for local inference via Ollama or llama.cpp (GGUF / HuggingFace models).

Backend selection rules
-----------------------
* Local ``*.gguf`` file path   → **llama-server** (from ``brew install llama-cpp``)
* HuggingFace repo ID          → download GGUF + **llama-server**
  (e.g. ``Qwen/Qwen3-1.7B-GGUF``)
* All other names              → **Ollama** (unchanged)

Example usage
-------------
::

    paperrag query "What is X?" --model qwen2.5:1.5b         # Ollama
    paperrag query "What is X?" --model Qwen/Qwen3-1.7B-GGUF # HF download + llama-server
    paperrag query "What is X?" --model /path/to/model.gguf  # local GGUF + llama-server
"""

from __future__ import annotations

import atexit
import logging
import os
import re
import shutil
import socket
import subprocess
import tempfile
import time
import urllib.request
from collections.abc import Iterator

from paperrag.config import LLMConfig

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Module-level caches
# ---------------------------------------------------------------------------

# Ollama OpenAI client cache (one shared client, created lazily on first use)
_client_cache: object | None = None
# Model names for which the Ollama availability check has already run (or was
# made redundant by a successful prewarm), so it is performed at most once.
_model_checked: set[str] = set()

# llama-server process and client caches, keyed by
# (model_path, ctx_size, n_gpu_layers, effective_threads)
_llama_server_procs: dict[tuple, subprocess.Popen] = {}
_llama_server_clients: dict[tuple, object] = {}

# Seconds to wait for a llama-server process to exit after terminate()
# before escalating to kill()
_PROC_WAIT_TIMEOUT = 5


def _cleanup_llama_servers() -> None:
    """Stop every llama-server process this module started.

    Registered with :mod:`atexit`. Each still-running process is asked to
    terminate and given ``_PROC_WAIT_TIMEOUT`` seconds to exit; if anything
    goes wrong along the way it is killed instead. All errors are swallowed on
    purpose: this runs during interpreter shutdown and must never raise.
    """
    # Snapshot the values so the registry can change while we iterate.
    for server in tuple(_llama_server_procs.values()):
        try:
            still_running = server.poll() is None
            if still_running:
                server.terminate()
                server.wait(timeout=_PROC_WAIT_TIMEOUT)
        except BaseException:
            # Graceful shutdown failed (e.g. wait timed out) — force it.
            try:
                if server.poll() is None:
                    server.kill()
            except BaseException:
                pass


atexit.register(_cleanup_llama_servers)

# Maximum characters per context chunk sent to the LLM.
# Set to 2000 to accommodate typical chunk sizes (1000 chars) without truncation
# when ctx_size is 4096+. Prevents loss of important context that causes
# inferior answers compared to llama-server's full-document approach.
_MAX_CHUNK_CHARS = 2000
# Matches a whole line that is nothing but a source footer, case-insensitively,
# e.g. "Source: [1]" or "Sources: [1, 2-3]" (digits, commas, spaces and hyphens
# inside the brackets; surrounding whitespace allowed).
_TRAILING_SOURCE_LINE_RE = re.compile(r"^\s*Sources?:\s*\[[0-9,\s-]+\]\s*$", re.IGNORECASE)

# Suffix appended to the user's configured system prompt for follow-up questions
# answered from conversation history only.
_FOLLOWUP_PROMPT_SUFFIX = (
    " Answer the follow-up question based on the conversation so far. "
    "If the previous conversation does not contain relevant information, say so."
)


def _get_followup_system_prompt(config: "LLMConfig") -> str:
    """Build the system prompt used for history-only follow-up answers.

    The user's configured ``system_prompt`` is kept as the prefix so that any
    tone, language, or constraints it sets still apply; the fixed follow-up
    instruction is appended to it.
    """
    base_prompt = config.system_prompt
    return base_prompt + _FOLLOWUP_PROMPT_SUFFIX


def _build_prompt(question: str, context_chunks: list[str], source_labels: list[int] | None = None) -> str:
    """Assemble the user prompt: labelled context, the question, and instructions.

    Each chunk is prefixed with ``[label]``. Labels come from *source_labels*
    when given (one per chunk), otherwise chunks are numbered from 1. Chunks
    longer than ``_MAX_CHUNK_CHARS`` are cut and suffixed with ``...``.
    """

    def _clip(chunk: str) -> str:
        # Keep the prompt compact by truncating overly long chunks.
        if len(chunk) > _MAX_CHUNK_CHARS:
            return chunk[:_MAX_CHUNK_CHARS] + "..."
        return chunk

    if source_labels:
        labels = [source_labels[idx] for idx in range(len(context_chunks))]
        distinct_labels = sorted(set(source_labels))
    else:
        labels = list(range(1, len(context_chunks) + 1))
        distinct_labels = labels

    context_block = "\n\n---\n\n".join(
        f"[{label}] {_clip(chunk)}" for label, chunk in zip(labels, context_chunks)
    )

    n = len(distinct_labels)
    if n == 1:
        cite_instruction = "Use inline citation [1] within your answer."
    else:
        cite_instruction = (
            f"Use inline citations [1]–[{n}] within your answer. Only cite sources from [1] to [{n}]."
        )

    # More than 3000 characters of (untruncated) context signals rich retrieval
    # or full-document mode, so ask for a more thorough answer.
    has_rich_context = sum(map(len, context_chunks)) > 3000
    answer_style = (
        "Answer thoroughly using ONLY the context. Provide detailed reasoning and cite specific statements."
        if has_rich_context
        else "Answer using ONLY the context."
    )

    sections = (
        f"Context:\n{context_block}\n\n",
        f"Question: {question}\n\n",
        f"{answer_style} {cite_instruction} Do not add a separate 'Source:' or 'Sources:' list at the end.",
    )
    return "".join(sections)


def _strip_trailing_source_footers(text: str) -> str:
    """Drop standalone ``Source: [n]`` footer lines from the end of *text*.

    Trailing blank lines are discarded first; then footer lines (and any blank
    lines that precede them) are peeled off repeatedly. The remaining lines
    are re-joined with ``\\n`` and stripped.
    """
    kept = text.splitlines()

    def _drop_blank_tail() -> None:
        while kept and not kept[-1].strip():
            kept.pop()

    _drop_blank_tail()
    while kept and _TRAILING_SOURCE_LINE_RE.match(kept[-1]):
        kept.pop()
        _drop_blank_tail()

    return "\n".join(kept).strip()


def _sanitize_stream(chunks: Iterator[str]) -> Iterator[str]:
    """Consume the whole stream, strip trailing source footers, yield the result.

    The input is fully buffered, so at most one (non-empty) string is yielded.
    """
    buffered = _strip_trailing_source_footers("".join(chunks))
    if not buffered:
        return
    yield buffered


# Root of the local Ollama HTTP server (used directly for the /api/tags model listing).
_OLLAMA_BASE_URL = "http://localhost:11434"
# Endpoint passed as ``base_url`` to the OpenAI client for Ollama chat completions.
_OLLAMA_API_URL = f"{_OLLAMA_BASE_URL}/v1"


# ---------------------------------------------------------------------------
# Backend detection
# ---------------------------------------------------------------------------


def _is_gguf_model(model_name: str) -> bool:
    """Return True if *model_name* is a local path to a GGUF file (ends with ``.gguf``)."""
    return model_name.lower().endswith(".gguf")


def _is_hf_model(model_name: str) -> bool:
    """Return True if *model_name* is a HuggingFace repo ID (e.g. ``Qwen/Qwen3-1.7B-GGUF``).

    HF repo IDs have the form ``owner/repo`` with no path prefix and no Ollama tag syntax.
    Excluded patterns:

    * Absolute or relative paths: ``/…``, ``./…``, ``~/…``, or any path containing ``..``
    * Ollama namespaced models with a tag: ``library/llama3:latest``, ``org/model:tag``
    """
    if model_name.startswith(("/", "./", "~/")):
        return False
    if ".." in model_name.split("/"):  # catch ../, ../../, etc.
        return False
    if ":" in model_name:  # Ollama tag syntax, e.g. qwen2.5:1.5b or library/model:tag
        return False
    parts = model_name.split("/")
    return len(parts) == 2 and all(parts)


def _is_llama_backend(model_name: str) -> bool:
    """Decide whether *model_name* is served by llama.cpp (``llama-server``).

    * Name ending in ``.gguf``            → **llama-server**
    * HuggingFace repo ID (``org/repo``)  → download GGUF + **llama-server**
    * Anything else (e.g. ``qwen2.5:1.5b``) → **Ollama** (returns False)
    """
    detectors = (_is_gguf_model, _is_hf_model)
    return any(detect(model_name) for detect in detectors)


# ---------------------------------------------------------------------------
# Ollama helpers
# ---------------------------------------------------------------------------


def _check_ollama_model_available(model_name: str) -> bool:
    """Ask the local Ollama server whether *model_name* is installed.

    An exact name match counts, and so does any installed model sharing the
    same base name (the part before ``:``), e.g. ``llama3.2:3b`` matches an
    installed ``llama3.2:1b``.

    Returns False when Ollama answers but the model is absent or the response
    status is not 200. Returns True when the check itself cannot be performed
    (any exception), leaving the real API call to report the problem.
    """
    try:
        import requests

        reply = requests.get(f"{_OLLAMA_BASE_URL}/api/tags", timeout=2)
        if reply.status_code != 200:
            return False

        installed = [entry["name"] for entry in reply.json().get("models", [])]
        if model_name in installed:
            return True

        # Fall back to comparing base names with the ":tag" part removed.
        wanted_base = model_name.split(":")[0]
        return any(name.split(":")[0] == wanted_base for name in installed)
    except Exception:
        # Could not check (network error, Ollama not running, ...): assume the
        # model is available and let the actual request fail with a clearer error.
        return True



[docs]
def prewarm_ollama(config: LLMConfig) -> bool:
    """Load the Ollama model into memory by issuing a tiny 1-token chat request.

    Does nothing and returns False for models handled by the llama-server
    backend. Returns True when the request succeeds and False on any error
    (missing ``openai`` package, Ollama unreachable, ...). On success the model
    is also marked as checked so the first real query skips the availability
    lookup.
    """
    model = config.model_name
    if _is_llama_backend(model):
        return False

    try:
        from openai import OpenAI

        warmup_client = OpenAI(api_key="not-needed", base_url=_OLLAMA_API_URL)
        warmup_client.chat.completions.create(
            model=config.model_name,
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
            stream=False,
            extra_body={"num_ctx": config.ctx_size, "keep_alive": "30m"},
        )
    except Exception:
        return False

    _model_checked.add(config.model_name)
    return True



# ---------------------------------------------------------------------------
# HuggingFace model download
# ---------------------------------------------------------------------------


def _download_hf_gguf(repo_id: str) -> str:
    """Fetch one GGUF file from the HuggingFace repo *repo_id* and return its local path.

    File choice: the first file whose name contains ``Q4_K_M``, then
    ``q4_k_m``, then ``Q4_0``, then ``q4_0``; if none match, the first
    ``.gguf`` file in the listing. The download goes through
    ``huggingface_hub.hf_hub_download``.

    Raises:
        ImportError: ``huggingface-hub`` is not installed.
        RuntimeError: the repository's files could not be listed.
        ValueError: the repository contains no ``.gguf`` files.
    """
    try:
        from huggingface_hub import hf_hub_download, list_repo_files  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "huggingface-hub is required to download HuggingFace models. "
            "Install with: uv pip install huggingface-hub"
        ) from exc

    logger.info("Listing GGUF files in %s ...", repo_id)
    try:
        repo_listing = list(list_repo_files(repo_id))
    except Exception as exc:
        raise RuntimeError(
            f"Failed to list files in HuggingFace repo '{repo_id}': {exc}"
        ) from exc

    candidates = [name for name in repo_listing if name.lower().endswith(".gguf")]
    if not candidates:
        raise ValueError(f"No GGUF files found in HuggingFace repo: {repo_id}")

    # Quantization tags in order of preference; tags are tried one at a time
    # and the first file containing the current tag wins.
    preferred_tags = ("Q4_K_M", "q4_k_m", "Q4_0", "q4_0")
    selected = next(
        (name for tag in preferred_tags for name in candidates if tag in name),
        candidates[0],
    )

    logger.info("Downloading '%s' from %s ...", selected, repo_id)
    downloaded_to = hf_hub_download(repo_id=repo_id, filename=selected)
    logger.info("Model ready at %s", downloaded_to)
    return downloaded_to


def _resolve_model_path(model_name: str) -> str:
    """Turn *model_name* into the path of a GGUF file on disk.

    * ``.gguf`` name → must exist as a file; returned unchanged.
    * HuggingFace repo ID → a GGUF file is downloaded and its path returned.

    Raises ``FileNotFoundError`` for a missing ``.gguf`` file and
    ``ValueError`` for names that are neither form.
    """
    if _is_gguf_model(model_name):
        if os.path.isfile(model_name):
            return model_name
        raise FileNotFoundError(f"GGUF model file not found: {model_name}")

    if not _is_hf_model(model_name):
        raise ValueError(f"Cannot resolve model path for: {model_name!r}")

    return _download_hf_gguf(model_name)


# ---------------------------------------------------------------------------
# llama-server lifecycle
# ---------------------------------------------------------------------------


def _find_llama_server_binary() -> str:
    """Return the path to the ``llama-server`` binary.

    Checks ``PATH`` first, then common Homebrew install locations.

    Raises ``FileNotFoundError`` if not found with an actionable install hint.
    """
    if bin_path := shutil.which("llama-server"):
        return bin_path

    for candidate in (
        "/opt/homebrew/bin/llama-server",            # Homebrew on Apple Silicon
        "/usr/local/bin/llama-server",               # Homebrew on Intel Mac
        "/home/linuxbrew/.linuxbrew/bin/llama-server",  # Linuxbrew
    ):
        if os.path.isfile(candidate):
            return candidate

    raise FileNotFoundError(
        "llama-server not found.\n"
        "Install with:  brew install llama-cpp\n"
        "See also: https://github.com/ggerganov/llama.cpp"
    )


def _find_free_port() -> int:
    """Return a free TCP port on localhost."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]


def _wait_for_llama_server(port: int, timeout: float = 120.0) -> bool:
    """Poll ``http://localhost:{port}/health`` until the server is ready."""
    url = f"http://localhost:{port}/health"
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2):
                return True
        except Exception:
            time.sleep(0.5)
    return False


def _get_or_start_llama_server(model_path: str, ctx_size: int, n_gpu_layers: int, n_threads: int = 0) -> object:
    """Return a cached OpenAI client connected to a running ``llama-server``.

    Servers are cached per ``(model_path, ctx_size, n_gpu_layers,
    effective_threads)``. If a cached server is still running its client is
    returned; if its process has exited, the stale entries are dropped and a
    new server is started on a free port.

    Startup is attempted up to three times. An attempt is retried only when
    the server process exits *by itself* before becoming healthy (for example
    because another process grabbed the port first). If the process is still
    running when the startup timeout expires, it is stopped and the failure is
    reported immediately without retrying.

    Parameters
    ----------
    model_path : str
        Path to the GGUF file passed to ``--model``.
    ctx_size : int
        Value for ``--ctx-size``.
    n_gpu_layers : int
        Value for ``--n-gpu-layers``.
    n_threads : int
        Value for ``--threads``; values ``<= 0`` fall back to
        ``os.cpu_count()`` (or 4 if that is unavailable).

    Returns
    -------
    object
        An ``openai.OpenAI`` client pointed at the server's ``/v1`` endpoint.

    Raises
    ------
    FileNotFoundError
        ``llama-server`` could not be located.
    RuntimeError
        The server did not become healthy (message names the log file).

    Started processes are terminated at interpreter exit by
    :func:`_cleanup_llama_servers`.

    Install ``llama-server`` with: ``brew install llama-cpp``
    """
    from openai import OpenAI

    effective_threads = n_threads if n_threads > 0 else (os.cpu_count() or 4)
    cache_key = (model_path, ctx_size, n_gpu_layers, effective_threads)

    # Re-use existing server if still alive
    if cache_key in _llama_server_clients:
        proc = _llama_server_procs.get(cache_key)
        if proc is None or proc.poll() is None:  # no proc tracked, or still running
            return _llama_server_clients[cache_key]
        # Process died — remove stale entries and fall through to restart
        logger.warning("llama-server process exited unexpectedly; restarting ...")
        del _llama_server_clients[cache_key]
        _llama_server_procs.pop(cache_key, None)

    llama_server = _find_llama_server_binary()
    model_label = os.path.basename(model_path)

    # Retry port selection to guard against the TOCTOU window between
    # _find_free_port() releasing the ephemeral port and llama-server binding it.
    max_port_retries = 3
    server_timeout = 120.0
    # Health polling is done in short slices so that a process which dies
    # during startup is noticed within about a second instead of only after
    # the full startup timeout.
    health_poll_slice = 1.0

    for attempt in range(1, max_port_retries + 1):
        port = _find_free_port()

        cmd = [
            llama_server,
            "--model", model_path,
            "--port", str(port),
            "--ctx-size", str(ctx_size),
            "--n-gpu-layers", str(n_gpu_layers),
            "--threads", str(effective_threads),
            "--parallel", "1",
        ]

        log_path = os.path.join(
            tempfile.gettempdir(), f"paperrag-llama-server-{port}.log"
        )
        logger.info(
            "Starting llama-server for '%s' on port %d (attempt %d/%d, stderr → %s)",
            model_label,
            port,
            attempt,
            max_port_retries,
            log_path,
        )
        logger.info("llama-server cmd: %s", " ".join(cmd))

        # The child inherits the file descriptor, so the parent's handle can be
        # closed as soon as the process has been spawned.
        with open(log_path, "w") as log_fh:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.DEVNULL, stderr=log_fh
            )

        deadline = time.monotonic() + server_timeout
        ready = False
        while proc.poll() is None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            if _wait_for_llama_server(port, timeout=min(health_poll_slice, remaining)):
                # Only trust the health answer if our own process is the one
                # still running behind that port.
                ready = proc.poll() is None
                break

        if ready:
            # Register in the cache — cleanup is handled by _cleanup_llama_servers
            _llama_server_procs[cache_key] = proc
            logger.info("llama-server ready on port %d", port)
            client = OpenAI(api_key="not-needed", base_url=f"http://localhost:{port}/v1")
            _llama_server_clients[cache_key] = client
            return client

        # Record whether the process died by itself BEFORE we signal it: after
        # terminate() the return code reflects our signal and can no longer
        # tell a startup crash apart from a startup timeout.
        exited_on_its_own = proc.poll() is not None
        if not exited_on_its_own:
            proc.terminate()
            try:
                proc.wait(timeout=_PROC_WAIT_TIMEOUT)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
        return_code = proc.returncode

        if exited_on_its_own and attempt < max_port_retries:
            logger.warning(
                "llama-server exited (rc=%d) on port %d; retrying with a new port ...",
                return_code,
                port,
            )
            # Clean up the log file for this failed attempt
            try:
                os.unlink(log_path)
            except OSError:
                pass
            continue

        # Either timed out (model issue) or exhausted retries
        if exited_on_its_own:
            reason = (
                f"llama-server exited with code {return_code} while starting "
                f"on port {port}. "
            )
        else:
            reason = (
                f"llama-server failed to start on port {port} within "
                f"{int(server_timeout)}s. "
            )
        raise RuntimeError(
            reason
            + f"Check server log for details: {log_path}\n"
            "Also verify the model file is valid and llama-server is installed "
            "(brew install llama-cpp)."
        )

    # Should be unreachable (loop always returns or raises), but satisfies type checkers
    raise RuntimeError("llama-server could not be started after retries.")


# ---------------------------------------------------------------------------
# Shared message builder
# ---------------------------------------------------------------------------


def _build_messages(
    question: str,
    context_chunks: list[str],
    model_name: str,
    system_prompt: str,
    source_labels: list[int] | None = None,
    think: bool = False,
    conversation_history: list[dict] | None = None,
) -> list[dict]:
    """Compose the chat message list: system prompt, prior turns, then the new user prompt.

    The user prompt is produced by :func:`_build_prompt`. For model names
    containing ``qwen3`` or ``qwen-3`` (case-insensitive), `` /no_think`` is
    appended unless *think* is true.
    """
    user_prompt = _build_prompt(question, context_chunks, source_labels=source_labels)

    # Qwen3's thinking mode emits a long reasoning chain before the answer,
    # which is unnecessary for RAG Q&A and adds ~30-50s of latency, so it is
    # switched off unless explicitly requested.
    lowered_name = model_name.lower()
    is_qwen3 = any(marker in lowered_name for marker in ("qwen3", "qwen-3"))
    if is_qwen3 and not think:
        user_prompt += " /no_think"

    # Earlier turns (if any) sit between the system prompt and the new question.
    prior_turns = conversation_history if conversation_history else []
    return [
        {"role": "system", "content": system_prompt},
        *prior_turns,
        {"role": "user", "content": user_prompt},
    ]


def _prepare(
    question: str,
    context_chunks: list[str],
    config: LLMConfig,
    source_labels: list[int] | None = None,
    conversation_history: list[dict] | None = None,
) -> tuple:
    """Set up an Ollama request and return ``(client, messages)``.

    Builds the chat messages, creates the shared OpenAI client on first use
    (re-using it afterwards), and — once per model name — logs a warning if
    the model does not appear to be installed in Ollama.
    """
    global _client_cache
    from openai import OpenAI

    messages = _build_messages(
        question,
        context_chunks,
        config.model_name,
        config.system_prompt,
        source_labels=source_labels,
        think=config.think,
        conversation_history=conversation_history,
    )

    if _client_cache is None:
        _client_cache = OpenAI(api_key="not-needed", base_url=_OLLAMA_API_URL)
    client = _client_cache

    # The availability check runs only the first time a model name is seen.
    already_checked = config.model_name in _model_checked
    if not already_checked:
        if not _check_ollama_model_available(config.model_name):
            logger.warning(
                "Model '%s' not found in Ollama. Available models can be listed with: ollama list\n"
                "To pull this model, run: ollama pull %s",
                config.model_name,
                config.model_name,
            )
        _model_checked.add(config.model_name)

    logger.info("Calling Ollama LLM (model=%s, temp=%.2f)", config.model_name, config.temperature)
    return client, messages


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



[docs]
def generate_answer(
    question: str,
    context_chunks: list[str],
    config: LLMConfig | None = None,
    conversation_history: list[dict] | None = None,
) -> str:
    """Produce a complete answer from the configured LLM backend (blocking call).

    Backend selection:

    * HuggingFace repo IDs (``org/repo``) and local ``.gguf`` file paths use
      **llama.cpp** via ``llama-server`` (install: ``brew install llama-cpp``).
    * All other model names delegate to **Ollama**.

    When *context_chunks* is empty no model is called and a fixed
    "no context" message is returned. Trailing ``Source: [n]`` footer lines
    are removed from the model output.

    Parameters
    ----------
    conversation_history : list[dict] | None
        Optional list of previous messages (role/content dicts) to provide
        context for follow-up questions.

    Examples::

        # Ollama (unchanged)
        paperrag query "What is X?" --model qwen2.5:1.5b

        # llama.cpp — download Qwen3 GGUF from HuggingFace automatically
        paperrag query "What is X?" --model Qwen/Qwen3-1.7B-GGUF

        # llama.cpp — use a local GGUF file
        paperrag query "What is X?" --model /path/to/model.gguf
    """
    cfg = config or LLMConfig()

    if not context_chunks:
        return "No context available to answer the question."

    if _is_llama_backend(cfg.model_name):
        gguf_path = _resolve_model_path(cfg.model_name)
        client = _get_or_start_llama_server(gguf_path, cfg.ctx_size, cfg.n_gpu_layers, cfg.n_threads)
        messages = _build_messages(
            question,
            context_chunks,
            cfg.model_name,
            cfg.system_prompt,
            think=cfg.think,
            conversation_history=conversation_history,
        )
        logger.info(
            "Calling llama-server (model=%s, temp=%.2f)", cfg.model_name, cfg.temperature
        )
        completion = client.chat.completions.create(  # type: ignore[union-attr]
            model=os.path.basename(gguf_path),
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
        )
    else:
        try:
            client, messages = _prepare(question, context_chunks, cfg, conversation_history=conversation_history)
        except ImportError:
            raise ImportError(
                "The 'openai' package is required. Install with: uv pip install openai"
            )

        completion = client.chat.completions.create(
            model=cfg.model_name,
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
            extra_body={"num_ctx": cfg.ctx_size, "keep_alive": "30m"},
        )

    raw_text = completion.choices[0].message.content or ""
    return _strip_trailing_source_footers(raw_text)




[docs]
def stream_answer(
    question: str,
    context_chunks: list[str],
    config: LLMConfig | None = None,
    source_files: list[str] | None = None,
    conversation_history: list[dict] | None = None,
) -> Iterator[str]:
    """Request a streamed completion and yield the answer text.

    The streamed pieces are passed through :func:`_sanitize_stream`, which
    buffers them and removes trailing source footers before yielding.

    Backend selection:

    * HuggingFace repo IDs (``org/repo``) and local ``.gguf`` file paths use
      **llama.cpp** via ``llama-server``.
    * All other model names delegate to **Ollama**.

    Parameters
    ----------
    source_files : list[str] | None
        Optional file name per context chunk. Chunks from the same file share
        one citation label; labels are assigned 1, 2, ... in order of first
        appearance.
    conversation_history : list[dict] | None
        Optional list of previous messages (role/content dicts) to provide
        context for follow-up questions.

    Usage::

        for chunk in stream_answer(question, chunks, cfg.llm):
            sys.stdout.write(chunk)
            sys.stdout.flush()
    """

    def _text_deltas(stream):
        # Keep only the events that actually carry text.
        for event in stream:
            piece = event.choices[0].delta.content
            if piece:
                yield piece

    cfg = config or LLMConfig()

    if not context_chunks:
        yield "No context available to answer the question."
        return

    # Per-chunk citation labels: every distinct file gets the next number.
    source_labels: list[int] | None = None
    if source_files:
        label_by_file: dict[str, int] = {}
        source_labels = []
        for path in source_files:
            source_labels.append(label_by_file.setdefault(path, len(label_by_file) + 1))

    if _is_llama_backend(cfg.model_name):
        gguf_path = _resolve_model_path(cfg.model_name)
        client = _get_or_start_llama_server(gguf_path, cfg.ctx_size, cfg.n_gpu_layers, cfg.n_threads)
        messages = _build_messages(
            question,
            context_chunks,
            cfg.model_name,
            cfg.system_prompt,
            source_labels=source_labels,
            think=cfg.think,
            conversation_history=conversation_history,
        )
        logger.info(
            "Calling llama-server streaming (model=%s, temp=%.2f)",
            cfg.model_name,
            cfg.temperature,
        )
        response = client.chat.completions.create(  # type: ignore[union-attr]
            model=os.path.basename(gguf_path),
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
            stream=True,
        )
    else:
        try:
            client, messages = _prepare(
                question,
                context_chunks,
                cfg,
                source_labels=source_labels,
                conversation_history=conversation_history,
            )
        except ImportError:
            raise ImportError(
                "The 'openai' package is required. Install with: uv pip install openai"
            )

        response = client.chat.completions.create(
            model=cfg.model_name,
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
            stream=True,
            extra_body={"num_ctx": cfg.ctx_size, "keep_alive": "30m"},
        )

    yield from _sanitize_stream(_text_deltas(response))




[docs]
def stream_followup(
    question: str,
    conversation_history: list[dict],
    config: LLMConfig | None = None,
) -> Iterator[str]:
    """Answer a follow-up question from conversation history alone (no retrieval context).

    Intended for the case where retrieval found nothing but earlier turns
    exist. The system prompt is the configured one plus a follow-up
    instruction; the output passes through :func:`_sanitize_stream`. With an
    empty history a fixed message is yielded and no model is called.
    """
    global _client_cache

    def _text_deltas(stream):
        # Keep only the events that actually carry text.
        for event in stream:
            piece = event.choices[0].delta.content
            if piece:
                yield piece

    cfg = config or LLMConfig()

    if not conversation_history:
        yield "No conversation history available to answer the question."
        return

    # Same Qwen3 handling as for regular prompts: disable thinking mode
    # unless the configuration asks for it.
    lowered_name = cfg.model_name.lower()
    user_prompt = question
    is_qwen3 = any(marker in lowered_name for marker in ("qwen3", "qwen-3"))
    if is_qwen3 and not cfg.think:
        user_prompt += " /no_think"

    messages: list[dict] = [
        {"role": "system", "content": _get_followup_system_prompt(cfg)},
        *conversation_history,
        {"role": "user", "content": user_prompt},
    ]

    if _is_llama_backend(cfg.model_name):
        gguf_path = _resolve_model_path(cfg.model_name)
        client = _get_or_start_llama_server(gguf_path, cfg.ctx_size, cfg.n_gpu_layers, cfg.n_threads)
        logger.info(
            "Calling llama-server streaming follow-up (model=%s, temp=%.2f)",
            cfg.model_name,
            cfg.temperature,
        )
        response = client.chat.completions.create(  # type: ignore[union-attr]
            model=os.path.basename(gguf_path),
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
            stream=True,
        )
    else:
        try:
            from openai import OpenAI

            if _client_cache is None:
                _client_cache = OpenAI(api_key="not-needed", base_url=_OLLAMA_API_URL)
            client = _client_cache
        except ImportError:
            raise ImportError(
                "The 'openai' package is required. Install with: uv pip install openai"
            )

        logger.info("Calling Ollama LLM follow-up (model=%s, temp=%.2f)", cfg.model_name, cfg.temperature)
        response = client.chat.completions.create(
            model=cfg.model_name,
            messages=messages,
            temperature=cfg.temperature,
            max_tokens=cfg.max_tokens,
            stream=True,
            extra_body={"num_ctx": cfg.ctx_size, "keep_alive": "30m"},
        )

    yield from _sanitize_stream(_text_deltas(response))




[docs]
def describe_llm_error(exc: Exception, model_name: str) -> tuple[str, str | None]:
    """Return ``(short_error, optional_hint)`` for a human-readable LLM error message.

    The hint is a concrete remediation command and is ``None`` when no
    specific action can be suggested.

    llama.cpp backend (as decided by :func:`_is_llama_backend`):

    * ``FileNotFoundError`` mentioning ``llama-server`` → hint to install llama-cpp;
      other ``FileNotFoundError`` (e.g. missing GGUF file) → no hint.
    * ``ImportError`` → hint to install the package that is actually missing:
      ``openai`` when the error refers to it, otherwise ``huggingface-hub``.
    * ``RuntimeError`` mentioning ``llama-server`` → hint to install llama-cpp.
    * Anything else → generic llama.cpp error, no hint.

    Ollama backend:

    * ``openai.APIStatusError`` with status 500 → load-failure message (with
      an ``ollama pull`` hint when the text indicates a corrupted model), or
      a generic server-error message.
    * Other ``APIStatusError`` → status code and API message.
    * Anything else (or ``openai`` not importable) → generic ``LLM error``.
    """
    msg = str(exc)

    if _is_llama_backend(model_name):
        if isinstance(exc, FileNotFoundError):
            # Missing llama-server binary vs missing GGUF file
            if "llama-server" in msg:
                return (msg, "brew install llama-cpp")
            return (msg, None)
        if isinstance(exc, ImportError):
            # Both the OpenAI client and huggingface-hub are imported lazily on
            # this backend; point the user at the one that is really missing.
            missing_module = getattr(exc, "name", None) or ""
            if missing_module.split(".")[0] == "openai" or "openai" in msg.lower():
                return (msg, "uv pip install openai")
            return (msg, "uv pip install huggingface-hub")
        if isinstance(exc, RuntimeError) and "llama-server" in msg:
            return (msg, "brew install llama-cpp")
        return (f"llama.cpp error for '{model_name}': {msg}", None)

    try:
        from openai import APIStatusError
        if isinstance(exc, APIStatusError) and exc.status_code == 500:
            if "missing tensor" in msg or "failed to load model" in msg:
                return (
                    f"Model '{model_name}' failed to load (corrupted download).",
                    f"ollama pull {model_name}",
                )
            return (f"Ollama returned a server error for '{model_name}'.", None)
        if isinstance(exc, APIStatusError):
            return (f"API error {exc.status_code}: {exc.message}", None)
    except ImportError:
        # Without the openai package the error cannot be classified further.
        pass
    return (f"LLM error: {msg}", None)