# Source code for paperrag.config

"""Configuration module using Pydantic models."""

from __future__ import annotations

import json
import logging
import tomllib
from pathlib import Path
from typing import Literal

from pydantic import AliasChoices, BaseModel, Field, field_validator

# Module-level logger; load_rc() and apply_rc() use it to report .paperragrc
# problems as warnings.
_rc_logger = logging.getLogger(__name__)

# Mapping from .paperragrc keys to config field paths.
# Each value is a ``(field_path, type)`` pair:
#   * ``field_path`` is either an attribute name on PaperRAGConfig
#     (e.g. "index_dir") or "<sub-config>.<attribute>" (e.g. "llm.model_name").
#   * ``type`` is the callable apply_rc() uses to convert the raw value before
#     assigning it.
# Keys that are not listed here are reported as unknown and skipped by apply_rc().
_RC_KEY_MAP: dict[str, tuple[str, type]] = {
    "model": ("llm.model_name", str),
    "topk": ("retriever.top_k", int),
    "max-tokens": ("llm.max_tokens", int),
    "temperature": ("llm.temperature", float),
    "threshold": ("retriever.score_threshold", float),
    "index-dir": ("index_dir", str),
    "input-dir": ("input_dir", str),
    "ctx-size": ("llm.ctx_size", int),
    "system-prompt": ("llm.system_prompt", str),
    "n-gpu-layers": ("llm.n_gpu_layers", int),
    "n-threads": ("llm.n_threads", int),
    "think": ("llm.think", bool),
    "embed-model": ("embedder.model_name", str),
}


# Named prompt presets for the review command, keyed by preset name.
# Each value is a complete system prompt. The "default" entry has the same text
# as the default of LLMConfig.system_prompt; keep the two in sync when editing.
PROMPT_PRESETS: dict[str, str] = {
    "default": (
        "You are a helpful research assistant. "
        "Answer based on the provided context. "
        "If the context does not contain relevant information, say so. "
        "Be concise and cite sources."
    ),
    "reviewer": (
        "You are an expert peer reviewer for a top-tier academic venue. "
        "Assess the paper with scientific rigor. "
        "Address: (1) novelty and significance of the contribution, "
        "(2) soundness of the methodology, "
        "(3) quality and reproducibility of experiments, "
        "(4) clarity of writing and presentation, "
        "(5) limitations and potential weaknesses. "
        "Be specific and constructive. Cite relevant paper sections."
    ),
    "summarizer": (
        "You are a research analyst producing structured paper summaries. "
        "For each answer extract: Research Question, Proposed Method, Key Results, "
        "Limitations, and Broader Impact. Use bullet points. Cite sources."
    ),
    "explainer": (
        "You are a science communicator explaining academic research to a "
        "non-specialist audience. Avoid jargon, use analogies where helpful. "
        "Focus on what was done, why it matters, and what was found. Cite sections."
    ),
}

# Output-token budget for each prompt preset, keyed by the same names as
# PROMPT_PRESETS. "reviewer" and "summarizer" get 2048 tokens; "default" and
# "explainer" stay at 1024, which is also the default of LLMConfig.max_tokens.
PRESET_MAX_TOKENS: dict[str, int] = {
    "default": 1024,
    "reviewer": 2048,
    "summarizer": 2048,
    "explainer": 1024,
}


[docs] def load_rc(path: Path) -> dict: """Load a .paperragrc TOML file, returning a flat dict of overrides.""" if not path.is_file(): return {} try: with open(path, "rb") as f: return tomllib.load(f) except Exception as exc: _rc_logger.warning("Failed to parse %s: %s", path, exc) return {}
# String spellings accepted for boolean .paperragrc values (case-insensitive).
_RC_TRUE_STRINGS = frozenset({"1", "true", "yes", "on"})
_RC_FALSE_STRINGS = frozenset({"0", "false", "no", "off"})


def _coerce_rc_value(value: object, expected_type: type) -> object:
    """Convert a raw .paperragrc value to *expected_type*.

    Strings destined for a ``bool`` field are parsed by content, because
    ``bool("false")`` is ``True``. Every other value is converted by calling
    ``expected_type(value)``.

    Raises:
        ValueError: If a string is not a recognised boolean spelling, or if
            ``expected_type(value)`` rejects the value.
        TypeError: If ``expected_type(value)`` rejects the value's type.
    """
    if expected_type is bool and isinstance(value, str):
        token = value.strip().lower()
        if token in _RC_TRUE_STRINGS:
            return True
        if token in _RC_FALSE_STRINGS:
            return False
        raise ValueError(f"expected a boolean, got {value!r}")
    return expected_type(value)


def apply_rc(cfg: "PaperRAGConfig", overrides: dict) -> None:
    """Apply .paperragrc overrides to a PaperRAGConfig instance, in place.

    Each key of *overrides* is looked up in ``_RC_KEY_MAP`` to find the target
    field path and type. Unknown keys, and values that cannot be converted to
    the expected type, are logged as warnings and skipped; the remaining
    overrides are still applied.

    Args:
        cfg: Configuration object to mutate.
        overrides: Mapping of rc keys to raw values, e.g. from ``load_rc``.

    NOTE(review): values are assigned with ``setattr``. Unless the models
    enable pydantic's ``validate_assignment`` (not visible in this module),
    field constraints such as ``ge``/``le`` are presumably not re-checked
    here - confirm.
    """
    for key, value in overrides.items():
        try:
            field_path, expected_type = _RC_KEY_MAP[key]
        except KeyError:
            _rc_logger.warning("Unknown .paperragrc key: %s", key)
            continue

        try:
            casted = _coerce_rc_value(value, expected_type)
        except (ValueError, TypeError) as exc:
            _rc_logger.warning("Invalid value for %s in .paperragrc: %s", key, exc)
            continue

        if field_path == "input_dir" and casted:
            # The index_dir setter expands "~" itself, and input_dir is
            # expanded when the config is constructed. Do the same for an
            # input_dir assigned from the rc file so "~/papers" resolves.
            casted = str(Path(casted).expanduser())

        parts = field_path.split(".")
        if len(parts) == 2:
            # "<sub-config>.<attribute>": set the attribute on the nested model.
            sub, attr = parts
            setattr(getattr(cfg, sub), attr, casted)
        else:
            setattr(cfg, parts[0], casted)
def _default_input_dir() -> str:
    """Return the default input directory: ``<home>/Documents/Mendeley Desktop``."""
    return str(Path.home().joinpath("Documents", "Mendeley Desktop"))
class ParserConfig(BaseModel):
    """PDF parsing configuration."""

    # Both flags are plain booleans with no extra validation.
    extract_tables: bool = False
    fallback_to_raw: bool = True

    # Restricted to the three literal modes below.
    ocr_mode: Literal["auto", "always", "never"] = Field(
        "auto",
        description=(
            "OCR strategy: 'auto'=detect per PDF (recommended), "
            "'always'=force OCR, 'never'=skip OCR"
        ),
    )

    # None means no manifest file is configured.
    manifest_file: str | None = Field(
        None,
        description=(
            "CSV manifest with columns: "
            "filename,title,authors,abstract,doi (optional)"
        ),
    )
class ChunkerConfig(BaseModel):
    """Chunking configuration."""

    # Values below 100 are rejected by validation.
    chunk_size: int = Field(1000, ge=100)

    # Must be non-negative.
    # NOTE(review): nothing here enforces chunk_overlap < chunk_size - confirm
    # the chunker copes with an overlap that is not smaller than the chunk.
    chunk_overlap: int = Field(200, ge=0)
class EmbedderConfig(BaseModel):
    """Embedding model configuration."""

    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"

    # At least one item per batch.
    batch_size: int = Field(64, ge=1)

    device: str | None = None  # auto-detect if None
    normalize: bool = True
    seed: int = 42
class RetrieverConfig(BaseModel):
    """Retrieval configuration."""

    # At least one result is always requested.
    top_k: int = Field(5, ge=1)

    # Constrained to the closed interval [0.0, 1.0].
    score_threshold: float = Field(
        0.1,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score threshold (0.0 = no filtering)",
    )

    use_mmr: bool = Field(
        False,
        description="Use Maximal Marginal Relevance for diverse retrieval",
    )

    # Constrained to the closed interval [0.0, 1.0].
    mmr_lambda: float = Field(
        0.5,
        ge=0.0,
        le=1.0,
        description="MMR lambda parameter (0=max diversity, 1=max relevance)",
    )

    max_results_per_paper: int = Field(
        2,
        ge=1,
        description="Maximum results from same paper (re-ranking)",
    )
class IndexingConfig(BaseModel):
    """Indexing configuration."""

    checkpoint_interval: int = Field(
        default=50,
        ge=0,
        description="Save index every N PDFs during indexing (0 = no checkpoints)",
    )
    n_workers: int = Field(
        default=0,
        ge=0,
        description="Number of parallel PDF processing workers (0 = auto-detect)",
    )
    pdf_timeout: int = Field(
        default=300,
        ge=0,
        description="Timeout in seconds for processing a single PDF (0 = no timeout)",
    )
    enable_gc_per_batch: bool = Field(
        default=True,
        description="Enable garbage collection after each batch",
    )
    log_memory_usage: bool = Field(
        default=False,
        description="Log memory usage during indexing",
    )
    continue_on_error: bool = Field(
        default=True,
        description="Continue indexing even if individual PDFs fail",
    )
    # No lower bound is enforced here; -1 is the documented "unlimited" value.
    max_failures: int = Field(
        default=-1,
        description="Maximum number of failures before stopping (-1 = unlimited)",
    )

    def get_n_workers(self) -> int:
        """Return the worker count to use, auto-detecting when ``n_workers`` is 0.

        A non-zero ``n_workers`` is returned unchanged. Otherwise the count is
        the smaller of two bounds:

        * CPU bound: ``max(1, cpu_count - 1)``. An undeterminable CPU count is
          treated as one core.
        * RAM bound, to prevent OOM kills: ``max(1, int((available_gb - 2) / 2))``.
          2 GB of the currently available memory is reserved for the base
          system and embedding, and each worker is budgeted about 2 GB for
          peak Docling usage.

        The RAM bound needs ``psutil``. When it cannot be imported, only the
        CPU bound is used.

        Returns:
            A worker count of at least 1 when auto-detecting.
        """
        if self.n_workers != 0:
            return self.n_workers

        import os

        # os.cpu_count() returns None when the count cannot be determined,
        # where multiprocessing.cpu_count() would raise NotImplementedError.
        cpu_workers = max(1, (os.cpu_count() or 1) - 1)

        try:
            import psutil
        except ImportError:
            # psutil not available, fall back to CPU-only calculation
            return cpu_workers

        available_gb = psutil.virtual_memory().available / (1024**3)
        # Reserve 2GB for base system + embedding, rest for workers (2GB each)
        ram_workers = max(1, int((available_gb - 2) / 2))
        workers = min(cpu_workers, ram_workers)

        if workers < cpu_workers:
            logging.getLogger(__name__).info(
                "Limited workers to %d (from %d) due to RAM constraints (%.1fGB available)",
                workers,
                cpu_workers,
                available_gb,
            )
        return workers
class LLMConfig(BaseModel):
    """LLM configuration."""

    model_config = {"extra": "ignore"}  # tolerate old snapshots with api_base/api_key

    model_name: str = "qwen2.5:1.5b"

    # Same text as the "default" entry of PROMPT_PRESETS.
    system_prompt: str = (
        "You are a helpful research assistant. "
        "Answer based on the provided context. "
        "If the context does not contain relevant information, say so. "
        "Be concise and cite sources."
    )

    temperature: float = 0.0
    max_tokens: int = 1024

    # Also accepted under the name "n_ctx" when validating input.
    ctx_size: int = Field(
        4096,
        ge=512,
        validation_alias=AliasChoices("ctx_size", "n_ctx"),
        description=(
            "Context window size. "
            "For llama.cpp (GGUF models via llama-server) this sets --ctx-size. "
            "For Ollama it is forwarded as num_ctx in the extra_body parameter."
        ),
    )

    n_gpu_layers: int = Field(
        0,
        ge=0,
        description=(
            "Number of layers to offload to GPU when using the llama.cpp backend (0 = CPU only). "
            "Ollama manages GPU offloading independently via its own configuration; "
            "this field has no effect on the Ollama backend."
        ),
    )

    n_threads: int = Field(
        0,
        ge=0,
        description=(
            "Number of CPU threads for the llama.cpp backend (0 = auto-detect via os.cpu_count()). "
            "Has no effect on the Ollama backend."
        ),
    )

    think: bool = Field(
        False,
        description=(
            "Enable thinking/reasoning mode for models that support it (e.g. Qwen3, Qwen3.5). "
            "When False (default), thinking is suppressed via /no_think for supported models."
        ),
    )
class PaperRAGConfig(BaseModel):
    """Top-level configuration."""

    input_dir: str = Field(default_factory=_default_input_dir)
    _index_dir: str | None = None  # Private field for custom index directory

    parser: ParserConfig = Field(default_factory=ParserConfig)
    chunker: ChunkerConfig = Field(default_factory=ChunkerConfig)
    embedder: EmbedderConfig = Field(default_factory=EmbedderConfig)
    retriever: RetrieverConfig = Field(default_factory=RetrieverConfig)
    indexing: IndexingConfig = Field(default_factory=IndexingConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)

    @field_validator("input_dir", mode="before")
    @classmethod
    def expand_input_dir(cls, v: str) -> str:
        """Expand a leading ``~`` in ``input_dir``; falsy values pass through unchanged."""
        return str(Path(v).expanduser()) if v else v

    @property
    def index_dir(self) -> str:
        """Return index directory - custom path if set, otherwise derived from input_dir.

        Without a custom path the index lives in ``.paperrag-index`` inside
        ``input_dir``. When ``input_dir`` ends in ``.pdf`` (a single file), it
        lives next to that file instead.
        """
        if self._index_dir is not None:
            return self._index_dir
        input_path = Path(self.input_dir)
        if input_path.suffix.lower() == ".pdf":
            return str(input_path.parent / ".paperrag-index")
        return str(input_path / ".paperrag-index")

    @index_dir.setter
    def index_dir(self, value: str) -> None:
        """Set custom index directory, expanding a leading ``~``.

        Falsy values are stored as given; storing ``None`` restores the
        derived default.
        """
        self._index_dir = str(Path(value).expanduser()) if value else value

    def snapshot(self) -> dict:
        """Return a JSON-serialisable config snapshot for index metadata.

        NOTE(review): a custom index directory is held in the private
        ``_index_dir`` attribute, which is presumably not part of
        ``model_dump`` output - confirm it is meant to be left out of
        snapshots.
        """
        return self.model_dump(mode="json")

    def save_snapshot(self, path: Path) -> None:
        """Write the snapshot to *path* as indented JSON.

        The file is written as UTF-8 explicitly, so the result does not
        depend on the platform's default encoding.
        """
        path.write_text(json.dumps(self.snapshot(), indent=2), encoding="utf-8")

    @classmethod
    def load_snapshot(cls, path: Path) -> PaperRAGConfig:
        """Build a config from a JSON snapshot file.

        The file is read as UTF-8. Files written by ``save_snapshot`` are
        plain ASCII because ``json.dumps`` escapes non-ASCII characters by
        default, so they load unchanged. Hand-edited files containing UTF-8
        text are now decoded correctly on every platform.

        Raises:
            OSError: If *path* cannot be read.
            ValueError: If the file is not valid JSON, or the data fails
                model validation.
        """
        data = json.loads(path.read_text(encoding="utf-8"))
        return cls(**data)