Source code for paperrag.config

"""Configuration module using Pydantic models."""

from __future__ import annotations

import json
import logging
import tomllib
from pathlib import Path
from typing import Literal

from pydantic import AliasChoices, BaseModel, Field, field_validator

_rc_logger = logging.getLogger(__name__)

# Mapping from .paperragrc keys to (config field path, target type).
# A dotted path such as "llm.model_name" names an attribute on a sub-config
# of the top-level config object; an undotted path such as "index_dir" names
# an attribute on the top-level config itself. apply_rc() coerces the raw
# value to the listed type before assigning it.
_RC_KEY_MAP: dict[str, tuple[str, type]] = {
    "model": ("llm.model_name", str),
    "topk": ("retriever.top_k", int),
    "max-tokens": ("llm.max_tokens", int),
    "temperature": ("llm.temperature", float),
    "threshold": ("retriever.score_threshold", float),
    "index-dir": ("index_dir", str),
    "input-dir": ("input_dir", str),
    "ctx-size": ("llm.ctx_size", int),
    "system-prompt": ("llm.system_prompt", str),
    "n-gpu-layers": ("llm.n_gpu_layers", int),
    "n-threads": ("llm.n_threads", int),
    "think": ("llm.think", bool),
    "embed-model": ("embedder.model_name", str),
}


# Named prompt presets for the review command.
# Keys are preset names, values are the full system-prompt text. The
# "default" text is identical to the default value of LLMConfig.system_prompt.
PROMPT_PRESETS: dict[str, str] = {
    "default": (
        "You are a helpful research assistant. "
        "Answer based on the provided context. "
        "If the context does not contain relevant information, say so. "
        "Be concise and cite sources."
    ),
    "reviewer": (
        "You are an expert peer reviewer for a top-tier academic venue. "
        "Assess the paper with scientific rigor. "
        "Address: (1) novelty and significance of the contribution, "
        "(2) soundness of the methodology, "
        "(3) quality and reproducibility of experiments, "
        "(4) clarity of writing and presentation, "
        "(5) limitations and potential weaknesses. "
        "Be specific and constructive. Cite relevant paper sections."
    ),
    "summarizer": (
        "You are a research analyst producing structured paper summaries. "
        "For each answer extract: Research Question, Proposed Method, Key Results, "
        "Limitations, and Broader Impact. Use bullet points. Cite sources."
    ),
    "explainer": (
        "You are a science communicator explaining academic research to a "
        "non-specialist audience. Avoid jargon, use analogies where helpful. "
        "Focus on what was done, why it matters, and what was found. Cite sections."
    ),
}

# Output-token budget for each prompt preset, keyed by the same names as
# PROMPT_PRESETS. "reviewer" and "summarizer" get 2048 tokens; "default" and
# "explainer" stay at 1024, which is also the LLMConfig.max_tokens default.
PRESET_MAX_TOKENS: dict[str, int] = {
    "default": 1024,
    "reviewer": 2048,
    "summarizer": 2048,
    "explainer": 1024,
}



[docs]
def load_rc(path: Path) -> dict:
    """Load a .paperragrc TOML file, returning a flat dict of overrides."""
    if not path.is_file():
        return {}
    try:
        with open(path, "rb") as f:
            return tomllib.load(f)
    except Exception as exc:
        _rc_logger.warning("Failed to parse %s: %s", path, exc)
        return {}




[docs]
def _coerce_rc_value(value: object, expected_type: type) -> object:
    """Convert a raw TOML *value* to *expected_type*.

    Raises:
        TypeError: if *value* is a TOML table or array (dict / list).
        ValueError: if *value* cannot be interpreted as *expected_type*.
    """
    # Tables and arrays never describe a scalar setting. Without this check
    # str() and bool() would accept them silently (str({...}) -> "{...}").
    if isinstance(value, (dict, list)):
        raise TypeError(
            f"expected {expected_type.__name__}, got {type(value).__name__}"
        )

    if expected_type is bool and isinstance(value, str):
        # bool("false") is True, so quoted booleans need explicit parsing.
        text = value.strip().lower()
        if text in ("true", "yes", "on", "1"):
            return True
        if text in ("false", "no", "off", "0"):
            return False
        raise ValueError(f"expected a boolean, got {value!r}")

    return expected_type(value)


def apply_rc(cfg: "PaperRAGConfig", overrides: dict) -> None:
    """Apply .paperragrc overrides to a PaperRAGConfig instance.

    Each key in *overrides* is looked up in ``_RC_KEY_MAP`` to find the
    target attribute path and type. Unknown keys and values that cannot be
    coerced are logged as warnings and skipped; everything else is assigned
    onto *cfg* in place.

    Args:
        cfg: Configuration object to mutate.
        overrides: Mapping of rc-file keys to raw values, e.g. the result of
            ``load_rc``.
    """
    for key, value in overrides.items():
        if key not in _RC_KEY_MAP:
            _rc_logger.warning("Unknown .paperragrc key: %s", key)
            continue
        field_path, expected_type = _RC_KEY_MAP[key]
        try:
            casted = _coerce_rc_value(value, expected_type)
        except (ValueError, TypeError) as exc:
            _rc_logger.warning("Invalid value for %s in .paperragrc: %s", key, exc)
            continue

        # Walk every path segment but the last so that paths of any depth
        # resolve to the object that owns the final attribute.
        *parents, attr = field_path.split(".")
        target = cfg
        for name in parents:
            target = getattr(target, name)
        # NOTE(review): this is a plain attribute assignment; presumably the
        # Field constraints and the input_dir validator are not re-run here
        # unless validate_assignment is enabled on the models - confirm.
        setattr(target, attr, casted)



def _default_input_dir() -> str:
    """Return the default input directory as a string.

    The location is ``Documents/Mendeley Desktop`` underneath the current
    user's home directory.
    """
    mendeley_dir = Path.home().joinpath("Documents", "Mendeley Desktop")
    return str(mendeley_dir)



[docs]
class ParserConfig(BaseModel):
    """Settings that control how PDFs are parsed."""

    # Plain boolean switches with no extra validation attached.
    extract_tables: bool = False
    fallback_to_raw: bool = True

    # Only the three literal strategy names are accepted.
    ocr_mode: Literal["auto", "always", "never"] = Field(
        description=(
            "OCR strategy: 'auto'=detect per PDF (recommended), "
            "'always'=force OCR, 'never'=skip OCR"
        ),
        default="auto",
    )

    # Optional; None means no manifest file is configured.
    manifest_file: str | None = Field(
        description=(
            "CSV manifest with columns: "
            "filename,title,authors,abstract,doi (optional)"
        ),
        default=None,
    )




[docs]
class ChunkerConfig(BaseModel):
    """Chunking configuration.

    Both fields are validated integers: ``chunk_size`` must be at least 100
    and ``chunk_overlap`` must be non-negative.
    """

    # Defaults to 1000; values below 100 are rejected by validation.
    chunk_size: int = Field(default=1000, ge=100)
    # Defaults to 200.
    # NOTE(review): nothing here enforces chunk_overlap < chunk_size - confirm
    # the chunker copes with an overlap that meets or exceeds the chunk size.
    chunk_overlap: int = Field(default=200, ge=0)




[docs]
class EmbedderConfig(BaseModel):
    """Embedding model configuration."""

    # Identifier of the embedding model; the rc key "embed-model" maps here.
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Must be at least 1.
    batch_size: int = Field(default=64, ge=1)
    device: str | None = None  # auto-detect if None
    # NOTE(review): presumably toggles normalisation of the embedding
    # vectors - confirm against the embedder implementation.
    normalize: bool = True
    # NOTE(review): presumably seeds RNGs for reproducible embedding - confirm.
    seed: int = 42




[docs]
class RetrieverConfig(BaseModel):
    """Settings that govern how results are retrieved and ranked."""

    # At least one result must be requested.
    top_k: int = Field(ge=1, default=5)

    # Similarity cut-off, constrained to the closed interval [0, 1].
    score_threshold: float = Field(
        description="Minimum similarity score threshold (0.0 = no filtering)",
        default=0.1,
        ge=0.0,
        le=1.0,
    )

    # MMR switch and its trade-off weight, also constrained to [0, 1].
    use_mmr: bool = Field(
        description="Use Maximal Marginal Relevance for diverse retrieval",
        default=False,
    )
    mmr_lambda: float = Field(
        description="MMR lambda parameter (0=max diversity, 1=max relevance)",
        default=0.5,
        ge=0.0,
        le=1.0,
    )

    # Per-paper cap; must be at least 1.
    max_results_per_paper: int = Field(
        description="Maximum results from same paper (re-ranking)",
        default=2,
        ge=1,
    )




[docs]
class IndexingConfig(BaseModel):
    """Settings for the indexing run: checkpoints, workers, failure policy."""

    # --- Scheduling knobs; each is a non-negative int where 0 is special. ---
    checkpoint_interval: int = Field(
        description="Save index every N PDFs during indexing (0 = no checkpoints)",
        default=50,
        ge=0,
    )
    n_workers: int = Field(
        description="Number of parallel PDF processing workers (0 = auto-detect)",
        default=0,
        ge=0,
    )
    pdf_timeout: int = Field(
        description="Timeout in seconds for processing a single PDF (0 = no timeout)",
        default=300,
        ge=0,
    )

    # --- Memory housekeeping. ---
    enable_gc_per_batch: bool = Field(
        description="Enable garbage collection after each batch",
        default=True,
    )
    log_memory_usage: bool = Field(
        description="Log memory usage during indexing",
        default=False,
    )

    # --- Failure policy. ---
    continue_on_error: bool = Field(
        description="Continue indexing even if individual PDFs fail",
        default=True,
    )
    max_failures: int = Field(
        description="Maximum number of failures before stopping (-1 = unlimited)",
        default=-1,
    )

    def get_n_workers(self) -> int:
        """Resolve the number of PDF worker processes to use.

        A non-zero ``n_workers`` is returned unchanged. When it is 0 the
        count is derived from the machine:

        - CPU bound: ``cpu_count - 1``, never below 1.
        - RAM bound (only when ``psutil`` can be imported): 2 GB of the
          currently available memory is set aside and every worker is
          budgeted 2 GB of the remainder, never below 1.

        The smaller bound wins. Without ``psutil`` only the CPU bound
        applies.
        """
        import multiprocessing

        # Explicit setting: nothing to detect.
        if self.n_workers != 0:
            return self.n_workers

        cpu_bound = max(1, multiprocessing.cpu_count() - 1)

        try:
            import psutil

            free_gb = psutil.virtual_memory().available / (1024**3)
            # (free - 2 GB reserve) split into 2 GB slices, one per worker.
            ram_bound = max(1, int((free_gb - 2) / 2))
            chosen = min(cpu_bound, ram_bound)
            if chosen < cpu_bound:
                logging.getLogger(__name__).info(
                    "Limited workers to %d (from %d) due to RAM constraints (%.1fGB available)",
                    chosen, cpu_bound, free_gb
                )
            return chosen
        except ImportError:
            # No psutil: memory cannot be inspected, so use the CPU bound.
            return cpu_bound





[docs]
class LLMConfig(BaseModel):
    """Settings for the language model used to generate answers."""

    model_config = {"extra": "ignore"}  # tolerate old snapshots with api_base/api_key

    model_name: str = "qwen2.5:1.5b"

    # NOTE(review): this text duplicates PROMPT_PRESETS["default"]; keep the
    # two in sync when editing either one.
    system_prompt: str = (
        "You are a helpful research assistant. "
        "Answer based on the provided context. "
        "If the context does not contain relevant information, say so. "
        "Be concise and cite sources."
    )

    # Sampling / output-length settings; no range validation is attached.
    temperature: float = 0.0
    max_tokens: int = 1024

    # Accepts either "ctx_size" or the alternative key "n_ctx" on input.
    ctx_size: int = Field(
        description=(
            "Context window size. "
            "For llama.cpp (GGUF models via llama-server) this sets --ctx-size. "
            "For Ollama it is forwarded as num_ctx in the extra_body parameter."
        ),
        validation_alias=AliasChoices("ctx_size", "n_ctx"),
        default=4096,
        ge=512,
    )

    n_gpu_layers: int = Field(
        description=(
            "Number of layers to offload to GPU when using the llama.cpp backend (0 = CPU only). "
            "Ollama manages GPU offloading independently via its own configuration; "
            "this field has no effect on the Ollama backend."
        ),
        default=0,
        ge=0,
    )

    n_threads: int = Field(
        description=(
            "Number of CPU threads for the llama.cpp backend (0 = auto-detect via os.cpu_count()). "
            "Has no effect on the Ollama backend."
        ),
        default=0,
        ge=0,
    )

    think: bool = Field(
        description=(
            "Enable thinking/reasoning mode for models that support it (e.g. Qwen3, Qwen3.5). "
            "When False (default), thinking is suppressed via /no_think for supported models."
        ),
        default=False,
    )




[docs]
class PaperRAGConfig(BaseModel):
    """Root configuration object that aggregates every sub-config."""

    input_dir: str = Field(default_factory=_default_input_dir)
    _index_dir: str | None = None  # custom index directory; None = derive from input_dir

    parser: ParserConfig = Field(default_factory=ParserConfig)
    chunker: ChunkerConfig = Field(default_factory=ChunkerConfig)
    embedder: EmbedderConfig = Field(default_factory=EmbedderConfig)
    retriever: RetrieverConfig = Field(default_factory=RetrieverConfig)
    indexing: IndexingConfig = Field(default_factory=IndexingConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)

    @field_validator("input_dir", mode="before")
    @classmethod
    def expand_input_dir(cls, v: str) -> str:
        """Expand a leading ``~`` in ``input_dir``; falsy values pass through."""
        if not v:
            return v
        return str(Path(v).expanduser())

    @property
    def index_dir(self) -> str:
        """Directory that holds the index.

        A custom directory set through the setter takes precedence.
        Otherwise the index lives in ``.paperrag-index`` inside ``input_dir``,
        or next to the file when ``input_dir`` points at a single ``.pdf``.
        """
        custom = self._index_dir
        if custom is not None:
            return custom
        source = Path(self.input_dir)
        # A single-PDF input keeps its index beside the file.
        base = source.parent if source.suffix.lower() == ".pdf" else source
        return str(base / ".paperrag-index")

    @index_dir.setter
    def index_dir(self, value: str) -> None:
        """Store a custom index directory, expanding ``~`` for truthy values."""
        if value:
            self._index_dir = str(Path(value).expanduser())
        else:
            # Falsy values (None, "") are stored unchanged.
            self._index_dir = value

    def snapshot(self) -> dict:
        """Dump the model in JSON mode, for storing alongside an index."""
        return self.model_dump(mode="json")

    def save_snapshot(self, path: Path) -> None:
        """Write the snapshot to *path* as JSON indented by two spaces."""
        serialized = json.dumps(self.snapshot(), indent=2)
        path.write_text(serialized)

    @classmethod
    def load_snapshot(cls, path: Path) -> PaperRAGConfig:
        """Build a config from the JSON snapshot stored at *path*."""
        raw = path.read_text()
        return cls(**json.loads(raw))