Source code for paperrag.embedder
"""Embedding module using sentence-transformers."""
from __future__ import annotations
import logging
from typing import Sequence
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from paperrag.config import EmbedderConfig
logger = logging.getLogger(__name__)
[docs]
class Embedder:
"""Wrapper around a SentenceTransformer model with batched encoding
and deterministic seed control."""
def __init__(self, config: EmbedderConfig | None = None) -> None:
self.config = config or EmbedderConfig()
self._set_seed(self.config.seed)
device = self.config.device or ("cuda" if torch.cuda.is_available() else "cpu")
logger.info(
"Loading embedding model %s on %s",
self.config.model_name,
device,
)
try:
self.model = SentenceTransformer(
self.config.model_name, device=device, local_files_only=True
)
except Exception:
self.model = SentenceTransformer(self.config.model_name, device=device)
self.device = device
self.dimension: int = self.model.get_sentence_embedding_dimension() # type: ignore[assignment]
logger.info("Embedding dimension: %d", self.dimension)
@staticmethod
def _set_seed(seed: int) -> None:
torch.manual_seed(seed)
np.random.seed(seed)
[docs]
def embed(self, texts: Sequence[str]) -> np.ndarray:
"""Encode *texts* and return an (N, D) float32 array.
Uses batched encoding with the configured batch size.
"""
if not texts:
return np.empty((0, self.dimension), dtype=np.float32)
embeddings = self.model.encode(
list(texts),
batch_size=self.config.batch_size,
show_progress_bar=False,
normalize_embeddings=self.config.normalize,
convert_to_numpy=True,
)
arr = np.asarray(embeddings, dtype=np.float32)
if arr.ndim == 1:
arr = arr.reshape(1, -1)
assert arr.shape[1] == self.dimension, (
f"Dimension mismatch: got {arr.shape[1]}, expected {self.dimension}"
)
return arr