Source code for paperrag.chunker

"""Section-aware deterministic chunking module."""

from __future__ import annotations

import hashlib
import logging
from dataclasses import dataclass

from paperrag.config import ChunkerConfig
from paperrag.parser import ParsedPaper

logger = logging.getLogger(__name__)


@dataclass
class Chunk:
    """One piece of paper text bundled with the metadata describing its origin.

    Instances are plain records: they can be converted to a dictionary with
    :meth:`to_dict` and rebuilt from one with :meth:`from_dict`.
    """

    chunk_id: int  # numeric identifier of the chunk
    hash_id: str  # hash string identifying the chunk
    text: str  # the chunk's text content
    paper_title: str  # title of the paper the chunk comes from
    section_name: str  # name of the section the chunk comes from
    file_path: str  # path of the source file
    file_hash: str  # hash of the source file

    def to_dict(self) -> dict:
        """Return the chunk's fields as a plain dictionary keyed by field name."""
        keys = (
            "chunk_id",
            "hash_id",
            "text",
            "paper_title",
            "section_name",
            "file_path",
            "file_hash",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, d: dict) -> Chunk:
        """Build a chunk from a mapping of field names to values.

        The mapping is expanded into keyword arguments for the constructor,
        so its keys must match the dataclass field names.
        """
        return cls(**d)
def _deterministic_hash(text: str, file_hash: str, chunk_id: int) -> str:
    """Return a 16-character hex identifier for a chunk.

    The identifier is the first 16 hex digits of the SHA-256 digest of
    ``"<file_hash>:<chunk_id>:<text>"`` encoded as UTF-8, so identical
    inputs always yield the same value.
    """
    encoded = ":".join((file_hash, str(chunk_id), text)).encode("utf-8")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:16]
def chunk_text(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
) -> list[str]:
    """Split *text* into overlapping windows of characters.

    Windows start every ``chunk_size - chunk_overlap`` characters and are at
    most ``chunk_size`` characters long; the final window may be shorter.
    Deterministic: same input always produces the same output list.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per window; must be positive.
        chunk_overlap: Number of characters shared by consecutive windows;
            must be smaller than ``chunk_size``.

    Returns:
        The windows in order of their start position.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size``. The parameters are validated
            before the text is inspected, so this also applies to empty text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        # A non-positive stride would never advance the window start, so the
        # splitting loop could not terminate.
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )
    if not text:
        return []

    stride = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]
def chunk_paper(paper: ParsedPaper, config: ChunkerConfig | None = None) -> list[Chunk]:
    """Turn a parsed paper into an ordered list of :class:`Chunk` objects.

    Each section's text is split on its own with :func:`chunk_text`, and every
    resulting chunk records the name of the section it came from. Windows
    that are empty after stripping surrounding whitespace are dropped. Chunk
    ids count up from 0 across the whole paper in section order, so the
    output order is deterministic.

    Args:
        paper: The parsed paper to chunk.
        config: Chunking settings; a default ``ChunkerConfig()`` is used when
            omitted.

    Returns:
        All chunks of the paper, in section order.
    """
    cfg = config or ChunkerConfig()
    chunks: list[Chunk] = []

    for section in paper.sections:
        windows = chunk_text(section.text, cfg.chunk_size, cfg.chunk_overlap)
        for window in windows:
            body = window.strip()
            if body:
                # The next id equals the number of chunks kept so far, because
                # ids are only consumed by chunks that are actually appended.
                next_id = len(chunks)
                chunks.append(
                    Chunk(
                        chunk_id=next_id,
                        hash_id=_deterministic_hash(body, paper.file_hash, next_id),
                        text=body,
                        paper_title=paper.title,
                        section_name=section.name,
                        file_path=paper.file_path,
                        file_hash=paper.file_hash,
                    )
                )

    logger.info(
        "Chunked '%s' into %d chunks (sections=%d)",
        paper.title[:60],
        len(chunks),
        len(paper.sections),
    )
    return chunks