Source code for paperrag.chunker
"""Section-aware deterministic chunking module."""
from __future__ import annotations
import hashlib
import logging
from dataclasses import dataclass
from paperrag.config import ChunkerConfig
from paperrag.parser import ParsedPaper
logger = logging.getLogger(__name__)
[docs]
@dataclass
class Chunk:
    """One piece of chunked paper text plus the metadata that locates it.

    Instances are plain records; ``to_dict`` / ``from_dict`` convert to and
    from a flat dictionary keyed by the field names.
    """

    # Running index of the chunk (``chunk_paper`` numbers chunks from 0).
    chunk_id: int
    # Short content hash identifying the chunk.
    hash_id: str
    # The chunk's text.
    text: str
    # Title of the paper the chunk was taken from.
    paper_title: str
    # Name of the section the chunk was taken from.
    section_name: str
    # Path of the source file.
    file_path: str
    # Hash of the source file.
    file_hash: str

    def to_dict(self) -> dict:
        """Return the chunk as a dict with one entry per field, in field order."""
        field_names = (
            "chunk_id",
            "hash_id",
            "text",
            "paper_title",
            "section_name",
            "file_path",
            "file_hash",
        )
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, d: dict) -> Chunk:
        """Build a chunk from a mapping of field names to values.

        The mapping is expanded directly into the constructor, so it must
        contain exactly the dataclass fields.
        """
        return cls(**d)
def _deterministic_hash(text: str, file_hash: str, chunk_id: int) -> str:
"""Produce a deterministic hash for a chunk."""
payload = f"{file_hash}:{chunk_id}:{text}"
return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]
[docs]
def chunk_text(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
) -> list[str]:
    """Split *text* into overlapping windows of characters.

    Consecutive windows start ``chunk_size - chunk_overlap`` characters
    apart and each holds at most ``chunk_size`` characters. The final
    window may be shorter than ``chunk_size`` and may consist only of
    characters already present at the end of the previous window.

    Deterministic: same input always produces the same output list.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per window; must be
            positive.
        chunk_overlap: Number of characters shared by consecutive windows;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The windows in order of their start offset, or an empty list when
        *text* is empty.

    Raises:
        ValueError: If *text* is non-empty and the window parameters are
            out of range.
    """
    if not text:
        # Empty input yields no chunks regardless of the window settings.
        return []
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        # An overlap >= chunk_size gives a zero or negative step, which would
        # never advance past the end of the text (an endless loop); a negative
        # overlap would silently skip characters between windows.
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )
    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
[docs]
def chunk_paper(paper: ParsedPaper, config: ChunkerConfig | None = None) -> list[Chunk]:
    """Turn a parsed paper into an ordered list of ``Chunk`` records.

    Every section of the paper is split on its own with ``chunk_text``, so
    no chunk spans two sections and each chunk records the name of the
    section it came from. Pieces that are empty after stripping whitespace
    are dropped. Chunk ids are assigned consecutively from 0 across the
    whole paper, in section order, which keeps the output deterministic.

    When *config* is omitted (or falsy) a default ``ChunkerConfig()`` is
    used.
    """
    if not config:
        config = ChunkerConfig()

    result: list[Chunk] = []
    for section in paper.sections:
        for raw_piece in chunk_text(section.text, config.chunk_size, config.chunk_overlap):
            body = raw_piece.strip()
            if body:
                # Ids are dense, so the next id is the number of chunks
                # emitted so far.
                next_id = len(result)
                result.append(
                    Chunk(
                        chunk_id=next_id,
                        hash_id=_deterministic_hash(body, paper.file_hash, next_id),
                        text=body,
                        paper_title=paper.title,
                        section_name=section.name,
                        file_path=paper.file_path,
                        file_hash=paper.file_hash,
                    )
                )

    logger.info(
        "Chunked '%s' into %d chunks (sections=%d)",
        paper.title[:60],
        len(result),
        len(paper.sections),
    )
    return result