# Source code for aiecs.application.knowledge_graph.builder.text_chunker

# /*---------------------------------------------------------------------------------------------
#  *  Copyright (c) IRETBL Corporation. All rights reserved.
#  *  Licensed under the Apache-2.0. See License.txt in the project root for license information.
#  *--------------------------------------------------------------------------------------------*/
"""
Text Chunker

Splits large texts into manageable chunks for processing.
"""

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class TextChunk:
    """
    One piece of a larger text, together with bookkeeping about its origin

    Attributes:
        text: Content of the chunk
        start_char: Character offset in the original text where the chunk starts
        end_char: Character offset in the original text where the chunk ends
        chunk_index: Zero-based position of the chunk in the produced sequence
        metadata: Free-form extra information; a new empty dict when omitted
    """

    text: str
    start_char: int
    end_char: int
    chunk_index: int
    metadata: Dict[str, Any] = field(default_factory=dict)


class TextChunker:
    """
    Split large texts into smaller chunks

    Strategies:
    - Fixed size chunking (by character count)
    - Sentence-aware chunking (don't break sentences)
    - Paragraph-aware chunking (preserve paragraphs)
    - Overlapping chunks (for context preservation; applied by the
      fixed-size and sentence strategies, not by the paragraph strategy)

    Every chunk reports ``start_char``/``end_char`` as offsets into the text
    that was passed in, and receives its own shallow copy of the supplied
    metadata so that editing one chunk's metadata never affects another
    chunk or the caller's dict.

    Example:
        ```python
        chunker = TextChunker(chunk_size=1000, overlap=100)

        chunks = chunker.chunk_text(long_document)

        for chunk in chunks:
            # Process each chunk separately
            result = await process(chunk.text)
        ```
    """

    # A run of whitespace that directly follows '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")
    # Paragraphs are delimited by a blank line.
    _PARAGRAPH_SEPARATOR = "\n\n"

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 100,
        respect_sentences: bool = True,
        respect_paragraphs: bool = False,
        min_chunk_size: int = 100,
    ):
        """
        Initialize text chunker

        Args:
            chunk_size: Target size for each chunk (in characters)
            overlap: Number of characters to overlap between chunks
            respect_sentences: Try to break at sentence boundaries
            respect_paragraphs: Try to break at paragraph boundaries
                (takes precedence over respect_sentences)
            min_chunk_size: Minimum chunk size. Stored on the instance; none
                of the chunking strategies in this class consults it yet.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.respect_sentences = respect_sentences
        self.respect_paragraphs = respect_paragraphs
        self.min_chunk_size = min_chunk_size

    def chunk_text(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> List["TextChunk"]:
        """
        Split text into chunks

        Args:
            text: Text to chunk
            metadata: Optional metadata to attach to chunks (copied per chunk)

        Returns:
            List of TextChunk objects; empty when text is empty

        Raises:
            ValueError: If fixed-size chunking is selected and the configured
                chunk_size/overlap cannot make progress through the text
        """
        if not text:
            return []

        # Short texts fit in a single chunk regardless of strategy.
        if len(text) <= self.chunk_size:
            return [
                TextChunk(
                    text=text,
                    start_char=0,
                    end_char=len(text),
                    chunk_index=0,
                    metadata=self._copy_metadata(metadata),
                )
            ]

        # Choose chunking strategy
        if self.respect_paragraphs:
            return self._chunk_by_paragraphs(text, metadata)
        if self.respect_sentences:
            return self._chunk_by_sentences(text, metadata)
        return self._chunk_fixed_size(text, metadata)

    @staticmethod
    def _copy_metadata(metadata: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Return an independent shallow copy of metadata (empty dict if none)."""
        return dict(metadata) if metadata else {}

    @staticmethod
    def _trimmed_span(text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Return the offsets of text[start:end] without surrounding whitespace, or None if blank."""
        piece = text[start:end]
        core = piece.strip()
        if not core:
            return None
        lead = len(piece) - len(piece.lstrip())
        begin = start + lead
        return (begin, begin + len(core))

    def _build_chunk(
        self,
        text: str,
        spans: List[Tuple[int, int]],
        separator: str,
        chunk_index: int,
        metadata: Optional[Dict[str, Any]],
    ) -> "TextChunk":
        """Join the given spans of text with separator into one TextChunk covering first-start to last-end."""
        return TextChunk(
            text=separator.join(text[s:e] for s, e in spans),
            start_char=spans[0][0],
            end_char=spans[-1][1],
            chunk_index=chunk_index,
            metadata=self._copy_metadata(metadata),
        )

    def _chunk_fixed_size(self, text: str, metadata: Optional[Dict[str, Any]]) -> List["TextChunk"]:
        """
        Chunk text by fixed size with overlap

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects

        Raises:
            ValueError: If chunk_size is not positive or overlap is not
                smaller than chunk_size (the window could never advance)
        """
        step = self.chunk_size - self.overlap
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                "fixed-size chunking requires chunk_size > 0 and overlap < chunk_size "
                f"(got chunk_size={self.chunk_size}, overlap={self.overlap})"
            )

        chunks: List["TextChunk"] = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(
                TextChunk(
                    text=text[start:end],
                    start_char=start,
                    end_char=end,
                    chunk_index=len(chunks),
                    metadata=self._copy_metadata(metadata),
                )
            )
            if end == text_length:
                # Any later window would lie entirely inside this chunk.
                break
            # Move to next chunk with overlap
            start += step

        return chunks

    def _chunk_by_sentences(self, text: str, metadata: Optional[Dict[str, Any]]) -> List["TextChunk"]:
        """
        Chunk text respecting sentence boundaries

        Sentences are joined with a single space inside a chunk; offsets are
        taken from where the first and last sentence sit in the original text.

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects
        """
        chunks: List["TextChunk"] = []
        pending: List[Tuple[int, int]] = []
        pending_length = 0  # sentence lengths plus one joining space each

        for span in self._sentence_spans(text):
            sent_length = span[1] - span[0]

            # If adding this sentence would exceed chunk_size, finalize first
            if pending and pending_length + sent_length > self.chunk_size:
                chunks.append(self._build_chunk(text, pending, " ", len(chunks), metadata))

                # Start new chunk with overlap (last few sentences)
                carried = len(self._get_overlap_sentences([text[s:e] for s, e in pending]))
                pending = pending[len(pending) - carried:]
                pending_length = sum(e - s + 1 for s, e in pending)

            pending.append(span)
            pending_length += sent_length + 1

        # Add final chunk
        if pending:
            chunks.append(self._build_chunk(text, pending, " ", len(chunks), metadata))

        return chunks

    def _chunk_by_paragraphs(self, text: str, metadata: Optional[Dict[str, Any]]) -> List["TextChunk"]:
        """
        Chunk text respecting paragraph boundaries

        Paragraphs are joined with a blank line inside a chunk; offsets are
        taken from where the first and last paragraph sit in the original
        text. No overlap is applied between paragraph chunks.

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects
        """
        separator = self._PARAGRAPH_SEPARATOR
        chunks: List["TextChunk"] = []
        pending: List[Tuple[int, int]] = []
        pending_length = 0  # paragraph lengths plus one separator each

        for span in self._paragraph_spans(text):
            para_length = span[1] - span[0]

            # If adding this paragraph would exceed chunk_size, finalize first
            if pending and pending_length + para_length > self.chunk_size:
                chunks.append(self._build_chunk(text, pending, separator, len(chunks), metadata))
                pending = []
                pending_length = 0

            pending.append(span)
            pending_length += para_length + len(separator)

        # Add final chunk
        if pending:
            chunks.append(self._build_chunk(text, pending, separator, len(chunks), metadata))

        return chunks

    def _paragraph_spans(self, text: str) -> List[Tuple[int, int]]:
        """Return (start, end) offsets of each non-blank, whitespace-trimmed paragraph in text."""
        separator = self._PARAGRAPH_SEPARATOR
        spans: List[Tuple[int, int]] = []
        cursor = 0
        while True:
            cut = text.find(separator, cursor)
            stop = len(text) if cut == -1 else cut
            span = self._trimmed_span(text, cursor, stop)
            if span is not None:
                spans.append(span)
            if cut == -1:
                return spans
            cursor = cut + len(separator)

    def _sentence_spans(self, text: str) -> List[Tuple[int, int]]:
        """Return (start, end) offsets of each non-blank, whitespace-trimmed sentence in text."""
        spans: List[Tuple[int, int]] = []
        cursor = 0
        for boundary in self._SENTENCE_BOUNDARY.finditer(text):
            span = self._trimmed_span(text, cursor, boundary.start())
            if span is not None:
                spans.append(span)
            cursor = boundary.end()
        tail = self._trimmed_span(text, cursor, len(text))
        if tail is not None:
            spans.append(tail)
        return spans

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences (simple implementation)

        Splits on whitespace that follows a period, question mark or
        exclamation mark. For production, consider using NLTK's
        sent_tokenize or spaCy.

        Args:
            text: Text to split

        Returns:
            List of sentences, stripped of surrounding whitespace, blanks dropped
        """
        return [text[s:e] for s, e in self._sentence_spans(text)]

    def _get_overlap_sentences(self, sentences: List[str]) -> List[str]:
        """
        Get last few sentences for overlap

        Args:
            sentences: List of sentences

        Returns:
            The longest trailing run of sentences whose lengths (plus one
            joining space each) fit within the overlap size, in original order
        """
        if not sentences or self.overlap <= 0:
            return []

        budget = self.overlap
        keep = 0

        # Walk backwards from the end until the next sentence no longer fits
        for sent in reversed(sentences):
            cost = len(sent) + 1
            if cost > budget:
                break
            budget -= cost
            keep += 1

        return sentences[len(sentences) - keep:]