Files
2026-04-29 08:17:35 +00:00

42 lines
1.1 KiB
Python

from typing import Optional
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    separator: str = "\n"
) -> list[dict]:
    """Split text into overlapping chunks along separator boundaries.

    Paragraphs (the pieces of ``text`` between occurrences of ``separator``)
    are packed greedily into a chunk until adding the next paragraph would
    push the chunk past ``chunk_size`` characters. The next chunk then starts
    up to ``overlap`` characters before the end of the previous one, so every
    chunk is a contiguous slice of ``text``.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk. A single
            paragraph longer than this is kept whole rather than cut.
        overlap: Number of trailing characters of the previous chunk to
            repeat at the start of the next one. Values <= 0 disable the
            overlap. The overlap is shortened when keeping all of it would
            make the new chunk exceed ``chunk_size``.
        separator: Boundary string at which chunks may be split.

    Returns:
        A list of dicts, one per non-empty chunk, with the keys ``"text"``
        (the chunk with surrounding whitespace stripped), ``"index"``
        (0-based position in the returned list) and ``"start"`` (offset of
        ``"text"`` within ``text``).

    Raises:
        ValueError: If ``separator`` is empty (raised by ``str.split``).
    """
    sep_len = len(separator)
    # A negative overlap would slice from the wrong end; treat it as "none".
    tail = max(overlap, 0)

    # First pass: work purely on offsets into ``text`` so that each chunk is
    # an exact slice and its start position never has to be searched for.
    spans: list[tuple[int, int]] = []
    window_start = window_end = 0
    cursor = 0
    for position, para in enumerate(text.split(separator)):
        para_start = cursor
        para_end = para_start + len(para)
        cursor = para_end + sep_len

        # The first paragraph always opens the first window; afterwards the
        # window is flushed once the next paragraph would overflow it.
        if position and para_end - window_start > chunk_size:
            spans.append((window_start, window_end))
            # Earliest allowed start: inside the previous window, at most
            # ``tail`` characters back, and not so far back that the new
            # chunk would exceed ``chunk_size``.
            candidate = max(
                window_start,
                window_end - tail,
                para_end - chunk_size,
            )
            # No overlap left: start at the paragraph itself so the chunk
            # does not begin with a (partial) separator.
            window_start = candidate if candidate < window_end else para_start
        window_end = para_end
    spans.append((window_start, window_end))

    # Second pass: materialise the chunks, dropping whitespace-only ones.
    chunks: list[dict] = []
    for span_start, span_end in spans:
        raw = text[span_start:span_end]
        stripped = raw.strip()
        if not stripped:
            continue
        leading_ws = len(raw) - len(raw.lstrip())
        chunks.append({
            "text": stripped,
            "index": len(chunks),
            "start": span_start + leading_ws,
        })
    return chunks