42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
from typing import Optional
|
|
|
|
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    separator: str = "\n",
) -> list[dict]:
    """Split ``text`` into overlapping chunks along ``separator`` boundaries.

    Paragraphs (the pieces between separators) are packed greedily into a
    chunk until adding the next one would push the chunk past ``chunk_size``
    characters. Every chunk is a contiguous slice of ``text`` with surrounding
    whitespace stripped, so ``start`` is always the exact offset of the chunk
    inside ``text``.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk. This is a
            soft limit: a paragraph is never split, so a single paragraph
            longer than ``chunk_size`` (or a paragraph plus the overlap
            carried over from the previous chunk) yields a larger chunk.
        overlap: Number of trailing characters of the previous chunk that are
            repeated at the beginning of the next chunk. Values ``<= 0``
            disable overlapping.
        separator: Delimiter at which the text may be split cleanly.

    Returns:
        A list of dicts, one per chunk, with the keys ``"text"`` (the chunk
        content), ``"index"`` (consecutive position, starting at 0) and
        ``"start"`` (character offset of the chunk within ``text``).
        Chunks that would contain only whitespace are skipped.

    Raises:
        ValueError: If ``separator`` is empty (raised by ``str.split``).
    """
    chunks: list[dict] = []
    # A non-positive overlap means "no overlap". Slicing with ``[-0:]`` would
    # return the whole string, so zero must be handled explicitly below.
    tail = max(overlap, 0)
    sep_len = len(separator)

    def emit(begin: int, end: int) -> None:
        """Append text[begin:end] as a chunk unless it is only whitespace."""
        raw = text[begin:end]
        body = raw.strip()
        if not body:
            return
        # Shift the offset past the whitespace removed by strip().
        leading = len(raw) - len(raw.lstrip())
        chunks.append(
            {
                "text": body,
                "index": len(chunks),
                "start": begin + leading,
            }
        )

    span_start = 0  # offset where the chunk being built begins
    span_end = 0  # exclusive end of the last paragraph in that chunk
    started = False  # True once at least one paragraph has been consumed
    cursor = 0  # offset of the next paragraph within ``text``

    for para in text.split(separator):
        para_start = cursor
        para_end = para_start + len(para)
        cursor = para_end + sep_len

        if started and para_end - span_start > chunk_size:
            emit(span_start, span_end)
            if tail:
                # Re-use the last ``tail`` characters of the finished chunk,
                # but never reach back before that chunk's own beginning.
                span_start = max(span_end - tail, span_start)
            else:
                span_start = para_start

        span_end = para_end
        started = True

    emit(span_start, span_end)
    return chunks
|