37 lines
895 B
Python
37 lines
895 B
Python
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
) -> list[dict]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    The text is scanned with a window of ``chunk_size`` characters. When a
    window does not reach the end of the text, it is shortened to end just
    after the last sentence terminator (``". "``, ``".\\n"``, ``"! "`` or
    ``"? "``) found in it, provided that terminator lies past the midpoint
    of the window. Each following window starts ``overlap`` characters
    before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per window; must be > 0.
        overlap: Number of characters shared by consecutive windows; must
            be smaller than ``chunk_size``.

    Returns:
        A list of dicts, one per non-blank window, with the keys
        ``"text"`` (the window content with surrounding whitespace
        stripped), ``"index"`` (the running window number; blank windows
        are counted but not emitted, so gaps are possible) and ``"start"``
        (the offset of the unstripped window in ``text``).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``; either would keep the window from
            ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    sentence_ends = (". ", ".\n", "! ", "? ")
    # A sentence cut is accepted only past the window midpoint (so chunks do
    # not get too short) and only if the cut chunk is longer than `overlap`;
    # otherwise the next window would not start after the current one.
    min_cut = max(chunk_size // 2, overlap - 1)

    text_len = len(text)
    chunks = []
    start = 0
    index = 0

    while start < text_len:
        end = start + chunk_size

        if end < text_len:
            window = text[start:end]
            cut = max(window.rfind(sep) for sep in sentence_ends)
            if cut > min_cut:
                # Keep the terminating punctuation, drop what follows it.
                end = start + cut + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append({
                "text": piece,
                "index": index,
                "start": start,
            })

        if end >= text_len:
            # The whole text is covered; stepping back by `overlap` again
            # would only emit a tail already contained in this chunk.
            break

        start = end - overlap
        index += 1

    return chunks
|