Initial commit
This commit is contained in:
36
app/utils/chunking.py
Normal file
36
app/utils/chunking.py
Normal file
@@ -0,0 +1,36 @@
|
||||
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
) -> list[dict]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    The text is walked with a window of ``chunk_size`` characters. When a
    window does not reach the end of the text and a sentence ending
    (``". "``, ``".\\n"``, ``"! "`` or ``"? "``) occurs in its second half,
    the window is cut right after that punctuation mark. Each following
    window starts ``overlap`` characters before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum window length in characters; must be positive.
        overlap: Number of characters shared between consecutive windows;
            must be smaller than ``chunk_size``. A negative value leaves a
            gap between windows instead of an overlap.

    Returns:
        A list of dicts with the keys ``"text"`` (the window content with
        surrounding whitespace stripped), ``"index"`` (the window counter,
        which also advances for skipped whitespace-only windows) and
        ``"start"`` (offset of the window start in ``text``, measured
        before stripping).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``; either would keep the window from
            ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    sentence_endings = (". ", ".\n", "! ", "? ")
    text_len = len(text)
    chunks = []
    start = 0
    index = 0

    while start < text_len:
        end = start + chunk_size
        chunk = text[start:end]

        if end < text_len:
            # Position of the last sentence-ending punctuation in the window
            # (-1 when none of the endings occurs).
            last_period = max(chunk.rfind(ending) for ending in sentence_endings)
            # Cut at the sentence end only if it lies in the second half of
            # the window AND the shortened window is still longer than the
            # overlap; otherwise the next start would not move forward.
            if last_period > chunk_size // 2 and last_period + 1 > overlap:
                end = start + last_period + 1
                chunk = text[start:end]

        stripped = chunk.strip()
        if stripped:
            chunks.append({
                "text": stripped,
                "index": index,
                "start": start,
            })

        # The window already covered the rest of the text; stepping back by
        # the overlap would only emit a suffix of the chunk just produced.
        if end >= text_len:
            break

        start = end - overlap
        index += 1

    return chunks
|
||||
Reference in New Issue
Block a user