Initial commit
This commit is contained in:
41
app/routers/chunking.py
Normal file
41
app/routers/chunking.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import Optional
|
||||
|
||||
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    separator: str = "\n"
) -> list[dict]:
    """Split *text* into overlapping chunks along separator boundaries.

    Paragraphs (the pieces between occurrences of *separator*) are packed
    greedily into a chunk until adding the next paragraph would make the
    chunk span more than *chunk_size* characters. Every chunk is a
    contiguous slice of *text*, so its reported offset is exact even when
    the same content occurs several times.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk. A single
            paragraph longer than this is kept intact, and the overlap
            carried into a chunk may push it past the limit.
        overlap: Number of trailing characters of the previous chunk that
            are repeated at the start of the next one. Values <= 0 disable
            overlapping.
        separator: Boundary string at which the text may be split.

    Returns:
        A list of dicts, one per non-empty chunk, with the keys ``"text"``
        (the chunk with surrounding whitespace stripped), ``"index"``
        (0-based position in the result) and ``"start"`` (offset of
        ``"text"`` within *text*).

    Raises:
        ValueError: If *separator* is empty (raised by ``str.split``).
    """
    # A negative overlap would produce a meaningless slice; treat it as "none".
    overlap = max(overlap, 0)
    sep_len = len(separator)
    chunks: list[dict] = []

    def emit(begin: int, end: int) -> None:
        # Record text[begin:end] as a chunk unless it is only whitespace.
        raw = text[begin:end]
        body = raw.strip()
        if not body:
            return
        leading = len(raw) - len(raw.lstrip())
        chunks.append({
            "text": body,
            "index": len(chunks),
            "start": begin + leading,
        })

    chunk_start = 0  # offset where the chunk being built begins
    chunk_end = 0    # exclusive end; equal to chunk_start while empty
    cursor = 0       # offset of the current paragraph within text

    for para in text.split(separator):
        para_end = cursor + len(para)
        if chunk_end > chunk_start and para_end - chunk_start > chunk_size:
            emit(chunk_start, chunk_end)
            if overlap > 0:
                # Re-use the tail of the chunk just emitted; never reach
                # back further than that chunk's own start.
                chunk_start = max(chunk_end - overlap, chunk_start)
            else:
                # No overlap: start cleanly at the paragraph, skipping the
                # separator that precedes it.
                chunk_start = cursor
        chunk_end = para_end
        cursor = para_end + sep_len

    emit(chunk_start, chunk_end)
    return chunks
|
||||
Reference in New Issue
Block a user