Initial commit

This commit is contained in:
root
2026-04-29 08:17:35 +00:00
commit ef55253cbd
49 changed files with 3073 additions and 0 deletions

41
app/routers/chunking.py Normal file
View File

@@ -0,0 +1,41 @@
from typing import Optional
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    separator: str = "\n",
) -> list[dict]:
    """Split *text* into overlapping chunks along separator boundaries.

    Paragraphs (the pieces between occurrences of *separator*) are packed
    greedily into a window over *text*. When adding the next paragraph
    would push the window past *chunk_size* characters, the window is
    emitted as a chunk and a new window starts *overlap* characters before
    the end of the previous one. Every chunk is therefore a contiguous
    slice of *text*, so its ``start`` offset is exact even when the same
    content occurs more than once.

    Args:
        text: The text to split.
        chunk_size: Soft maximum number of characters per chunk. Splits
            happen only at separator boundaries, so a single paragraph
            longer than this (or an overlap tail plus one paragraph) is
            emitted as one oversized chunk rather than being cut.
        overlap: Number of trailing characters of the previous chunk that
            are repeated at the beginning of the next one. Negative values
            are treated as 0.
        separator: Delimiter at which the text may be split.

    Returns:
        A list of dicts, one per chunk, with the keys ``"text"`` (the chunk
        with surrounding whitespace stripped), ``"index"`` (0-based chunk
        number) and ``"start"`` (offset of ``"text"`` within *text*, such
        that ``text[start:start + len(chunk_text)] == chunk_text``).
        Chunks that contain only whitespace are skipped.

    Raises:
        ValueError: If *separator* is the empty string (raised by
            ``str.split``).
    """
    # A negative overlap has no sensible meaning; treat it as "no overlap".
    overlap = max(overlap, 0)
    chunks: list[dict] = []

    def emit(begin: int, end: int) -> None:
        """Append text[begin:end] as a chunk unless it is only whitespace."""
        raw = text[begin:end]
        stripped = raw.strip()
        if not stripped:
            return
        # Shift the offset past the whitespace that strip() removed so that
        # "start" points at the first character of the stored chunk text.
        leading = len(raw) - len(raw.lstrip())
        chunks.append({
            "text": stripped,
            "index": len(chunks),
            "start": begin + leading,
        })

    window_start = 0  # offset in text where the current window begins
    window_end = 0    # exclusive end offset of the current window
    position = 0      # offset in text where the current paragraph begins

    for para in text.split(separator):
        para_end = position + len(para)
        window_is_filled = window_end > window_start
        if window_is_filled and para_end - window_start > chunk_size:
            emit(window_start, window_end)
            # Keep the last `overlap` characters of the emitted window.
            # Computing the offset explicitly avoids the `s[-0:]` pitfall,
            # where a zero overlap would keep the entire previous chunk.
            window_start = max(window_end - overlap, window_start)
        window_end = para_end
        position = para_end + len(separator)

    emit(window_start, window_end)
    return chunks