36 lines
1.0 KiB
Python
36 lines
1.0 KiB
Python
from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
|
|
import pypdf
|
|
import docx
|
|
import io
|
|
|
|
# Router on which this module's endpoints are registered via @router.post.
router = APIRouter()
|
|
|
|
async def extract_text(file: UploadFile) -> str:
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    The format is selected from the file name's extension, compared
    case-insensitively. The complete upload is read into memory first.

    Args:
        file: The uploaded file to read and convert to text.

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are joined with
        newline characters; TXT content is decoded as UTF-8.

    Raises:
        HTTPException: Status 400 when the extension is not supported
            (including uploads without a file name) or when a ``.txt``
            upload is not valid UTF-8.
    """
    content = await file.read()

    # The file name is optional on an upload; treat a missing name as
    # "no extension" so it falls through to the 400 below instead of
    # failing with AttributeError.
    name = (file.filename or "").lower()

    if name.endswith(".pdf"):
        pdf = pypdf.PdfReader(io.BytesIO(content))
        # Defensive `or ""`: a page that yields no text must not break join.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

    if name.endswith(".docx"):
        doc = docx.Document(io.BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)

    if name.endswith(".txt"):
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # Badly encoded input is a client error, not a server fault.
            raise HTTPException(
                400, f"Datei ist nicht UTF-8-kodiert: {file.filename}"
            ) from exc

    raise HTTPException(400, f"Nicht unterstütztes Format: {file.filename}")
|
|
|
|
|
|
@router.post("/v1/vector_stores/{store_id}/upload")
|
|
async def upload_file(
|
|
store_id: str,
|
|
file: UploadFile = File(...),
|
|
user: dict = Depends(verify_api_key),
|
|
db=Depends(get_db)
|
|
):
|
|
text = await extract_text(file)
|
|
chunks = chunk_text(text)
|