More datatypes

This commit is contained in:
root
2026-04-29 09:11:46 +00:00
parent ef55253cbd
commit 965e900743
7 changed files with 519 additions and 51 deletions

View File

@@ -7,6 +7,12 @@ import pypdf
import docx
import io
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
from app.utils.image_processor import (
image_to_text,
is_image,
SUPPORTED_IMAGE_FORMATS,
DEFAULT_PROMPT
)
from pydantic import BaseModel, Field
from typing import Optional
from app.auth import verify_api_key
@@ -14,6 +20,12 @@ from app.database import get_db
from app.utils.stats import track_usage
from app.utils.chunking import chunk_text
# Space-separated list of every accepted upload extension: the document
# formats spelled out here, followed by the image formats exported by
# app.utils.image_processor. Interpolated into the "unsupported format"
# error message of the upload endpoint.
SUPPORTED_FORMATS = (
    ".txt .md .pdf .docx .xlsx .csv .pptx .html .htm .msg .eml .json "
    f"{' '.join(SUPPORTED_IMAGE_FORMATS)}"
)

# Router that the endpoints of this module are registered on.
router = APIRouter()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@@ -691,59 +703,169 @@ async def rag(
async def upload_file(
store_id: str,
file: UploadFile = File(...),
chunk_size: int = Form(default=512),
chunk_overlap: int = Form(default=50),
chunk_size: int = Form(default=512),
chunk_overlap: int = Form(default=50),
vision_prompt: str = Form(default=DEFAULT_PROMPT),
vision_model: str = Form(default=None),
user: dict = Depends(verify_api_key),
db=Depends(get_db)
):
"""Datei hochladen, chunken und in Vector Store speichern"""
start = time.time()
start = time.time()
await _check_access(db, store_id, user["user_id"])
content = await file.read()
filename = file.filename.lower()
try:
if filename.endswith(".pdf"):
pdf = pypdf.PdfReader(io.BytesIO(content))
text = "\n".join(
if is_image(filename):
# Modell validieren falls angegeben
if vision_model:
await validate_vision_model(vision_model, user["token"])
use_model = vision_model
else:
use_model = None
text = await image_to_text(
content=content,
filename=filename,
token=user["token"],
model=use_model,
prompt=vision_prompt
)
chunks = [{"text": text, "index": 0, "start": 0}]
elif filename.endswith((".txt", ".md")):
text = content.decode("utf-8")
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".pdf"):
import pypdf, io
pdf = pypdf.PdfReader(io.BytesIO(content))
text = "\n".join(
page.extract_text()
for page in pdf.pages
if page.extract_text()
)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".docx"):
doc = docx.Document(io.BytesIO(content))
text = "\n".join(
p.text for p in doc.paragraphs if p.text.strip()
import docx, io
doc = docx.Document(io.BytesIO(content))
text = "\n".join(
p.text for p in doc.paragraphs
if p.text.strip()
)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".txt"):
text = content.decode("utf-8")
elif filename.endswith(".xlsx"):
import openpyxl, io
wb = openpyxl.load_workbook(io.BytesIO(content))
lines = []
for sheet in wb.worksheets:
lines.append(f"=== Tabelle: {sheet.title} ===")
for row in sheet.iter_rows(values_only=True):
if any(cell is not None for cell in row):
lines.append(
" | ".join(
str(c) for c in row if c is not None
)
)
text = "\n".join(lines)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".md"):
text = content.decode("utf-8")
elif filename.endswith(".csv"):
import csv, io
reader = csv.reader(
io.StringIO(content.decode("utf-8"))
)
text = "\n".join(
" | ".join(row)
for row in reader
if any(cell.strip() for cell in row)
)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".pptx"):
from pptx import Presentation
import io
prs = Presentation(io.BytesIO(content))
lines = []
for i, slide in enumerate(prs.slides):
lines.append(f"=== Folie {i+1} ===")
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
lines.append(shape.text)
text = "\n".join(lines)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith((".html", ".htm")):
from bs4 import BeautifulSoup
soup = BeautifulSoup(content, "html.parser")
for tag in soup(["script", "style", "nav", "footer"]):
tag.decompose()
text = soup.get_text(separator="\n", strip=True)
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".msg"):
import extract_msg, io
msg = extract_msg.Message(io.BytesIO(content))
text = "\n".join(filter(None, [
f"Von: {msg.sender}",
f"An: {msg.to}",
f"Betreff: {msg.subject}",
f"Datum: {msg.date}",
"" * 40,
msg.body
]))
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".eml"):
import email
msg = email.message_from_bytes(content)
body = ""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(
decode=True
).decode("utf-8", errors="ignore")
break
else:
body = msg.get_payload(
decode=True
).decode("utf-8", errors="ignore")
text = "\n".join(filter(None, [
f"Von: {msg.get('From')}",
f"An: {msg.get('To')}",
f"Betreff: {msg.get('Subject')}",
f"Datum: {msg.get('Date')}",
"" * 40,
body
]))
chunks = chunk_text(text, chunk_size, chunk_overlap)
elif filename.endswith(".json"):
import json as jsonlib
data = jsonlib.loads(content.decode("utf-8"))
text = jsonlib.dumps(data, indent=2, ensure_ascii=False)
chunks = chunk_text(text, chunk_size, chunk_overlap)
else:
raise HTTPException(
400,
f"Nicht unterstütztes Format: {file.filename}. "
f"Unterstützt: .pdf, .docx, .txt, .md"
f"Unterstützt: {SUPPORTED_FORMATS}"
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(422, f"Datei konnte nicht gelesen werden: {e}")
raise HTTPException(
422,
f"Datei konnte nicht gelesen werden: {e}"
)
if not text.strip():
raise HTTPException(422, "Datei enthaelt keinen Text")
chunks = chunk_text(
text=text,
chunk_size=chunk_size,
overlap=chunk_overlap
)
if not any(c["text"].strip() for c in chunks):
raise HTTPException(422, "Datei enthält keinen Text")
ids = []
failed = 0
@@ -752,14 +874,16 @@ async def upload_file(
try:
embedding = await _embed(chunk["text"], user["token"])
doc_id = await db.fetchval(
"""INSERT INTO documents (store_id, content, metadata, embedding)
"""INSERT INTO documents
(store_id, content, metadata, embedding)
VALUES ($1, $2, $3, $4::vector) RETURNING id""",
store_id,
chunk["text"],
json.dumps({
"source": file.filename,
"chunk": chunk["index"],
"start": chunk.get("start", 0),
"source": file.filename,
"type": "image" if is_image(filename) else "document",
"chunk": chunk["index"],
"start": chunk.get("start", 0),
}),
str(embedding)
)
@@ -778,6 +902,7 @@ async def upload_file(
return {
"object": "vector_store.file_batch",
"filename": file.filename,
"type": "image" if is_image(filename) else "document",
"counts": {
"completed": len(ids),
"failed": failed,
@@ -785,3 +910,27 @@ async def upload_file(
},
"ids": ids
}
@router.get("/vision/models")
async def list_vision_models(
    user: dict = Depends(verify_api_key),
):
    """List all available vision models.

    Fetches every model via ``_get_all_models()`` and keeps only the
    entries whose ``supports_vision`` field is exactly ``True``. Each
    returned entry is flagged as the default when its id equals
    ``VISION_MODEL``.

    Args:
        user: Result of the ``verify_api_key`` dependency. It is not read
            in the body; it is declared so the dependency is resolved
            for this route.

    Returns:
        A dict with ``object`` set to ``"list"``, the default model id
        under ``default``, and the filtered model entries under ``data``.
    """
    data = []
    for model in await _get_all_models():
        # Strict identity check: truthy non-bool values do not qualify.
        if model.get("supports_vision") is not True:
            continue
        model_id = model["id"]
        data.append({
            "id": model_id,
            "object": "model",
            "owned_by": "system",
            "default": model_id == VISION_MODEL,
        })

    return {"object": "list", "default": VISION_MODEL, "data": data}