More datatypes

2026-04-29 09:11:46 +00:00
parent ef55253cbd
commit 965e900743
7 changed files with 519 additions and 51 deletions
--- a/app/routers/openai_compat.py
+++ b/app/routers/openai_compat.py
@@ -7,6 +7,12 @@ import pypdf
 import docx
 import io
 from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
+from app.utils.image_processor import (
+    image_to_text,
+    is_image,
+    SUPPORTED_IMAGE_FORMATS,
+    DEFAULT_PROMPT
+)
 from pydantic import BaseModel, Field
 from typing import Optional
 from app.auth import verify_api_key
@@ -14,6 +20,12 @@ from app.database import get_db
 from app.utils.stats import track_usage
 from app.utils.chunking import chunk_text

+SUPPORTED_FORMATS = (  # single space-separated string of accepted extensions; interpolated into upload_file's 400 "unsupported format" message
+    ".txt .md .pdf .docx .xlsx .csv "
+    ".pptx .html .htm .msg .eml .json "  # adjacent literals concatenate; the trailing space separates them from the joined part below
+    + " ".join(SUPPORTED_IMAGE_FORMATS)  # imported from app.utils.image_processor; presumably an iterable of extension strings — confirm
+)
+
 router = APIRouter()
 logger = logging.getLogger(__name__)

@@ -691,59 +703,169 @@ async def rag(
 async def upload_file(
    store_id:      str,
    file:          UploadFile = File(...),
-    chunk_size:    int = Form(default=512),
-    chunk_overlap: int = Form(default=50),
+    chunk_size:    int  = Form(default=512),
+    chunk_overlap: int  = Form(default=50),
+    vision_prompt: str  = Form(default=DEFAULT_PROMPT),
+    vision_model:  str  = Form(default=None),
    user:          dict = Depends(verify_api_key),
    db=Depends(get_db)
 ):
-    """Datei hochladen, chunken und in Vector Store speichern"""
-    start = time.time()
+    start    = time.time()
    await _check_access(db, store_id, user["user_id"])

    content  = await file.read()
    filename = file.filename.lower()

    try:
-        if filename.endswith(".pdf"):
-            pdf  = pypdf.PdfReader(io.BytesIO(content))
-            text = "\n".join(
+        if is_image(filename):
+            # Modell validieren falls angegeben
+            if vision_model:
+                await validate_vision_model(vision_model, user["token"])
+                use_model = vision_model
+            else:
+                use_model = None
+
+            text   = await image_to_text(
+                content=content,
+                filename=filename,
+                token=user["token"],
+                model=use_model,
+                prompt=vision_prompt
+            )
+            chunks = [{"text": text, "index": 0, "start": 0}]
+        elif filename.endswith((".txt", ".md")):
+            text   = content.decode("utf-8")
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".pdf"):
+            import pypdf, io
+            pdf    = pypdf.PdfReader(io.BytesIO(content))
+            text   = "\n".join(
                page.extract_text()
                for page in pdf.pages
                if page.extract_text()
            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

        elif filename.endswith(".docx"):
-            doc  = docx.Document(io.BytesIO(content))
-            text = "\n".join(
-                p.text for p in doc.paragraphs if p.text.strip()
+            import docx, io
+            doc    = docx.Document(io.BytesIO(content))
+            text   = "\n".join(
+                p.text for p in doc.paragraphs
+                if p.text.strip()
            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

-        elif filename.endswith(".txt"):
-            text = content.decode("utf-8")
+        elif filename.endswith(".xlsx"):
+            import openpyxl, io
+            wb     = openpyxl.load_workbook(io.BytesIO(content))
+            lines  = []
+            for sheet in wb.worksheets:
+                lines.append(f"=== Tabelle: {sheet.title} ===")
+                for row in sheet.iter_rows(values_only=True):
+                    if any(cell is not None for cell in row):
+                        lines.append(
+                            " | ".join(
+                                str(c) for c in row if c is not None
+                            )
+                        )
+            text   = "\n".join(lines)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

-        elif filename.endswith(".md"):
-            text = content.decode("utf-8")
+        elif filename.endswith(".csv"):
+            import csv, io
+            reader = csv.reader(
+                io.StringIO(content.decode("utf-8"))
+            )
+            text   = "\n".join(
+                " | ".join(row)
+                for row in reader
+                if any(cell.strip() for cell in row)
+            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".pptx"):
+            from pptx import Presentation
+            import io
+            prs    = Presentation(io.BytesIO(content))
+            lines  = []
+            for i, slide in enumerate(prs.slides):
+                lines.append(f"=== Folie {i+1} ===")
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        lines.append(shape.text)
+            text   = "\n".join(lines)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith((".html", ".htm")):
+            from bs4 import BeautifulSoup
+            soup   = BeautifulSoup(content, "html.parser")
+            for tag in soup(["script", "style", "nav", "footer"]):
+                tag.decompose()
+            text   = soup.get_text(separator="\n", strip=True)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".msg"):
+            import extract_msg, io
+            msg    = extract_msg.Message(io.BytesIO(content))
+            text   = "\n".join(filter(None, [
+                f"Von:     {msg.sender}",
+                f"An:      {msg.to}",
+                f"Betreff: {msg.subject}",
+                f"Datum:   {msg.date}",
+                "─" * 40,
+                msg.body
+            ]))
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".eml"):
+            import email
+            msg    = email.message_from_bytes(content)
+            body   = ""
+            if msg.is_multipart():
+                for part in msg.walk():
+                    if part.get_content_type() == "text/plain":
+                        body = part.get_payload(
+                            decode=True
+                        ).decode("utf-8", errors="ignore")
+                        break
+            else:
+                body = msg.get_payload(
+                    decode=True
+                ).decode("utf-8", errors="ignore")
+            text   = "\n".join(filter(None, [
+                f"Von:     {msg.get('From')}",
+                f"An:      {msg.get('To')}",
+                f"Betreff: {msg.get('Subject')}",
+                f"Datum:   {msg.get('Date')}",
+                "─" * 40,
+                body
+            ]))
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".json"):
+            import json as jsonlib
+            data   = jsonlib.loads(content.decode("utf-8"))
+            text   = jsonlib.dumps(data, indent=2, ensure_ascii=False)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

        else:
            raise HTTPException(
                400,
                f"Nicht unterstütztes Format: {file.filename}. "
-                f"Unterstützt: .pdf, .docx, .txt, .md"
+                f"Unterstützt: {SUPPORTED_FORMATS}"
            )

    except HTTPException:
        raise
    except Exception as e:
-        raise HTTPException(422, f"Datei konnte nicht gelesen werden: {e}")
+        raise HTTPException(
+            422,
+            f"Datei konnte nicht gelesen werden: {e}"
+        )

-    if not text.strip():
-        raise HTTPException(422, "Datei enthaelt keinen Text")
-
-    chunks = chunk_text(
-        text=text,
-        chunk_size=chunk_size,
-        overlap=chunk_overlap
-    )
+    if not any(c["text"].strip() for c in chunks):
+        raise HTTPException(422, "Datei enthält keinen Text")

    ids    = []
    failed = 0
@@ -752,14 +874,16 @@ async def upload_file(
        try:
            embedding = await _embed(chunk["text"], user["token"])
            doc_id    = await db.fetchval(
-                """INSERT INTO documents (store_id, content, metadata, embedding)
+                """INSERT INTO documents
+                   (store_id, content, metadata, embedding)
                   VALUES ($1, $2, $3, $4::vector) RETURNING id""",
                store_id,
                chunk["text"],
                json.dumps({
-                    "source":   file.filename,
-                    "chunk":    chunk["index"],
-                    "start":    chunk.get("start", 0),
+                    "source":  file.filename,
+                    "type":    "image" if is_image(filename) else "document",
+                    "chunk":   chunk["index"],
+                    "start":   chunk.get("start", 0),
                }),
                str(embedding)
            )
@@ -778,6 +902,7 @@ async def upload_file(
    return {
        "object":   "vector_store.file_batch",
        "filename": file.filename,
+        "type":     "image" if is_image(filename) else "document",
        "counts": {
            "completed": len(ids),
            "failed":    failed,
@@ -785,3 +910,27 @@ async def upload_file(
        },
        "ids": ids
    }
+
+@router.get("/vision/models")
+async def list_vision_models(
+    user: dict = Depends(verify_api_key),  # resolved via verify_api_key; `user` itself is not read in the body
+):
+    """Return the models flagged `supports_vision`, marking which one equals VISION_MODEL."""
+    all_models = await _get_all_models()  # NOTE(review): `_get_all_models` / `VISION_MODEL` are not among the imports visible in this diff — confirm they exist in the module
+
+    vision_models = [
+        {
+            "id":       m["id"],
+            "object":   "model",
+            "owned_by": "system",
+            "default":  m["id"] == VISION_MODEL,
+        }
+        for m in all_models  # assumes each entry is a dict with an "id" key — TODO confirm
+        if m.get("supports_vision") is True  # identity check: a missing key or a truthy non-bool value is excluded
+    ]
+
+    return {
+        "object":  "list",
+        "default": VISION_MODEL,
+        "data":    vision_models
+    }