More datatypes

2026-04-29 09:11:46 +00:00
parent ef55253cbd
commit 965e900743
7 changed files with 519 additions and 51 deletions
@@ -7,6 +7,12 @@ import pypdf
 import docx
 import io
 from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
+from app.utils.image_processor import (
+    image_to_text,
+    is_image,
+    validate_vision_model,
+    SUPPORTED_IMAGE_FORMATS,
+    VISION_MODEL,
+    DEFAULT_PROMPT
+)
 from pydantic import BaseModel, Field
 from typing import Optional
 from app.auth import verify_api_key
@@ -14,6 +20,12 @@ from app.database import get_db
 from app.utils.stats import track_usage
 from app.utils.chunking import chunk_text

+# Space-separated list of every accepted upload extension: the document
+# formats spelled out here plus all entries of SUPPORTED_IMAGE_FORMATS.
+# It is interpolated into the "unsupported format" error message.
+SUPPORTED_FORMATS = (
+    ".txt .md .pdf .docx .xlsx .csv "
+    ".pptx .html .htm .msg .eml .json "
+    + " ".join(SUPPORTED_IMAGE_FORMATS)
+)
+
+# Router that the endpoints of this module are registered on.
 router = APIRouter()
+# Module-level logger named after this module.
 logger = logging.getLogger(__name__)

@@ -691,59 +703,169 @@ async def rag(
 async def upload_file(
    store_id:      str,
    file:          UploadFile = File(...),
-    chunk_size:    int = Form(default=512),
-    chunk_overlap: int = Form(default=50),
+    chunk_size:    int  = Form(default=512),
+    chunk_overlap: int  = Form(default=50),
+    vision_prompt: str  = Form(default=DEFAULT_PROMPT),
+    vision_model:  str  = Form(default=None),
    user:          dict = Depends(verify_api_key),
    db=Depends(get_db)
 ):
-    """Datei hochladen, chunken und in Vector Store speichern"""
-    start = time.time()
+    start    = time.time()
    await _check_access(db, store_id, user["user_id"])

    content  = await file.read()
    filename = file.filename.lower()

    try:
-        if filename.endswith(".pdf"):
-            pdf  = pypdf.PdfReader(io.BytesIO(content))
-            text = "\n".join(
+        if is_image(filename):
+            # Modell validieren falls angegeben
+            if vision_model:
+                await validate_vision_model(vision_model, user["token"])
+                use_model = vision_model
+            else:
+                use_model = None
+
+            text   = await image_to_text(
+                content=content,
+                filename=filename,
+                token=user["token"],
+                model=use_model,
+                prompt=vision_prompt
+            )
+            chunks = [{"text": text, "index": 0, "start": 0}]
+        elif filename.endswith((".txt", ".md")):
+            text   = content.decode("utf-8")
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".pdf"):
+            import pypdf, io
+            pdf    = pypdf.PdfReader(io.BytesIO(content))
+            text   = "\n".join(
                page.extract_text()
                for page in pdf.pages
                if page.extract_text()
            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

        elif filename.endswith(".docx"):
-            doc  = docx.Document(io.BytesIO(content))
-            text = "\n".join(
-                p.text for p in doc.paragraphs if p.text.strip()
+            import docx, io
+            doc    = docx.Document(io.BytesIO(content))
+            text   = "\n".join(
+                p.text for p in doc.paragraphs
+                if p.text.strip()
            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

-        elif filename.endswith(".txt"):
-            text = content.decode("utf-8")
+        elif filename.endswith(".xlsx"):
+            import openpyxl, io
+            wb     = openpyxl.load_workbook(io.BytesIO(content))
+            lines  = []
+            for sheet in wb.worksheets:
+                lines.append(f"=== Tabelle: {sheet.title} ===")
+                for row in sheet.iter_rows(values_only=True):
+                    if any(cell is not None for cell in row):
+                        lines.append(
+                            " | ".join(
+                                str(c) for c in row if c is not None
+                            )
+                        )
+            text   = "\n".join(lines)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

-        elif filename.endswith(".md"):
-            text = content.decode("utf-8")
+        elif filename.endswith(".csv"):
+            import csv, io
+            reader = csv.reader(
+                io.StringIO(content.decode("utf-8"))
+            )
+            text   = "\n".join(
+                " | ".join(row)
+                for row in reader
+                if any(cell.strip() for cell in row)
+            )
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".pptx"):
+            from pptx import Presentation
+            import io
+            prs    = Presentation(io.BytesIO(content))
+            lines  = []
+            for i, slide in enumerate(prs.slides):
+                lines.append(f"=== Folie {i+1} ===")
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        lines.append(shape.text)
+            text   = "\n".join(lines)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith((".html", ".htm")):
+            from bs4 import BeautifulSoup
+            soup   = BeautifulSoup(content, "html.parser")
+            for tag in soup(["script", "style", "nav", "footer"]):
+                tag.decompose()
+            text   = soup.get_text(separator="\n", strip=True)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".msg"):
+            import extract_msg, io
+            msg    = extract_msg.Message(io.BytesIO(content))
+            text   = "\n".join(filter(None, [
+                f"Von:     {msg.sender}",
+                f"An:      {msg.to}",
+                f"Betreff: {msg.subject}",
+                f"Datum:   {msg.date}",
+                "─" * 40,
+                msg.body
+            ]))
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".eml"):
+            import email
+            msg    = email.message_from_bytes(content)
+            body   = ""
+            if msg.is_multipart():
+                for part in msg.walk():
+                    if part.get_content_type() == "text/plain":
+                        body = part.get_payload(
+                            decode=True
+                        ).decode("utf-8", errors="ignore")
+                        break
+            else:
+                body = msg.get_payload(
+                    decode=True
+                ).decode("utf-8", errors="ignore")
+            text   = "\n".join(filter(None, [
+                f"Von:     {msg.get('From')}",
+                f"An:      {msg.get('To')}",
+                f"Betreff: {msg.get('Subject')}",
+                f"Datum:   {msg.get('Date')}",
+                "─" * 40,
+                body
+            ]))
+            chunks = chunk_text(text, chunk_size, chunk_overlap)
+
+        elif filename.endswith(".json"):
+            import json as jsonlib
+            data   = jsonlib.loads(content.decode("utf-8"))
+            text   = jsonlib.dumps(data, indent=2, ensure_ascii=False)
+            chunks = chunk_text(text, chunk_size, chunk_overlap)

        else:
            raise HTTPException(
                400,
                f"Nicht unterstütztes Format: {file.filename}. "
-                f"Unterstützt: .pdf, .docx, .txt, .md"
+                f"Unterstützt: {SUPPORTED_FORMATS}"
            )

    except HTTPException:
        raise
    except Exception as e:
-        raise HTTPException(422, f"Datei konnte nicht gelesen werden: {e}")
+        raise HTTPException(
+            422,
+            f"Datei konnte nicht gelesen werden: {e}"
+        )

-    if not text.strip():
-        raise HTTPException(422, "Datei enthaelt keinen Text")
-
-    chunks = chunk_text(
-        text=text,
-        chunk_size=chunk_size,
-        overlap=chunk_overlap
-    )
+    if not any(c["text"].strip() for c in chunks):
+        raise HTTPException(422, "Datei enthält keinen Text")

    ids    = []
    failed = 0
@@ -752,14 +874,16 @@ async def upload_file(
        try:
            embedding = await _embed(chunk["text"], user["token"])
            doc_id    = await db.fetchval(
-                """INSERT INTO documents (store_id, content, metadata, embedding)
+                """INSERT INTO documents
+                   (store_id, content, metadata, embedding)
                   VALUES ($1, $2, $3, $4::vector) RETURNING id""",
                store_id,
                chunk["text"],
                json.dumps({
-                    "source":   file.filename,
-                    "chunk":    chunk["index"],
-                    "start":    chunk.get("start", 0),
+                    "source":  file.filename,
+                    "type":    "image" if is_image(filename) else "document",
+                    "chunk":   chunk["index"],
+                    "start":   chunk.get("start", 0),
                }),
                str(embedding)
            )
@@ -778,6 +902,7 @@ async def upload_file(
    return {
        "object":   "vector_store.file_batch",
        "filename": file.filename,
+        "type":     "image" if is_image(filename) else "document",
        "counts": {
            "completed": len(ids),
            "failed":    failed,
@@ -785,3 +910,27 @@ async def upload_file(
        },
        "ids": ids
    }
+
+@router.get("/vision/models")
+async def list_vision_models(
+    user: dict = Depends(verify_api_key),
+):
+    """List all models that are flagged as vision-capable.
+
+    The ``user`` dependency only enforces API-key verification; its value
+    is not used. Returns a list-style payload whose ``data`` entries carry
+    the model id and a ``default`` flag that is true for the model equal
+    to ``VISION_MODEL``.
+    """
+    entries = []
+    for model in await _get_all_models():
+        # Only an explicit ``True`` qualifies; other truthy values do not.
+        if model.get("supports_vision") is not True:
+            continue
+        entries.append({
+            "id":       model["id"],
+            "object":   "model",
+            "owned_by": "system",
+            "default":  model["id"] == VISION_MODEL,
+        })
+
+    return {
+        "object":  "list",
+        "default": VISION_MODEL,
+        "data":    entries
+    }
@@ -0,0 +1,153 @@
+import base64
+import httpx
+import os
+import logging
+from fastapi import HTTPException
+
+# Module-level logger named after this module.
+logger       = logging.getLogger(__name__)
+# Base URL that the request paths in this module are appended to; read
+# from LITELLM_PROXY_URL with a fallback default.
+LITELLM_URL  = os.getenv("LITELLM_PROXY_URL", "http://litellm:4000")
+# Model used by image_to_text when the caller passes no model; read from
+# the VISION_MODEL environment variable with a fallback default.
+VISION_MODEL = os.getenv("VISION_MODEL", "openai/gpt-4o-mini")
+
+# File extensions (with leading dot) that is_image treats as images.
+# NOTE(review): ".tiff" is listed but ".tif" is not - confirm intended.
+SUPPORTED_IMAGE_FORMATS = [
+    ".jpg", ".jpeg",
+    ".png",
+    ".gif",
+    ".webp",
+    ".tiff"
+]
+
+# Maps a bare extension (no dot) to the MIME type placed in the data URL
+# built by image_to_text; extensions missing here fall back to image/jpeg
+# there.
+MIME_TYPES = {
+    "jpg":  "image/jpeg",
+    "jpeg": "image/jpeg",
+    "png":  "image/png",
+    "gif":  "image/gif",
+    "webp": "image/webp",
+    "tiff": "image/tiff"
+}
+
+# Default instruction sent along with the image (German): describe the
+# image in detail, reproduce any text completely, explain the data of
+# diagrams/charts, describe screenshots, and answer in German.
+DEFAULT_PROMPT = (
+    "Beschreibe den Inhalt dieses Bildes detailliert. "
+    "Falls Text vorhanden ist, gib ihn vollstaendig wieder. "
+    "Falls es ein Diagramm oder Chart ist, erklaere die Daten. "
+    "Falls es ein Screenshot ist, beschreibe was zu sehen ist. "
+    "Antworte auf Deutsch."
+)
+
+
+def is_image(filename: str) -> bool:
+    """Return True if *filename* ends with a supported image extension.
+
+    The check is case-insensitive; the accepted suffixes are the entries
+    of ``SUPPORTED_IMAGE_FORMATS``.
+    """
+    lowered = filename.lower()
+    # str.endswith accepts a tuple of suffixes and matches any of them.
+    return lowered.endswith(tuple(SUPPORTED_IMAGE_FORMATS))
+
+
+async def image_to_text(
+    content:  bytes,
+    filename: str,
+    token:    str,
+    model:    str = None,
+    prompt:   str = DEFAULT_PROMPT
+) -> str:
+    """Convert an image to text by asking a vision LLM to describe it.
+
+    The image is base64-encoded into a ``data:`` URL and posted together
+    with *prompt* to ``{LITELLM_URL}/chat/completions``.
+
+    Args:
+        content:  Raw image bytes.
+        filename: Name of the file; only its extension is used, to pick
+                  the MIME type (unknown extensions fall back to JPEG).
+        token:    Bearer token sent in the ``Authorization`` header.
+        model:    Model to use; ``None`` or empty selects ``VISION_MODEL``.
+        prompt:   Instruction text sent alongside the image.
+
+    Returns:
+        The text of the first choice's message. An empty string is
+        returned when the model answered with a ``null`` content so that
+        callers always receive a ``str``.
+
+    Raises:
+        HTTPException: 503 if the proxy cannot be reached, 502 if it
+            answers with a non-200 status or an unexpected body.
+    """
+    use_model = model or VISION_MODEL
+    ext       = filename.lower().split(".")[-1]
+    mime_type = MIME_TYPES.get(ext, "image/jpeg")
+    image_b64 = base64.b64encode(content).decode("utf-8")
+
+    payload = {
+        "model": use_model,
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{mime_type};base64,{image_b64}"
+                        }
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 2048
+    }
+
+    async with httpx.AsyncClient() as client:
+        try:
+            resp = await client.post(
+                f"{LITELLM_URL}/chat/completions",
+                headers={
+                    "Authorization": f"Bearer {token}",
+                    "Content-Type":  "application/json"
+                },
+                json=payload,
+                timeout=60.0
+            )
+        except httpx.RequestError as e:
+            # Same mapping as validate_vision_model: a transport failure
+            # is an upstream outage, not a problem with the uploaded file.
+            raise HTTPException(
+                503,
+                f"LiteLLM nicht erreichbar: {e}"
+            ) from e
+
+    if resp.status_code != 200:
+        logger.error(
+            "Vision Fehler: %s - %s", resp.status_code, resp.text
+        )
+        raise HTTPException(
+            502,
+            f"Bild konnte nicht verarbeitet werden: {resp.text}"
+        )
+
+    try:
+        text = resp.json()["choices"][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError) as e:
+        # A 200 response whose body is not the expected chat-completion
+        # shape is reported as a bad gateway instead of a raw traceback.
+        logger.error("Vision Antwort unerwartet: %s", resp.text)
+        raise HTTPException(
+            502,
+            "Bild konnte nicht verarbeitet werden: "
+            "unerwartete Antwort des Vision-Modells"
+        ) from e
+
+    # A null content would violate the declared ``str`` return type.
+    return "" if text is None else text
+
+async def validate_vision_model(
+    model: str,
+    token: str
+) -> str:
+    """Check that *model* exists and supports vision input.
+
+    The model list is fetched from ``{LITELLM_URL}/model_group/info``
+    using the key from the ``LITELLM_MASTER_KEY`` environment variable.
+
+    Args:
+        model: Name of the model group to validate.
+        token: Accepted for interface compatibility; the lookup itself is
+               authenticated with the master key, not with this token.
+
+    Returns:
+        The validated model name, unchanged.
+
+    Raises:
+        HTTPException: 503 if the proxy cannot be reached, 502 if the
+            model list cannot be fetched, 404 if *model* is unknown,
+            400 if *model* does not support vision.
+    """
+    # NOTE(review): if LITELLM_MASTER_KEY is unset this sends
+    # "Bearer None" - confirm whether that should fail fast instead.
+    master_key = os.getenv("LITELLM_MASTER_KEY")
+
+    async with httpx.AsyncClient() as client:
+        try:
+            resp = await client.get(
+                f"{LITELLM_URL}/model_group/info",
+                headers={"Authorization": f"Bearer {master_key}"},
+                timeout=10.0
+            )
+        except httpx.RequestError as e:
+            raise HTTPException(503, f"LiteLLM nicht erreichbar: {e}")
+
+    if resp.status_code != 200:
+        raise HTTPException(502, "Modelle konnten nicht abgerufen werden")
+
+    # Parse the body once and reuse it for both lookups below.
+    entries = resp.json().get("data", [])
+    models = {
+        m.get("model_group"): m
+        for m in entries
+    }
+
+    if model not in models:
+        raise HTTPException(404, {
+            "error": {
+                "message": f"Modell '{model}' nicht gefunden",
+                "type":    "invalid_request_error",
+                "code":    "model_not_found"
+            }
+        })
+
+    if not models[model].get("supports_vision"):
+        # Entries without a group name are skipped so that the join
+        # below cannot fail on a None value.
+        vision_models = [
+            m.get("model_group")
+            for m in entries
+            if m.get("supports_vision") and m.get("model_group")
+        ]
+        raise HTTPException(400, {
+            "error": {
+                "message": (
+                    f"Modell '{model}' unterstützt kein Vision. "
+                    f"Verfügbare Vision Modelle: "
+                    f"{', '.join(vision_models)}"
+                ),
+                "type":    "invalid_request_error",
+                "code":    "model_not_supported"
+            }
+        })
+
+    return model