Add support for more upload datatypes (xlsx, csv, pptx, html, msg, eml, json, images)
This commit is contained in:
@@ -7,6 +7,12 @@ import pypdf
|
||||
import docx
|
||||
import io
|
||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
|
||||
from app.utils.image_processor import (
|
||||
image_to_text,
|
||||
is_image,
|
||||
SUPPORTED_IMAGE_FORMATS,
|
||||
DEFAULT_PROMPT
|
||||
)
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
from app.auth import verify_api_key
|
||||
@@ -14,6 +20,12 @@ from app.database import get_db
|
||||
from app.utils.stats import track_usage
|
||||
from app.utils.chunking import chunk_text
|
||||
|
||||
# Space-separated list of upload formats accepted by the upload endpoint.
# The static document/mail formats are concatenated with the image formats
# provided by the image-processing helper, so the two lists cannot drift apart.
# NOTE(review): assumes SUPPORTED_IMAGE_FORMATS is an iterable of ".ext"
# strings — confirm in app.utils.image_processor.
SUPPORTED_FORMATS = (
    ".txt .md .pdf .docx .xlsx .csv "
    ".pptx .html .htm .msg .eml .json "
    + " ".join(SUPPORTED_IMAGE_FORMATS)
)

# Router for all endpoints defined in this module; mounted by the app factory.
router = APIRouter()
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
||||
@@ -691,59 +703,169 @@ async def rag(
|
||||
async def upload_file(
|
||||
store_id: str,
|
||||
file: UploadFile = File(...),
|
||||
chunk_size: int = Form(default=512),
|
||||
chunk_overlap: int = Form(default=50),
|
||||
chunk_size: int = Form(default=512),
|
||||
chunk_overlap: int = Form(default=50),
|
||||
vision_prompt: str = Form(default=DEFAULT_PROMPT),
|
||||
vision_model: str = Form(default=None),
|
||||
user: dict = Depends(verify_api_key),
|
||||
db=Depends(get_db)
|
||||
):
|
||||
"""Datei hochladen, chunken und in Vector Store speichern"""
|
||||
start = time.time()
|
||||
start = time.time()
|
||||
await _check_access(db, store_id, user["user_id"])
|
||||
|
||||
content = await file.read()
|
||||
filename = file.filename.lower()
|
||||
|
||||
try:
|
||||
if filename.endswith(".pdf"):
|
||||
pdf = pypdf.PdfReader(io.BytesIO(content))
|
||||
text = "\n".join(
|
||||
if is_image(filename):
|
||||
# Modell validieren falls angegeben
|
||||
if vision_model:
|
||||
await validate_vision_model(vision_model, user["token"])
|
||||
use_model = vision_model
|
||||
else:
|
||||
use_model = None
|
||||
|
||||
text = await image_to_text(
|
||||
content=content,
|
||||
filename=filename,
|
||||
token=user["token"],
|
||||
model=use_model,
|
||||
prompt=vision_prompt
|
||||
)
|
||||
chunks = [{"text": text, "index": 0, "start": 0}]
|
||||
elif filename.endswith((".txt", ".md")):
|
||||
text = content.decode("utf-8")
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".pdf"):
|
||||
import pypdf, io
|
||||
pdf = pypdf.PdfReader(io.BytesIO(content))
|
||||
text = "\n".join(
|
||||
page.extract_text()
|
||||
for page in pdf.pages
|
||||
if page.extract_text()
|
||||
)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".docx"):
|
||||
doc = docx.Document(io.BytesIO(content))
|
||||
text = "\n".join(
|
||||
p.text for p in doc.paragraphs if p.text.strip()
|
||||
import docx, io
|
||||
doc = docx.Document(io.BytesIO(content))
|
||||
text = "\n".join(
|
||||
p.text for p in doc.paragraphs
|
||||
if p.text.strip()
|
||||
)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".txt"):
|
||||
text = content.decode("utf-8")
|
||||
elif filename.endswith(".xlsx"):
|
||||
import openpyxl, io
|
||||
wb = openpyxl.load_workbook(io.BytesIO(content))
|
||||
lines = []
|
||||
for sheet in wb.worksheets:
|
||||
lines.append(f"=== Tabelle: {sheet.title} ===")
|
||||
for row in sheet.iter_rows(values_only=True):
|
||||
if any(cell is not None for cell in row):
|
||||
lines.append(
|
||||
" | ".join(
|
||||
str(c) for c in row if c is not None
|
||||
)
|
||||
)
|
||||
text = "\n".join(lines)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".md"):
|
||||
text = content.decode("utf-8")
|
||||
elif filename.endswith(".csv"):
|
||||
import csv, io
|
||||
reader = csv.reader(
|
||||
io.StringIO(content.decode("utf-8"))
|
||||
)
|
||||
text = "\n".join(
|
||||
" | ".join(row)
|
||||
for row in reader
|
||||
if any(cell.strip() for cell in row)
|
||||
)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".pptx"):
|
||||
from pptx import Presentation
|
||||
import io
|
||||
prs = Presentation(io.BytesIO(content))
|
||||
lines = []
|
||||
for i, slide in enumerate(prs.slides):
|
||||
lines.append(f"=== Folie {i+1} ===")
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
lines.append(shape.text)
|
||||
text = "\n".join(lines)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith((".html", ".htm")):
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
for tag in soup(["script", "style", "nav", "footer"]):
|
||||
tag.decompose()
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".msg"):
|
||||
import extract_msg, io
|
||||
msg = extract_msg.Message(io.BytesIO(content))
|
||||
text = "\n".join(filter(None, [
|
||||
f"Von: {msg.sender}",
|
||||
f"An: {msg.to}",
|
||||
f"Betreff: {msg.subject}",
|
||||
f"Datum: {msg.date}",
|
||||
"─" * 40,
|
||||
msg.body
|
||||
]))
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".eml"):
|
||||
import email
|
||||
msg = email.message_from_bytes(content)
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
body = part.get_payload(
|
||||
decode=True
|
||||
).decode("utf-8", errors="ignore")
|
||||
break
|
||||
else:
|
||||
body = msg.get_payload(
|
||||
decode=True
|
||||
).decode("utf-8", errors="ignore")
|
||||
text = "\n".join(filter(None, [
|
||||
f"Von: {msg.get('From')}",
|
||||
f"An: {msg.get('To')}",
|
||||
f"Betreff: {msg.get('Subject')}",
|
||||
f"Datum: {msg.get('Date')}",
|
||||
"─" * 40,
|
||||
body
|
||||
]))
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
elif filename.endswith(".json"):
|
||||
import json as jsonlib
|
||||
data = jsonlib.loads(content.decode("utf-8"))
|
||||
text = jsonlib.dumps(data, indent=2, ensure_ascii=False)
|
||||
chunks = chunk_text(text, chunk_size, chunk_overlap)
|
||||
|
||||
else:
|
||||
raise HTTPException(
|
||||
400,
|
||||
f"Nicht unterstütztes Format: {file.filename}. "
|
||||
f"Unterstützt: .pdf, .docx, .txt, .md"
|
||||
f"Unterstützt: {SUPPORTED_FORMATS}"
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(422, f"Datei konnte nicht gelesen werden: {e}")
|
||||
raise HTTPException(
|
||||
422,
|
||||
f"Datei konnte nicht gelesen werden: {e}"
|
||||
)
|
||||
|
||||
if not text.strip():
|
||||
raise HTTPException(422, "Datei enthaelt keinen Text")
|
||||
|
||||
chunks = chunk_text(
|
||||
text=text,
|
||||
chunk_size=chunk_size,
|
||||
overlap=chunk_overlap
|
||||
)
|
||||
if not any(c["text"].strip() for c in chunks):
|
||||
raise HTTPException(422, "Datei enthält keinen Text")
|
||||
|
||||
ids = []
|
||||
failed = 0
|
||||
@@ -752,14 +874,16 @@ async def upload_file(
|
||||
try:
|
||||
embedding = await _embed(chunk["text"], user["token"])
|
||||
doc_id = await db.fetchval(
|
||||
"""INSERT INTO documents (store_id, content, metadata, embedding)
|
||||
"""INSERT INTO documents
|
||||
(store_id, content, metadata, embedding)
|
||||
VALUES ($1, $2, $3, $4::vector) RETURNING id""",
|
||||
store_id,
|
||||
chunk["text"],
|
||||
json.dumps({
|
||||
"source": file.filename,
|
||||
"chunk": chunk["index"],
|
||||
"start": chunk.get("start", 0),
|
||||
"source": file.filename,
|
||||
"type": "image" if is_image(filename) else "document",
|
||||
"chunk": chunk["index"],
|
||||
"start": chunk.get("start", 0),
|
||||
}),
|
||||
str(embedding)
|
||||
)
|
||||
@@ -778,6 +902,7 @@ async def upload_file(
|
||||
return {
|
||||
"object": "vector_store.file_batch",
|
||||
"filename": file.filename,
|
||||
"type": "image" if is_image(filename) else "document",
|
||||
"counts": {
|
||||
"completed": len(ids),
|
||||
"failed": failed,
|
||||
@@ -785,3 +910,27 @@ async def upload_file(
|
||||
},
|
||||
"ids": ids
|
||||
}
|
||||
|
||||
@router.get("/vision/models")
async def list_vision_models(
    user: dict = Depends(verify_api_key),
):
    """List all available vision-capable models.

    Fetches the full model catalogue and returns only the entries whose
    ``supports_vision`` flag is explicitly ``True``, marking which one is
    the configured default (``VISION_MODEL``).

    Returns:
        dict: OpenAI-style list envelope with ``object``, ``default`` and
        ``data`` (one ``{"id", "object", "owned_by", "default"}`` entry
        per vision model).
    """
    all_models = await _get_all_models()

    vision_models = [
        {
            "id": m["id"],
            "object": "model",
            "owned_by": "system",
            # Flag the server-wide default so clients can preselect it.
            "default": m["id"] == VISION_MODEL,
        }
        for m in all_models
        # `is True` deliberately excludes truthy-but-unset values;
        # models missing the flag are treated as non-vision.
        if m.get("supports_vision") is True
    ]

    return {
        "object": "list",
        "default": VISION_MODEL,
        "data": vision_models,
    }
|
||||
|
||||
Reference in New Issue
Block a user