first commit

2026-03-06 08:20:07 +01:00
commit cb34aa00af
15 changed files with 3280 additions and 0 deletions
@@ -0,0 +1,246 @@
+import io
+import os
+import asyncio
+import logging
+from pathlib import Path
+
+import fitz  # PyMuPDF
+from PIL import Image
+from pyzbar.pyzbar import decode as decode_qr
+from pypdf import PdfReader, PdfWriter
+import qrcode
+from qrcode.constants import ERROR_CORRECT_H
+
+from app.database import get_settings, add_log_entry
+from app.mail_processor import _connect_smtp, _build_forward_email
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Exact QR payload that identifies a separator sheet: it is encoded into the
+# generated separator PDF and compared against every QR code found in a scan.
+SEPARATOR_QR_CONTENT = "LEXOFFICE-TRENNUNG"
+# Directory path taken from the UPLOAD_DIR environment variable, falling back
+# to /data/uploads when the variable is not set.
+UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads"))
+
+
+def detect_separator_pages(pdf_path: str, progress_callback=None) -> list[int]:
+    """Scan each page for QR codes. Returns list of page indices that are separators."""
+    separator_pages = []
+    doc = fitz.open(pdf_path)
+    total = len(doc)
+
+    for page_num in range(total):
+        if progress_callback:
+            progress_callback("scan", page_num + 1, total)
+
+        page = doc[page_num]
+        # Render page as image at 150 DPI (enough for QR detection)
+        pix = page.get_pixmap(dpi=150)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+        # Scan for QR codes
+        codes = decode_qr(img)
+        for code in codes:
+            try:
+                data = code.data.decode("utf-8")
+            except Exception:
+                continue
+            if data == SEPARATOR_QR_CONTENT:
+                separator_pages.append(page_num)
+                logger.debug(f"Trennseite erkannt auf Seite {page_num + 1}")
+                break
+
+    doc.close()
+    return separator_pages
+
+
+def split_pdf(pdf_path: str, separator_pages: list[int]) -> list[bytes]:
+    """Split PDF at separator pages. Separator pages are excluded from output."""
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+    separator_set = set(separator_pages)
+
+    documents = []
+    current_writer = None
+
+    for page_num in range(total_pages):
+        if page_num in separator_set:
+            # This is a separator page - finalize current document if any
+            if current_writer and len(current_writer.pages) > 0:
+                buf = io.BytesIO()
+                current_writer.write(buf)
+                documents.append(buf.getvalue())
+            current_writer = None
+            continue
+
+        # Regular page - add to current document
+        if current_writer is None:
+            current_writer = PdfWriter()
+        current_writer.add_page(reader.pages[page_num])
+
+    # Don't forget the last document (after the last separator or if no separator at end)
+    if current_writer and len(current_writer.pages) > 0:
+        buf = io.BytesIO()
+        current_writer.write(buf)
+        documents.append(buf.getvalue())
+
+    return documents
+
+
+async def process_scanned_pdf(pdf_path: str, progress_callback=None) -> dict:
+    """Full pipeline: detect separators, split, send each document to LexOffice."""
+    settings = await get_settings()
+
+    if not settings.get("smtp_server") or not settings.get("lexoffice_email"):
+        return {"error": "SMTP oder LexOffice-Email nicht konfiguriert", "total_pages": 0, "documents": 0, "sent": 0, "errors": 1}
+
+    # Step 1: Detect separator pages (CPU-bound, run in thread)
+    if progress_callback:
+        progress_callback("status", 0, 0, "Analysiere PDF...")
+
+    separator_pages = await asyncio.to_thread(
+        detect_separator_pages, pdf_path, progress_callback
+    )
+
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+
+    if not separator_pages:
+        # No separators found - treat entire PDF as one document
+        if progress_callback:
+            progress_callback("status", 0, 0, "Keine Trennseiten gefunden - sende gesamte PDF als ein Dokument")
+
+    # Step 2: Split PDF
+    if progress_callback:
+        progress_callback("status", 0, 0, f"{len(separator_pages)} Trennseite(n) erkannt, splitte PDF...")
+
+    documents = await asyncio.to_thread(split_pdf, pdf_path, separator_pages)
+
+    if not documents:
+        return {"error": "Keine Dokumente nach dem Splitting gefunden", "total_pages": total_pages, "documents": 0, "sent": 0, "errors": 1}
+
+    # Step 3: Send each document to LexOffice
+    if progress_callback:
+        progress_callback("status", 0, 0, f"{len(documents)} Dokument(e) erkannt, starte Versand...")
+
+    sent = 0
+    errors = 0
+    smtp_conn = None
+
+    try:
+        smtp_conn = _connect_smtp(settings)
+
+        for i, doc_bytes in enumerate(documents):
+            try:
+                if progress_callback:
+                    progress_callback("send", i + 1, len(documents))
+
+                filename = f"Scan_Dokument_{i + 1}.pdf"
+                msg = _build_forward_email(
+                    from_addr=settings["smtp_username"],
+                    to_addr=settings["lexoffice_email"],
+                    original_subject=f"Scan-Upload Dokument {i + 1}/{len(documents)}",
+                    original_from="Scan-Upload",
+                    attachments=[(filename, doc_bytes)],
+                )
+                smtp_conn.send_message(msg)
+                sent += 1
+
+                await add_log_entry(
+                    email_subject=f"Scan-Upload Dokument {i + 1}/{len(documents)}",
+                    email_from="Scan-Upload",
+                    attachments_count=1,
+                    status="success",
+                )
+
+            except Exception as e:
+                errors += 1
+                logger.error(f"Fehler beim Senden von Dokument {i + 1}: {e}")
+                await add_log_entry(
+                    email_subject=f"Scan-Upload Dokument {i + 1}/{len(documents)}",
+                    email_from="Scan-Upload",
+                    attachments_count=1,
+                    status="error",
+                    error_message=str(e),
+                )
+
+    except Exception as e:
+        logger.error(f"SMTP-Verbindungsfehler: {e}")
+        return {
+            "error": f"SMTP-Verbindungsfehler: {e}",
+            "total_pages": total_pages,
+            "documents": len(documents),
+            "sent": sent,
+            "errors": errors + 1,
+        }
+    finally:
+        if smtp_conn:
+            try:
+                smtp_conn.quit()
+            except Exception:
+                pass
+
+    return {
+        "total_pages": total_pages,
+        "separator_pages": len(separator_pages),
+        "documents": len(documents),
+        "sent": sent,
+        "errors": errors,
+    }
+
+
+def _centered_textbox(page, y, text, fontsize, color):
+    """Insert centered text using textbox across full page width."""
+    rect = fitz.Rect(40, y - fontsize, 555, y + fontsize)
+    page.insert_textbox(
+        rect, text,
+        fontsize=fontsize,
+        fontname="helv",
+        color=color,
+        align=fitz.TEXT_ALIGN_CENTER,
+    )
+
+
+def generate_separator_pdf() -> bytes:
+    """Generate a printable A4 PDF with QR code for use as separator page."""
+    # Generate QR code image
+    qr = qrcode.QRCode(
+        version=2,
+        error_correction=ERROR_CORRECT_H,
+        box_size=10,
+        border=4,
+    )
+    qr.add_data(SEPARATOR_QR_CONTENT)
+    qr.make(fit=True)
+    qr_img = qr.make_image(fill_color="black", back_color="white").convert("RGB")
+
+    # Create A4 PDF with PyMuPDF
+    doc = fitz.open()
+    page = doc.new_page(width=595, height=842)  # A4 in points
+
+    # Title text
+    _centered_textbox(page, 120, "TRENNSEITE", 36, (0, 0, 0))
+    _centered_textbox(page, 170, "LexOffice Belegimport", 16, (0.4, 0.4, 0.4))
+
+    # Insert QR code image centered
+    qr_bytes = io.BytesIO()
+    qr_img.save(qr_bytes, format="PNG")
+    qr_bytes.seek(0)
+
+    qr_size = 200
+    x_center = (595 - qr_size) / 2
+    y_center = 250
+    rect = fitz.Rect(x_center, y_center, x_center + qr_size, y_center + qr_size)
+    page.insert_image(rect, stream=qr_bytes.getvalue())
+
+    # Description text below QR
+    _centered_textbox(page, y_center + qr_size + 40, "Dieses Blatt zwischen Dokumente legen.", 14, (0.3, 0.3, 0.3))
+    _centered_textbox(page, y_center + qr_size + 70, "Es wird beim Scan-Upload automatisch erkannt und entfernt.", 12, (0.4, 0.4, 0.4))
+
+    # Dashed line border
+    border_rect = fitz.Rect(40, 40, 555, 802)
+    page.draw_rect(border_rect, color=(0.7, 0.7, 0.7), width=1)
+
+    # Bottom info
+    _centered_textbox(page, 770, "--- Nicht entfernen - wird automatisch erkannt ---", 10, (0.5, 0.5, 0.5))
+
+    pdf_bytes = doc.tobytes()
+    doc.close()
+    return pdf_bytes