"""Scan upload pipeline: detect QR separator sheets, split the PDF, mail the parts."""

import io
import os
import asyncio
import logging
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image
from pyzbar.pyzbar import decode as decode_qr
from pypdf import PdfReader, PdfWriter
import qrcode
from qrcode.constants import ERROR_CORRECT_H

from app.database import get_settings, add_log_entry
from app.mail_processor import _connect_smtp, _build_forward_email

logger = logging.getLogger(__name__)

# Payload of the QR code printed on a separator sheet.
SEPARATOR_QR_CONTENT = "BELEGIMPORT-TRENNUNG"
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads"))


def detect_separator_pages(pdf_path: str, progress_callback=None) -> list[int]:
    """Scan each page for the separator QR code.

    Args:
        pdf_path: Path of the PDF to scan.
        progress_callback: Optional callable invoked once per page as
            ``progress_callback("scan", current_page, total_pages)`` with a
            1-based ``current_page``.

    Returns:
        Zero-based indices of all pages that carry a QR code whose payload
        equals ``SEPARATOR_QR_CONTENT``, in ascending order.
    """
    # Comparing raw bytes is equivalent to decoding the payload as UTF-8 and
    # comparing strings, but cannot raise on undecodable payloads.
    marker = SEPARATOR_QR_CONTENT.encode("utf-8")
    separator_pages = []

    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        for page_num in range(total):
            if progress_callback:
                progress_callback("scan", page_num + 1, total)

            # Render page as image at 150 DPI (enough for QR detection)
            pix = doc[page_num].get_pixmap(dpi=150)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            if any(code.data == marker for code in decode_qr(img)):
                separator_pages.append(page_num)
                logger.debug("Trennseite erkannt auf Seite %s", page_num + 1)
    finally:
        # Release the document even when rendering or decoding fails.
        doc.close()

    return separator_pages


def _writer_to_bytes(writer: PdfWriter) -> bytes:
    """Serialize a PdfWriter into an in-memory PDF byte string."""
    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()


def split_pdf(pdf_path: str, separator_pages: list[int]) -> list[bytes]:
    """Split a PDF at separator pages.

    Separator pages are excluded from the output. Consecutive separators, or
    a separator at the very start or end, do not produce empty documents.

    Args:
        pdf_path: Path of the PDF to split.
        separator_pages: Zero-based indices of the separator pages.

    Returns:
        One PDF byte string per run of non-separator pages, in page order.
    """
    reader = PdfReader(pdf_path)
    separator_set = set(separator_pages)

    documents = []
    # A writer exists only while it holds at least one page, so a non-None
    # writer is never empty.
    current_writer = None

    for page_num, page in enumerate(reader.pages):
        if page_num in separator_set:
            # Separator page: finalize the document collected so far, if any.
            if current_writer is not None:
                documents.append(_writer_to_bytes(current_writer))
                current_writer = None
            continue

        # Regular page: add to the current document.
        if current_writer is None:
            current_writer = PdfWriter()
        current_writer.add_page(page)

    # Pages after the last separator (or the whole PDF if there was none).
    if current_writer is not None:
        documents.append(_writer_to_bytes(current_writer))

    return documents


def _count_pages(pdf_path: str) -> int:
    """Return the number of pages in the PDF at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)


async def process_scanned_pdf(pdf_path: str, progress_callback=None) -> dict:
    """Full pipeline: detect separators, split, send each document via email.

    Blocking work (PDF parsing and the synchronous SMTP calls) is pushed to
    worker threads with ``asyncio.to_thread`` so the event loop is not stalled.

    Args:
        pdf_path: Path of the uploaded PDF.
        progress_callback: Optional callable. It is invoked as
            ``("status", 0, 0, message)`` for status texts,
            ``("scan", current, total)`` per scanned page (from a worker
            thread, via ``detect_separator_pages``) and
            ``("send", current, total)`` per document being sent.

    Returns:
        A dict with ``total_pages``, ``documents``, ``sent`` and ``errors``.
        On success it also contains ``separator_pages`` (the count); on
        failure it contains an ``error`` message instead.
    """
    settings = await get_settings()
    if not settings.get("smtp_server") or not settings.get("import_email"):
        return {
            "error": "SMTP oder Import-Email nicht konfiguriert",
            "total_pages": 0,
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    # Step 1: Detect separator pages (CPU-bound, run in thread)
    if progress_callback:
        progress_callback("status", 0, 0, "Analysiere PDF...")

    separator_pages = await asyncio.to_thread(
        detect_separator_pages, pdf_path, progress_callback
    )
    total_pages = await asyncio.to_thread(_count_pages, pdf_path)

    # Step 2: Split PDF. Exactly one status message is emitted; previously
    # the "no separators" text was immediately overwritten by a
    # "0 Trennseite(n) erkannt" message.
    if progress_callback:
        if separator_pages:
            progress_callback(
                "status", 0, 0,
                f"{len(separator_pages)} Trennseite(n) erkannt, splitte PDF...",
            )
        else:
            # No separators found - treat entire PDF as one document
            progress_callback(
                "status", 0, 0,
                "Keine Trennseiten gefunden - sende gesamte PDF als ein Dokument",
            )

    documents = await asyncio.to_thread(split_pdf, pdf_path, separator_pages)

    if not documents:
        return {
            "error": "Keine Dokumente nach dem Splitting gefunden",
            "total_pages": total_pages,
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    # Step 3: Send each document via email
    if progress_callback:
        progress_callback(
            "status", 0, 0,
            f"{len(documents)} Dokument(e) erkannt, starte Versand...",
        )

    sent = 0
    errors = 0
    smtp_conn = None

    try:
        smtp_conn = await asyncio.to_thread(_connect_smtp, settings)

        for i, doc_bytes in enumerate(documents):
            subject = f"Scan-Upload Dokument {i + 1}/{len(documents)}"
            try:
                if progress_callback:
                    progress_callback("send", i + 1, len(documents))

                filename = f"Scan_Dokument_{i + 1}.pdf"
                msg = _build_forward_email(
                    from_addr=settings["smtp_username"],
                    to_addr=settings["import_email"],
                    original_subject=subject,
                    original_from="Scan-Upload",
                    attachments=[(filename, doc_bytes)],
                )
                await asyncio.to_thread(smtp_conn.send_message, msg)
                sent += 1

                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="success",
                )
            except Exception as e:
                # Per-document boundary: record the failure and continue with
                # the remaining documents.
                errors += 1
                logger.error("Fehler beim Senden von Dokument %s: %s", i + 1, e)
                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="error",
                    error_message=str(e),
                )

    except Exception as e:
        logger.error("SMTP-Verbindungsfehler: %s", e)
        return {
            "error": f"SMTP-Verbindungsfehler: {e}",
            "total_pages": total_pages,
            "documents": len(documents),
            "sent": sent,
            "errors": errors + 1,
        }
    finally:
        if smtp_conn is not None:
            try:
                await asyncio.to_thread(smtp_conn.quit)
            except Exception as e:
                # Best effort: a failed QUIT must not mask the real result.
                logger.debug(
                    "SMTP-Verbindung konnte nicht sauber geschlossen werden: %s", e
                )

    return {
        "total_pages": total_pages,
        "separator_pages": len(separator_pages),
        "documents": len(documents),
        "sent": sent,
        "errors": errors,
    }


def _centered_textbox(page, y, text, fontsize, color):
    """Insert horizontally centered text in a box spanning the page width.

    The box keeps a 40 pt margin on the left and right and extends
    ``fontsize`` points above and below ``y``.
    """
    margin = 40
    rect = fitz.Rect(margin, y - fontsize, page.rect.width - margin, y + fontsize)
    leftover = page.insert_textbox(
        rect,
        text,
        fontsize=fontsize,
        fontname="helv",
        color=color,
        align=fitz.TEXT_ALIGN_CENTER,
    )
    # insert_textbox reports a negative value when the text did not fit into
    # the rectangle; in that case nothing was drawn.
    if leftover < 0:
        logger.warning("Text passt nicht in Textbox: %r", text)


def generate_separator_pdf() -> bytes:
    """Generate a printable A4 PDF with QR code for use as separator page.

    Returns:
        The PDF file content as bytes.
    """
    # Generate QR code image
    qr = qrcode.QRCode(
        version=2,
        error_correction=ERROR_CORRECT_H,
        box_size=10,
        border=4,
    )
    qr.add_data(SEPARATOR_QR_CONTENT)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white").convert("RGB")

    qr_png = io.BytesIO()
    qr_img.save(qr_png, format="PNG")

    page_width, page_height = 595, 842  # A4 in points

    # Create A4 PDF with PyMuPDF
    doc = fitz.open()
    try:
        page = doc.new_page(width=page_width, height=page_height)

        # Title text
        _centered_textbox(page, 120, "TRENNSEITE", 36, (0, 0, 0))
        _centered_textbox(page, 170, "Belegimport", 16, (0.4, 0.4, 0.4))

        # Insert QR code image, horizontally centered
        qr_size = 200
        qr_left = (page_width - qr_size) / 2
        qr_top = 250
        qr_rect = fitz.Rect(qr_left, qr_top, qr_left + qr_size, qr_top + qr_size)
        page.insert_image(qr_rect, stream=qr_png.getvalue())

        # Description text below QR
        _centered_textbox(
            page, qr_top + qr_size + 40,
            "Dieses Blatt zwischen Dokumente legen.",
            14, (0.3, 0.3, 0.3),
        )
        _centered_textbox(
            page, qr_top + qr_size + 70,
            "Es wird beim Scan-Upload automatisch erkannt und entfernt.",
            12, (0.4, 0.4, 0.4),
        )

        # Thin grey border around the page content
        border_rect = fitz.Rect(40, 40, 555, 802)
        page.draw_rect(border_rect, color=(0.7, 0.7, 0.7), width=1)

        # Bottom info
        _centered_textbox(
            page, 770,
            "--- Nicht entfernen - wird automatisch erkannt ---",
            10, (0.5, 0.5, 0.5),
        )

        return doc.tobytes()
    finally:
        doc.close()