# Listing metadata: 247 lines, 8.2 KiB, Python
import io
|
|
import os
|
|
import asyncio
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
from pyzbar.pyzbar import decode as decode_qr
|
|
from pypdf import PdfReader, PdfWriter
|
|
import qrcode
|
|
from qrcode.constants import ERROR_CORRECT_H
|
|
|
|
from app.database import get_settings, add_log_entry
|
|
from app.mail_processor import _connect_smtp, _build_forward_email
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# Payload of the separator-page QR code: generate_separator_pdf() encodes it
# and detect_separator_pages() compares decoded QR data against it verbatim.
SEPARATOR_QR_CONTENT = "BELEGIMPORT-TRENNUNG"
|
|
# Upload directory, taken from the UPLOAD_DIR environment variable with
# "/data/uploads" as fallback. Not referenced by the code visible in this
# file; presumably used by importers of this module - TODO confirm.
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads"))
|
|
|
|
|
|
def detect_separator_pages(pdf_path: str, progress_callback=None) -> list[int]:
    """Find the pages of a PDF that carry the separator QR code.

    Each page is rendered to a bitmap and scanned with pyzbar. A page counts
    as a separator as soon as one decoded code's UTF-8 payload equals
    ``SEPARATOR_QR_CONTENT``.

    Args:
        pdf_path: Path of the PDF file to scan.
        progress_callback: Optional callable, invoked as
            ``progress_callback("scan", current_page, total_pages)`` before
            each page is processed (``current_page`` is 1-based).

    Returns:
        Zero-based indices of all separator pages, in ascending order.
    """
    separator_pages = []
    doc = fitz.open(pdf_path)
    try:
        total = len(doc)

        for page_num in range(total):
            if progress_callback:
                progress_callback("scan", page_num + 1, total)

            # Render the page at 150 DPI (enough for QR detection).
            pix = doc[page_num].get_pixmap(dpi=150)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            for code in decode_qr(img):
                try:
                    data = code.data.decode("utf-8")
                except UnicodeDecodeError:
                    # A code with a non-UTF-8 payload cannot be our marker.
                    continue
                if data == SEPARATOR_QR_CONTENT:
                    separator_pages.append(page_num)
                    logger.debug("Trennseite erkannt auf Seite %d", page_num + 1)
                    # One match is enough; skip the remaining codes on this page.
                    break
    finally:
        # Release the document even when rendering or decoding raises.
        doc.close()

    return separator_pages
|
|
|
|
|
|
def split_pdf(pdf_path: str, separator_pages: list[int]) -> list[bytes]:
    """Cut a PDF into separate documents at the given separator pages.

    Consecutive non-separator pages form one output document. Separator pages
    themselves never appear in the output, and runs without any regular page
    (e.g. two separators in a row) produce no document.

    Args:
        pdf_path: Path of the PDF file to split.
        separator_pages: Zero-based indices of the pages acting as separators.

    Returns:
        The serialized PDF bytes of every resulting document, in page order.
    """
    source = PdfReader(pdf_path)
    skip = set(separator_pages)
    chunks = []

    def flush(pending):
        """Serialize *pending* into ``chunks`` when it holds at least one page."""
        if pending and len(pending.pages) > 0:
            out = io.BytesIO()
            pending.write(out)
            chunks.append(out.getvalue())

    writer = None
    for index, page in enumerate(source.pages):
        if index in skip:
            # Separator reached: close the document collected so far.
            flush(writer)
            writer = None
            continue

        # Regular page: start a new document lazily, then append.
        if writer is None:
            writer = PdfWriter()
        writer.add_page(page)

    # Pages after the last separator (or the whole file when there is none).
    flush(writer)
    return chunks
|
|
|
|
|
|
async def process_scanned_pdf(pdf_path: str, progress_callback=None) -> dict:
    """Run the full pipeline: detect separators, split, e-mail each document.

    Blocking work (QR scanning, PDF parsing/splitting and the SMTP connect,
    send and quit calls) is pushed to worker threads via
    ``asyncio.to_thread`` so the event loop stays responsive.

    Args:
        pdf_path: Path of the scanned PDF file.
        progress_callback: Optional callable receiving progress events:
            ``("status", 0, 0, message)``, ``("send", index, total)`` and,
            forwarded to ``detect_separator_pages`` (which runs in a worker
            thread), ``("scan", page, total)``.

    Returns:
        A summary dict with the keys ``total_pages``, ``separator_pages``,
        ``documents``, ``sent`` and ``errors``. When the pipeline aborts
        (missing configuration, nothing to send, SMTP failure) the dict
        additionally contains ``error`` with a message.
    """
    settings = await get_settings()

    if not settings.get("smtp_server") or not settings.get("import_email"):
        return {
            "error": "SMTP oder Import-Email nicht konfiguriert",
            "total_pages": 0,
            "separator_pages": 0,
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    def notify(*event):
        """Forward a progress event if a callback was supplied."""
        if progress_callback:
            progress_callback(*event)

    def count_pages():
        """Parse the PDF and return its page count (blocking)."""
        return len(PdfReader(pdf_path).pages)

    # Step 1: Detect separator pages (CPU-bound, run in thread).
    notify("status", 0, 0, "Analysiere PDF...")
    separator_pages = await asyncio.to_thread(
        detect_separator_pages, pdf_path, progress_callback
    )
    total_pages = await asyncio.to_thread(count_pages)

    # Step 2: Split the PDF. Exactly one status message is emitted so the
    # "no separators" hint is not immediately overwritten.
    if not separator_pages:
        notify(
            "status", 0, 0,
            "Keine Trennseiten gefunden - sende gesamte PDF als ein Dokument",
        )
    else:
        notify(
            "status", 0, 0,
            f"{len(separator_pages)} Trennseite(n) erkannt, splitte PDF...",
        )

    documents = await asyncio.to_thread(split_pdf, pdf_path, separator_pages)

    if not documents:
        return {
            "error": "Keine Dokumente nach dem Splitting gefunden",
            "total_pages": total_pages,
            "separator_pages": len(separator_pages),
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    # Step 3: Send each document via e-mail over one shared SMTP connection.
    total_docs = len(documents)
    notify("status", 0, 0, f"{total_docs} Dokument(e) erkannt, starte Versand...")

    sent = 0
    errors = 0
    smtp_conn = None

    try:
        smtp_conn = await asyncio.to_thread(_connect_smtp, settings)

        for number, doc_bytes in enumerate(documents, start=1):
            subject = f"Scan-Upload Dokument {number}/{total_docs}"
            try:
                notify("send", number, total_docs)

                msg = _build_forward_email(
                    from_addr=settings["smtp_username"],
                    to_addr=settings["import_email"],
                    original_subject=subject,
                    original_from="Scan-Upload",
                    attachments=[(f"Scan_Dokument_{number}.pdf", doc_bytes)],
                )
                await asyncio.to_thread(smtp_conn.send_message, msg)
                sent += 1

                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="success",
                )
            except Exception as e:
                # A failure of one document must not stop the remaining ones.
                errors += 1
                logger.error("Fehler beim Senden von Dokument %d: %s", number, e)
                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="error",
                    error_message=str(e),
                )

    except Exception as e:
        # Reached when connecting fails or when the per-document error
        # handling itself raises (e.g. the log entry cannot be written).
        logger.error("SMTP-Verbindungsfehler: %s", e)
        return {
            "error": f"SMTP-Verbindungsfehler: {e}",
            "total_pages": total_pages,
            "separator_pages": len(separator_pages),
            "documents": total_docs,
            "sent": sent,
            "errors": errors + 1,
        }
    finally:
        if smtp_conn is not None:
            try:
                await asyncio.to_thread(smtp_conn.quit)
            except Exception as e:
                # Best effort: the connection may already be gone.
                logger.debug("SMTP quit fehlgeschlagen: %s", e)

    return {
        "total_pages": total_pages,
        "separator_pages": len(separator_pages),
        "documents": total_docs,
        "sent": sent,
        "errors": errors,
    }
|
|
|
|
|
|
def _centered_textbox(page, y, text, fontsize, color):
    """Insert horizontally centered Helvetica text on *page*.

    The text box spans the page width minus a 40 pt margin on each side and
    extends ``fontsize`` points above and below *y*.

    Args:
        page: The PyMuPDF page to draw on.
        y: Vertical midpoint of the text box, in points from the page top.
        text: The text to insert.
        fontsize: Font size in points.
        color: Text color as an RGB tuple of floats.
    """
    margin = 40
    # Derive the right edge from the page so the box really spans the page
    # width; on an A4 page (595 pt wide) this yields the former value 555.
    rect = fitz.Rect(margin, y - fontsize, page.rect.width - margin, y + fontsize)
    unused = page.insert_textbox(
        rect,
        text,
        fontsize=fontsize,
        fontname="helv",
        color=color,
        align=fitz.TEXT_ALIGN_CENTER,
    )
    # insert_textbox reports a negative value when the text does not fit;
    # in that case nothing was drawn, so make the omission visible.
    if unused < 0:
        logger.warning("Text passt nicht in die Textbox und fehlt im PDF: %r", text)
|
|
|
|
|
|
def generate_separator_pdf() -> bytes:
    """Generate the printable A4 separator sheet as PDF bytes.

    The single page carries a title, a QR code encoding
    ``SEPARATOR_QR_CONTENT``, usage hints and a thin grey frame.

    Returns:
        The serialized one-page PDF document.
    """
    page_width, page_height = 595, 842  # A4 in points
    margin = 40
    qr_size = 200
    qr_top = 250

    # Build the QR code image. fit=True allows the library to pick a larger
    # symbol version should the payload not fit into the requested one.
    qr = qrcode.QRCode(
        version=2,
        error_correction=ERROR_CORRECT_H,
        box_size=10,
        border=4,
    )
    qr.add_data(SEPARATOR_QR_CONTENT)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white").convert("RGB")

    qr_png = io.BytesIO()
    qr_img.save(qr_png, format="PNG")

    # Create the A4 page with PyMuPDF.
    doc = fitz.open()
    try:
        page = doc.new_page(width=page_width, height=page_height)

        # Title text.
        _centered_textbox(page, 120, "TRENNSEITE", 36, (0, 0, 0))
        _centered_textbox(page, 170, "Belegimport", 16, (0.4, 0.4, 0.4))

        # QR code, horizontally centered.
        qr_left = (page_width - qr_size) / 2
        qr_rect = fitz.Rect(qr_left, qr_top, qr_left + qr_size, qr_top + qr_size)
        page.insert_image(qr_rect, stream=qr_png.getvalue())

        # Description text below the QR code.
        _centered_textbox(
            page, qr_top + qr_size + 40,
            "Dieses Blatt zwischen Dokumente legen.",
            14, (0.3, 0.3, 0.3),
        )
        _centered_textbox(
            page, qr_top + qr_size + 70,
            "Es wird beim Scan-Upload automatisch erkannt und entfernt.",
            12, (0.4, 0.4, 0.4),
        )

        # Thin solid grey frame along the page margin.
        frame = fitz.Rect(margin, margin, page_width - margin, page_height - margin)
        page.draw_rect(frame, color=(0.7, 0.7, 0.7), width=1)

        # Bottom info line.
        _centered_textbox(
            page, 770,
            "--- Nicht entfernen - wird automatisch erkannt ---",
            10, (0.5, 0.5, 0.5),
        )

        return doc.tobytes()
    finally:
        # Close the document even if drawing or serialization fails.
        doc.close()
|