belege-import/app/scanner.py

247 lines
8.2 KiB
Python

import io
import os
import asyncio
import logging
from pathlib import Path
import fitz # PyMuPDF
from PIL import Image
from pyzbar.pyzbar import decode as decode_qr
from pypdf import PdfReader, PdfWriter
import qrcode
from qrcode.constants import ERROR_CORRECT_H
from app.database import get_settings, add_log_entry
from app.mail_processor import _connect_smtp, _build_forward_email
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Exact QR payload that marks a page as a separator sheet. The same string is
# encoded into the generated separator PDF and matched when scanning uploads.
SEPARATOR_QR_CONTENT = "LEXOFFICE-TRENNUNG"
# Upload directory; taken from the UPLOAD_DIR environment variable and
# defaulting to /data/uploads when the variable is not set.
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads"))
def detect_separator_pages(pdf_path: str, progress_callback=None) -> list[int]:
    """Find the pages of a PDF that carry the separator QR code.

    Every page is rendered to a bitmap and scanned for QR codes. A page counts
    as a separator when any code on it holds exactly ``SEPARATOR_QR_CONTENT``.

    Args:
        pdf_path: Path of the PDF file to scan.
        progress_callback: Optional callable invoked once per page as
            ``progress_callback("scan", current_page, total_pages)`` where
            ``current_page`` is 1-based.

    Returns:
        Zero-based indices of all separator pages, in ascending order.
    """
    # Compare the raw payload bytes: a code whose content is not valid UTF-8
    # can then simply never match, without any exception handling.
    expected_payload = SEPARATOR_QR_CONTENT.encode("utf-8")
    separator_pages: list[int] = []

    # The context manager closes the document even if rendering or QR
    # decoding raises part-way through the file.
    with fitz.open(pdf_path) as doc:
        total = len(doc)
        for page_num in range(total):
            if progress_callback:
                progress_callback("scan", page_num + 1, total)

            # 150 DPI is enough resolution for QR detection.
            pix = doc[page_num].get_pixmap(dpi=150)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # One matching code is enough; any() stops at the first hit.
            if any(code.data == expected_payload for code in decode_qr(img)):
                separator_pages.append(page_num)
                logger.debug("Trennseite erkannt auf Seite %s", page_num + 1)

    return separator_pages
def split_pdf(pdf_path: str, separator_pages: list[int]) -> list[bytes]:
    """Cut a PDF into separate documents at the given separator pages.

    Args:
        pdf_path: Path of the PDF file to split.
        separator_pages: Zero-based indices of the pages that act as dividers.
            These pages are dropped from the output.

    Returns:
        One serialized PDF (as bytes) per run of consecutive non-separator
        pages, in original page order. Empty runs (leading, trailing or
        back-to-back separators) produce no document.
    """
    source = PdfReader(pdf_path)
    page_count = len(source.pages)
    dividers = set(separator_pages)

    # First pass: bucket the page indices. A separator closes the current
    # bucket and opens a new one.
    groups: list[list[int]] = [[]]
    for index in range(page_count):
        if index in dividers:
            groups.append([])
        else:
            groups[-1].append(index)

    # Second pass: serialize every non-empty bucket into its own PDF.
    results = []
    for group in groups:
        if not group:
            continue
        writer = PdfWriter()
        for index in group:
            writer.add_page(source.pages[index])
        buffer = io.BytesIO()
        writer.write(buffer)
        results.append(buffer.getvalue())
    return results
async def process_scanned_pdf(pdf_path: str, progress_callback=None) -> dict:
    """Detect separators in a scanned PDF, split it and mail each part.

    Pipeline: load settings, find separator pages, split the PDF at them, then
    send every resulting document as an attachment over one SMTP connection
    and record a log entry per document.

    All blocking work (PDF parsing, SMTP connect/send/quit) is pushed to a
    worker thread with ``asyncio.to_thread`` so the event loop stays
    responsive while a large PDF or a slow mail server is being handled.

    Args:
        pdf_path: Path of the uploaded PDF.
        progress_callback: Optional callable receiving progress events:
            ``("status", 0, 0, message)``, ``("scan", page, total)`` and
            ``("send", document_number, document_count)``. The scan events
            are emitted from a worker thread.

    Returns:
        A dict with the counters ``total_pages``, ``documents``, ``sent`` and
        ``errors``. On success it also holds ``separator_pages``; on failure
        it holds ``error`` with a message instead.
    """

    def report(*event) -> None:
        # Forward a progress event only when a callback was supplied.
        if progress_callback:
            progress_callback(*event)

    settings = await get_settings()
    if not settings.get("smtp_server") or not settings.get("lexoffice_email"):
        return {
            "error": "SMTP oder LexOffice-Email nicht konfiguriert",
            "total_pages": 0,
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    # Step 1: detect separator pages (CPU-bound, run in a thread).
    report("status", 0, 0, "Analysiere PDF...")
    separator_pages = await asyncio.to_thread(
        detect_separator_pages, pdf_path, progress_callback
    )
    # Parsing the PDF for its page count is blocking too, so keep it off the loop.
    total_pages = await asyncio.to_thread(_count_pdf_pages, pdf_path)

    # Step 2: split the PDF. Emit exactly one status line; without separators
    # the whole file is forwarded as a single document.
    if separator_pages:
        report(
            "status", 0, 0,
            f"{len(separator_pages)} Trennseite(n) erkannt, splitte PDF...",
        )
    else:
        report(
            "status", 0, 0,
            "Keine Trennseiten gefunden - sende gesamte PDF als ein Dokument",
        )
    documents = await asyncio.to_thread(split_pdf, pdf_path, separator_pages)
    if not documents:
        return {
            "error": "Keine Dokumente nach dem Splitting gefunden",
            "total_pages": total_pages,
            "documents": 0,
            "sent": 0,
            "errors": 1,
        }

    # Step 3: send each document.
    doc_count = len(documents)
    report("status", 0, 0, f"{doc_count} Dokument(e) erkannt, starte Versand...")
    sent = 0
    errors = 0
    smtp_conn = None
    try:
        smtp_conn = await asyncio.to_thread(_connect_smtp, settings)
        for number, doc_bytes in enumerate(documents, start=1):
            subject = f"Scan-Upload Dokument {number}/{doc_count}"
            try:
                report("send", number, doc_count)
                msg = _build_forward_email(
                    from_addr=settings["smtp_username"],
                    to_addr=settings["lexoffice_email"],
                    original_subject=subject,
                    original_from="Scan-Upload",
                    attachments=[(f"Scan_Dokument_{number}.pdf", doc_bytes)],
                )
                await asyncio.to_thread(smtp_conn.send_message, msg)
                sent += 1
                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="success",
                )
            except Exception as e:
                # A failure for one document must not stop the remaining ones.
                errors += 1
                logger.error("Fehler beim Senden von Dokument %s: %s", number, e)
                await add_log_entry(
                    email_subject=subject,
                    email_from="Scan-Upload",
                    attachments_count=1,
                    status="error",
                    error_message=str(e),
                )
    except Exception as e:
        # Reached when the connection cannot be established, or when the
        # per-document error handling itself fails.
        logger.error("SMTP-Verbindungsfehler: %s", e)
        return {
            "error": f"SMTP-Verbindungsfehler: {e}",
            "total_pages": total_pages,
            "documents": doc_count,
            "sent": sent,
            "errors": errors + 1,
        }
    finally:
        if smtp_conn is not None:
            try:
                await asyncio.to_thread(smtp_conn.quit)
            except Exception:
                # Best effort: the mails are already handled, so a failing
                # QUIT is only worth a debug trace.
                logger.debug("SMTP quit failed", exc_info=True)

    return {
        "total_pages": total_pages,
        "separator_pages": len(separator_pages),
        "documents": doc_count,
        "sent": sent,
        "errors": errors,
    }


def _count_pdf_pages(pdf_path: str) -> int:
    """Return the number of pages in the PDF at *pdf_path*."""
    return len(PdfReader(pdf_path).pages)
def _centered_textbox(page, y, text, fontsize, color):
    """Draw one horizontally centered line of Helvetica text on *page*.

    The text is placed in a box spanning x = 40..555 and reaching one
    ``fontsize`` above and below ``y``.

    Args:
        page: Page object to draw on (must provide ``insert_textbox``).
        y: Vertical midpoint of the text box, in points.
        text: The text to draw.
        fontsize: Font size in points; also the half-height of the box.
        color: Text colour as passed through to ``insert_textbox``.
    """
    left_edge, right_edge = 40, 555
    box = fitz.Rect(left_edge, y - fontsize, right_edge, y + fontsize)
    page.insert_textbox(
        box,
        text,
        align=fitz.TEXT_ALIGN_CENTER,
        color=color,
        fontname="helv",
        fontsize=fontsize,
    )
def generate_separator_pdf() -> bytes:
    """Build the printable separator sheet as a one-page PDF.

    The sheet is 595 x 842 points (A4) and shows a title, a QR code encoding
    ``SEPARATOR_QR_CONTENT``, two lines of instructions, a light-grey frame
    and a footer note.

    Returns:
        The serialized PDF document as bytes.
    """
    # Build the QR code. fit=True lets the library choose a larger version
    # than the requested one if the payload does not fit.
    qr = qrcode.QRCode(
        version=2,
        error_correction=ERROR_CORRECT_H,
        box_size=10,
        border=4,
    )
    qr.add_data(SEPARATOR_QR_CONTENT)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white").convert("RGB")

    # Serialize the QR image to PNG so it can be embedded into the page.
    png_buffer = io.BytesIO()
    qr_img.save(png_buffer, format="PNG")

    page_width, page_height = 595, 842  # A4 in points
    qr_size = 200
    qr_left = (page_width - qr_size) / 2  # centre the code horizontally
    qr_top = 250

    # The context manager closes the document even if drawing fails.
    with fitz.open() as doc:
        page = doc.new_page(width=page_width, height=page_height)

        # Title text
        _centered_textbox(page, 120, "TRENNSEITE", 36, (0, 0, 0))
        _centered_textbox(page, 170, "LexOffice Belegimport", 16, (0.4, 0.4, 0.4))

        # QR code
        qr_rect = fitz.Rect(qr_left, qr_top, qr_left + qr_size, qr_top + qr_size)
        page.insert_image(qr_rect, stream=png_buffer.getvalue())

        # Instructions below the QR code
        qr_bottom = qr_top + qr_size
        _centered_textbox(
            page, qr_bottom + 40,
            "Dieses Blatt zwischen Dokumente legen.",
            14, (0.3, 0.3, 0.3),
        )
        _centered_textbox(
            page, qr_bottom + 70,
            "Es wird beim Scan-Upload automatisch erkannt und entfernt.",
            12, (0.4, 0.4, 0.4),
        )

        # Solid light-grey frame, 40 pt inside the page edges
        frame = fitz.Rect(40, 40, page_width - 40, page_height - 40)
        page.draw_rect(frame, color=(0.7, 0.7, 0.7), width=1)

        # Footer note
        _centered_textbox(
            page, 770,
            "--- Nicht entfernen - wird automatisch erkannt ---",
            10, (0.5, 0.5, 0.5),
        )

        return doc.tobytes()