first commit
This commit is contained in:
+246
@@ -0,0 +1,246 @@
|
||||
import io
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image
|
||||
from pyzbar.pyzbar import decode as decode_qr
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
import qrcode
|
||||
from qrcode.constants import ERROR_CORRECT_H
|
||||
|
||||
from app.database import get_settings, add_log_entry
|
||||
from app.mail_processor import _connect_smtp, _build_forward_email
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Exact QR payload that marks a separator page: it is encoded into the
# generated separator sheet and compared against decoded QR data when scanning.
SEPARATOR_QR_CONTENT = "LEXOFFICE-TRENNUNG"
|
||||
# Upload directory, taken from the UPLOAD_DIR environment variable and falling
# back to /data/uploads when the variable is not set.
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads"))
|
||||
|
||||
|
||||
def detect_separator_pages(pdf_path: str, progress_callback=None) -> list[int]:
    """Return the zero-based indices of all separator pages in a PDF.

    Every page is rendered to an image and scanned for QR codes. A page counts
    as a separator when one of its codes carries exactly the payload
    ``SEPARATOR_QR_CONTENT``.

    Args:
        pdf_path: Path of the PDF file to scan.
        progress_callback: Optional callable, invoked before each page is
            scanned as ``progress_callback("scan", current, total)`` where
            ``current`` is the one-based page number.

    Returns:
        Ascending list of zero-based page indices that are separator pages.
    """
    # Comparing raw bytes is equivalent to decoding as UTF-8 and comparing
    # strings, but cannot raise on payloads that are not valid UTF-8.
    separator_payload = SEPARATOR_QR_CONTENT.encode("utf-8")
    separator_pages = []

    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        for page_num in range(total):
            if progress_callback:
                progress_callback("scan", page_num + 1, total)

            # Render the page at 150 DPI (enough for QR detection).
            pix = doc[page_num].get_pixmap(dpi=150)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # One matching code is enough to mark the page as a separator.
            if any(code.data == separator_payload for code in decode_qr(img)):
                separator_pages.append(page_num)
                logger.debug("Trennseite erkannt auf Seite %s", page_num + 1)
    finally:
        # Release the document even when rendering or decoding fails.
        doc.close()

    return separator_pages
|
||||
|
||||
|
||||
def split_pdf(pdf_path: str, separator_pages: list[int]) -> list[bytes]:
    """Cut a PDF into separate documents at the given separator pages.

    Consecutive non-separator pages form one output document; the separator
    pages themselves are dropped. Runs without any page (for example two
    separators in a row, or a separator at the very start or end) produce
    no output.

    Args:
        pdf_path: Path of the PDF file to split.
        separator_pages: Zero-based indices of the pages acting as separators.

    Returns:
        The serialized PDF bytes of each resulting document, in page order.
    """
    source = PdfReader(pdf_path)
    separators = set(separator_pages)

    # First collect the page indices belonging to each output document.
    runs = []
    open_run = []
    for index in range(len(source.pages)):
        if index not in separators:
            open_run.append(index)
        elif open_run:
            runs.append(open_run)
            open_run = []
    if open_run:
        # Pages after the last separator (or the whole file if there is none).
        runs.append(open_run)

    def _serialize(indices):
        """Write the given source pages into a new PDF and return its bytes."""
        writer = PdfWriter()
        for index in indices:
            writer.add_page(source.pages[index])
        stream = io.BytesIO()
        writer.write(stream)
        return stream.getvalue()

    return [_serialize(run) for run in runs]
|
||||
|
||||
|
||||
async def _record_scan_log(subject, status, error_message=None):
    """Write one log entry for a scan document; never raises.

    A failing log write must not change the send statistics or abort the
    remaining documents, so it is only reported through the module logger.
    """
    entry = {
        "email_subject": subject,
        "email_from": "Scan-Upload",
        "attachments_count": 1,
        "status": status,
    }
    if error_message is not None:
        entry["error_message"] = error_message
    try:
        await add_log_entry(**entry)
    except Exception:
        logger.exception("Log-Eintrag für %r konnte nicht geschrieben werden", subject)


async def process_scanned_pdf(pdf_path: str, progress_callback=None) -> dict:
    """Run the full pipeline: detect separators, split, mail each document.

    Args:
        pdf_path: Path of the scanned PDF file.
        progress_callback: Optional callable receiving progress events:
            ``("status", 0, 0, message)`` for status texts,
            ``("scan", current, total)`` while pages are scanned (issued from
            a worker thread) and ``("send", current, total)`` before each
            document is sent.

    Returns:
        A dict with the keys ``total_pages``, ``documents``, ``sent`` and
        ``errors``. On success it also contains ``separator_pages`` (the
        number of separator pages found). When the run is aborted early it
        contains ``error`` with a message instead.
    """
    settings = await get_settings()

    if not settings.get("smtp_server") or not settings.get("lexoffice_email"):
        return {"error": "SMTP oder LexOffice-Email nicht konfiguriert", "total_pages": 0, "documents": 0, "sent": 0, "errors": 1}

    # Step 1: detect separator pages (CPU-bound, so run in a worker thread).
    if progress_callback:
        progress_callback("status", 0, 0, "Analysiere PDF...")

    separator_pages = await asyncio.to_thread(
        detect_separator_pages, pdf_path, progress_callback
    )

    total_pages = len(PdfReader(pdf_path).pages)

    if not separator_pages:
        # No separators found: the entire PDF is treated as one document.
        if progress_callback:
            progress_callback("status", 0, 0, "Keine Trennseiten gefunden - sende gesamte PDF als ein Dokument")

    # Step 2: split the PDF.
    if progress_callback:
        progress_callback("status", 0, 0, f"{len(separator_pages)} Trennseite(n) erkannt, splitte PDF...")

    documents = await asyncio.to_thread(split_pdf, pdf_path, separator_pages)

    if not documents:
        return {"error": "Keine Dokumente nach dem Splitting gefunden", "total_pages": total_pages, "documents": 0, "sent": 0, "errors": 1}

    # Step 3: send each document to LexOffice.
    if progress_callback:
        progress_callback("status", 0, 0, f"{len(documents)} Dokument(e) erkannt, starte Versand...")

    sent = 0
    errors = 0

    # Only a failure to establish the connection is reported as a connection
    # error. The SMTP calls block, so they run in a worker thread.
    try:
        smtp_conn = await asyncio.to_thread(_connect_smtp, settings)
    except Exception as e:
        logger.error(f"SMTP-Verbindungsfehler: {e}")
        return {
            "error": f"SMTP-Verbindungsfehler: {e}",
            "total_pages": total_pages,
            "documents": len(documents),
            "sent": sent,
            "errors": errors + 1,
        }

    try:
        for i, doc_bytes in enumerate(documents, start=1):
            subject = f"Scan-Upload Dokument {i}/{len(documents)}"
            try:
                if progress_callback:
                    progress_callback("send", i, len(documents))

                msg = _build_forward_email(
                    from_addr=settings["smtp_username"],
                    to_addr=settings["lexoffice_email"],
                    original_subject=subject,
                    original_from="Scan-Upload",
                    attachments=[(f"Scan_Dokument_{i}.pdf", doc_bytes)],
                )
                await asyncio.to_thread(smtp_conn.send_message, msg)
            except Exception as e:
                errors += 1
                logger.error(f"Fehler beim Senden von Dokument {i}: {e}")
                await _record_scan_log(subject, "error", str(e))
            else:
                # Counted and logged outside the try body so that a logging
                # problem can never turn a delivered mail into an error.
                sent += 1
                await _record_scan_log(subject, "success")
    finally:
        # Closing the connection is best-effort; the mails are already out.
        try:
            await asyncio.to_thread(smtp_conn.quit)
        except Exception:
            logger.debug("SMTP-Verbindung konnte nicht sauber beendet werden", exc_info=True)

    return {
        "total_pages": total_pages,
        "separator_pages": len(separator_pages),
        "documents": len(documents),
        "sent": sent,
        "errors": errors,
    }
|
||||
|
||||
|
||||
def _centered_textbox(page, y, text, fontsize, color):
    """Draw *text* horizontally centered on *page* around the height *y*.

    The text is placed with center alignment in a box that spans x = 40 to
    x = 555 and reaches one ``fontsize`` above and below ``y``, using the
    "helv" font in the given ``color``.
    """
    left_edge, right_edge = 40, 555
    box = fitz.Rect(left_edge, y - fontsize, right_edge, y + fontsize)
    text_options = {
        "fontsize": fontsize,
        "fontname": "helv",
        "color": color,
        "align": fitz.TEXT_ALIGN_CENTER,
    }
    page.insert_textbox(box, text, **text_options)
|
||||
|
||||
|
||||
def generate_separator_pdf() -> bytes:
    """Generate a printable A4 separator sheet and return it as PDF bytes.

    The single page shows a title, a QR code encoding
    ``SEPARATOR_QR_CONTENT``, short usage instructions and a thin grey frame.

    Returns:
        The serialized one-page PDF.
    """
    # Build the QR code image and serialize it as PNG for embedding.
    qr = qrcode.QRCode(
        version=2,
        error_correction=ERROR_CORRECT_H,
        box_size=10,
        border=4,
    )
    qr.add_data(SEPARATOR_QR_CONTENT)
    qr.make(fit=True)
    qr_img = qr.make_image(fill_color="black", back_color="white").convert("RGB")

    qr_png = io.BytesIO()
    qr_img.save(qr_png, format="PNG")

    page_width, page_height = 595, 842  # A4 in points
    margin = 40
    qr_size = 200
    qr_left = (page_width - qr_size) / 2  # horizontally centered
    qr_top = 250

    doc = fitz.open()
    try:
        page = doc.new_page(width=page_width, height=page_height)

        # Title text
        _centered_textbox(page, 120, "TRENNSEITE", 36, (0, 0, 0))
        _centered_textbox(page, 170, "LexOffice Belegimport", 16, (0.4, 0.4, 0.4))

        # QR code image
        qr_rect = fitz.Rect(qr_left, qr_top, qr_left + qr_size, qr_top + qr_size)
        page.insert_image(qr_rect, stream=qr_png.getvalue())

        # Description text below the QR code
        _centered_textbox(page, qr_top + qr_size + 40, "Dieses Blatt zwischen Dokumente legen.", 14, (0.3, 0.3, 0.3))
        _centered_textbox(page, qr_top + qr_size + 70, "Es wird beim Scan-Upload automatisch erkannt und entfernt.", 12, (0.4, 0.4, 0.4))

        # Thin solid grey frame around the page content
        border_rect = fitz.Rect(margin, margin, page_width - margin, page_height - margin)
        page.draw_rect(border_rect, color=(0.7, 0.7, 0.7), width=1)

        # Bottom info
        _centered_textbox(page, 770, "--- Nicht entfernen - wird automatisch erkannt ---", 10, (0.5, 0.5, 0.5))

        return doc.tobytes()
    finally:
        # Release the document even when drawing or serialization fails.
        doc.close()
|
||||
Reference in New Issue
Block a user