# belege-import/app/ftp_processor.py
"""FTP / SFTP file import processor.
Same design as smb_processor but for FTP (passive, unencrypted) and SFTP (SSH).
Reads PDF files from a remote source folder, forwards them via SMTP, then moves
them to a processed folder.
"""
import asyncio
import ftplib
import io
import logging
import os
import posixpath
import tempfile
import paramiko
from app.database import get_settings, add_log_entry, get_import_email
from app.mail_processor import _connect_smtp, _build_forward_email, _send_with_log
from app.scanner import detect_separator_pages, split_pdf
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Generic adapter interface
# ---------------------------------------------------------------------------
class _FtpAdapter:
    """Common interface for FTP and SFTP backends.

    Concrete subclasses implement remote listing, reading, directory
    creation and renaming. ``close()`` is a no-op by default so subclasses
    only override it when they hold a real connection.
    """
    def list_pdfs(self, path: str) -> list[str]:
        # Return bare file names (not full paths) of PDFs directly in `path`.
        raise NotImplementedError
    def list_dirs(self, path: str, max_depth: int = 5) -> list[str]:
        # Return sub-directory paths relative to `path`, up to `max_depth` levels.
        raise NotImplementedError
    def read_file(self, path: str) -> bytes:
        # Return the complete contents of the remote file at `path`.
        raise NotImplementedError
    def ensure_dir(self, path: str):
        # Create the directory tree `path` if it does not already exist.
        raise NotImplementedError
    def stat_exists(self, path: str) -> bool:
        # True if a file or directory exists at `path`.
        raise NotImplementedError
    def rename(self, src: str, dst: str):
        # Move/rename a remote file from `src` to `dst`.
        raise NotImplementedError
    def close(self):
        # Release the underlying connection; default is a no-op.
        pass
# ---------------------------------------------------------------------------
# FTP (passive, unencrypted)
# ---------------------------------------------------------------------------
class _PlainFtpAdapter(_FtpAdapter):
    """Adapter for plain (unencrypted) FTP in passive mode.

    FTP's working directory is stateful and shared across all commands on
    the control connection, so every public method first resets the CWD to
    the directory that was current right after login (``_initial_cwd``) and
    resolves paths relative to it.
    """
    def __init__(self, server: str, port: int, username: str, password: str):
        self.ftp = ftplib.FTP()
        self.ftp.connect(server, port, timeout=15)
        self.ftp.login(username or "anonymous", password or "")
        self.ftp.set_pasv(True)
        # Remember initial CWD - all subsequent operations should resolve relative to this
        try:
            self._initial_cwd = self.ftp.pwd()
        except Exception:
            self._initial_cwd = None
        logger.debug(f"FTP initial CWD: {self._initial_cwd}")
    def _reset_cwd(self):
        """Reset CWD to initial directory after stateful operations."""
        if self._initial_cwd:
            try:
                self.ftp.cwd(self._initial_cwd)
            except Exception:
                pass
    def list_pdfs(self, path: str) -> list[str]:
        """List PDF files via LIST (more reliable than NLST across FTP servers)."""
        self._reset_cwd()
        target = path if path else "."
        lines = []
        try:
            self.ftp.retrlines(f"LIST {target}", lines.append)
        except ftplib.error_perm:
            return []
        files = []
        for line in lines:
            if not line or line[0:1] == "d":
                continue  # skip directories
            # Unix-style listing: 8 metadata columns, then the name
            # (maxsplit=8 keeps names containing spaces intact).
            parts = line.split(maxsplit=8)
            if len(parts) < 9:
                continue
            name = parts[-1]
            if name.lower().endswith(".pdf") and not name.startswith("."):
                files.append(name)
        return sorted(files)
    def list_dirs(self, path: str, max_depth: int = 5) -> list[str]:
        """Return sub-directory paths (relative to `path`) up to `max_depth` deep."""
        self._reset_cwd()
        base = path or ""
        logger.debug(f"FTP list_dirs: base={base!r}, max_depth={max_depth}")
        return self._list_dirs_rec(base, max_depth, 0, "")
    def _list_dirs_rec(self, base: str, max_depth: int, depth: int, prefix: str) -> list[str]:
        """Recursive helper for list_dirs; `prefix` is the relative path so far."""
        result = []
        try:
            entries = []
            target = base if base else "."
            self.ftp.retrlines(f"LIST {target}", entries.append)
            logger.debug(f"FTP LIST {target!r} -> {len(entries)} entries")
        except ftplib.error_perm as e:
            logger.warning(f"FTP LIST {base!r} failed: {e}")
            return []
        for line in entries:
            # Try to detect dirs (line starts with 'd') - works for unix-style listings
            if not line or not line[0:1] == "d":
                continue
            parts = line.split(maxsplit=8)
            if len(parts) < 9:
                continue
            name = parts[-1]
            if name in (".", "..") or name.startswith("."):
                continue
            rel = f"{prefix}/{name}" if prefix else name
            result.append(rel)
            if depth < max_depth - 1:
                sub = posixpath.join(base, name) if base else name
                result.extend(self._list_dirs_rec(sub, max_depth, depth + 1, rel))
        return result
    def read_file(self, path: str) -> bytes:
        """Download the file at `path` and return its bytes."""
        self._reset_cwd()
        buf = io.BytesIO()
        self.ftp.retrbinary(f"RETR {path}", buf.write)
        return buf.getvalue()
    def ensure_dir(self, path: str):
        """Create directory tree, walking step by step from initial CWD.
        IMPORTANT: FTP's cwd() is stateful - it changes the current working
        directory for ALL subsequent operations. We must walk the path one
        segment at a time relative to the current position, not concatenate
        and re-cwd from initial each iteration.
        """
        if not path:
            return
        try:
            self._reset_cwd()
            parts = [p for p in path.split("/") if p]
            for p in parts:
                try:
                    self.ftp.cwd(p)
                except ftplib.error_perm:
                    # Doesn't exist - create and enter
                    try:
                        self.ftp.mkd(p)
                        self.ftp.cwd(p)
                    except ftplib.error_perm as e:
                        logger.warning(f"FTP mkd({p}) failed: {e}")
                        return
        finally:
            self._reset_cwd()
    def stat_exists(self, path: str) -> bool:
        """Check if a file or directory exists at path."""
        # Fix: reset to the initial CWD first, like every other public
        # method, so `path` is not resolved against a stale directory.
        self._reset_cwd()
        try:
            # SIZE succeeds only for regular files on most servers.
            self.ftp.size(path)
            return True
        except (ftplib.error_perm, ftplib.error_temp):
            pass
        # Try as directory - cwd then immediately reset
        try:
            self.ftp.cwd(path)
            self._reset_cwd()
            return True
        except ftplib.error_perm:
            self._reset_cwd()
            return False
    def rename(self, src: str, dst: str):
        """Rename/move `src` to `dst` (both relative to the initial CWD)."""
        self._reset_cwd()
        self.ftp.rename(src, dst)
    def close(self):
        """Close the control connection; fall back to a hard close on error."""
        try:
            self.ftp.quit()
        except Exception:
            try:
                self.ftp.close()
            except Exception:
                pass
# ---------------------------------------------------------------------------
# SFTP (paramiko)
# ---------------------------------------------------------------------------
class _SftpAdapter(_FtpAdapter):
    """Adapter for SFTP (SSH file transfer) via paramiko.

    Unlike plain FTP there is no shared stateful CWD: paths are handed to
    the server as given, with empty paths resolved to the login directory.
    """
    def __init__(self, server: str, port: int, username: str, password: str):
        self.transport = paramiko.Transport((server, port))
        self.transport.connect(username=username, password=password)
        self.sftp = paramiko.SFTPClient.from_transport(self.transport)
    def _resolve(self, path: str) -> str:
        """Resolve path - empty/None means user's home/root directory."""
        if not path:
            try:
                return self.sftp.normalize(".")
            except IOError:
                return "."
        return path
    def list_pdfs(self, path: str) -> list[str]:
        """Return bare names of PDF files directly inside `path` (sorted)."""
        try:
            entries = self.sftp.listdir(self._resolve(path))
        except IOError:
            return []
        return sorted(
            e for e in entries if e.lower().endswith(".pdf") and not e.startswith(".")
        )
    def list_dirs(self, path: str, max_depth: int = 5) -> list[str]:
        """Return sub-directory paths (relative to `path`) up to `max_depth` deep."""
        base = self._resolve(path)
        logger.debug(f"SFTP list_dirs: base={base!r}, max_depth={max_depth}")
        return self._list_dirs_rec(base, max_depth, 0, "")
    def _list_dirs_rec(self, base: str, max_depth: int, depth: int, prefix: str) -> list[str]:
        """Recursive helper for list_dirs; `prefix` is the relative path so far."""
        from stat import S_ISDIR
        result = []
        try:
            entries = self.sftp.listdir_attr(base)
            logger.debug(f"SFTP listdir_attr({base!r}) -> {[e.filename for e in entries]}")
        except IOError as e:
            logger.warning(f"SFTP listdir_attr({base!r}) failed: {e}")
            return result
        for entry in entries:
            # Skip hidden entries and the pseudo-dirs "." / "..".
            if entry.filename.startswith(".") or entry.filename in ("..", "."):
                continue
            if entry.st_mode and S_ISDIR(entry.st_mode):
                rel = f"{prefix}/{entry.filename}" if prefix else entry.filename
                result.append(rel)
                if depth < max_depth - 1:
                    sub = posixpath.join(base, entry.filename)
                    result.extend(self._list_dirs_rec(sub, max_depth, depth + 1, rel))
        return result
    def read_file(self, path: str) -> bytes:
        """Download the file at `path` and return its bytes."""
        with self.sftp.open(path, "rb") as f:
            return f.read()
    def ensure_dir(self, path: str):
        """Create the directory tree `path`, one segment at a time.

        Fix: a leading '/' is preserved, so absolute paths are created in
        place instead of being silently re-rooted at the login directory.
        """
        if not path:
            return
        root = "/" if path.startswith("/") else ""
        parts = [p for p in path.split("/") if p]
        cur = ""
        for p in parts:
            cur = f"{cur}/{p}" if cur else f"{root}{p}"
            try:
                self.sftp.stat(cur)
            except IOError:
                try:
                    self.sftp.mkdir(cur)
                except IOError:
                    # Best-effort: concurrent creation or missing permission;
                    # a later operation on the path will surface a real error.
                    pass
    def stat_exists(self, path: str) -> bool:
        """True if a file or directory exists at `path`."""
        try:
            self.sftp.stat(path)
            return True
        except IOError:
            return False
    def rename(self, src: str, dst: str):
        """Rename/move `src` to `dst`."""
        self.sftp.rename(src, dst)
    def close(self):
        """Close the SFTP channel and the underlying SSH transport."""
        try:
            self.sftp.close()
        except Exception:
            pass
        try:
            self.transport.close()
        except Exception:
            pass
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_adapter(settings: dict) -> _FtpAdapter:
    """Construct the adapter matching the configured protocol.

    ``ftp_protocol`` selects SFTP (default) or plain FTP; the port falls
    back to the protocol's standard port (22 / 21) when unset.
    """
    server = settings["ftp_server"]
    user = settings.get("ftp_username", "")
    pwd = settings.get("ftp_password", "")
    use_sftp = settings.get("ftp_protocol", "sftp").lower() == "sftp"
    port = int(settings.get("ftp_port") or (22 if use_sftp else 21))
    adapter_cls = _SftpAdapter if use_sftp else _PlainFtpAdapter
    return adapter_cls(server, port, user, pwd)
def _join_path(*parts: str) -> str:
"""Join FTP/SFTP path segments using forward slash."""
result = ""
for p in parts:
if not p:
continue
p = p.replace("\\", "/").strip("/")
if not p:
continue
result = f"{result}/{p}" if result else p
return result
def _move_with_dedup(adapter: _FtpAdapter, src: str, dest_dir: str, filename: str):
    """Move file to dest_dir, renaming if a duplicate exists.

    On a name collision the file is stored as ``name_1.ext``, ``name_2.ext``
    etc., taking the first suffix that is free on the server.
    """
    target = _join_path(dest_dir, filename)
    if adapter.stat_exists(target):
        stem, ext = os.path.splitext(filename)
        counter = 1
        candidate = _join_path(dest_dir, f"{stem}_{counter}{ext}")
        while adapter.stat_exists(candidate):
            counter += 1
            candidate = _join_path(dest_dir, f"{stem}_{counter}{ext}")
        target = candidate
    adapter.rename(src, target)
# ---------------------------------------------------------------------------
# Processing pipeline
# ---------------------------------------------------------------------------
async def _process_ftp_folder(
    smtp_conn, settings: dict, adapter: _FtpAdapter,
    source_path: str, processed_path: str,
    import_email: str, beleg_type: str, mode: str,
) -> dict:
    """Process one FTP folder pair. Returns counts dict.

    Reads every PDF in `source_path`, forwards it over the open SMTP
    connection to `import_email` (splitting at separator pages when
    mode == "separator"), logs the outcome, then moves the file into
    `processed_path` with duplicate-safe renaming.

    Returns: {"processed": int, "skipped": int, "errors": int}.
    """
    smtp_from = settings.get("smtp_username", "")
    protocol = settings.get("ftp_protocol", "sftp").upper()
    processed = 0
    skipped = 0
    errors = 0
    await asyncio.to_thread(adapter.ensure_dir, processed_path)
    pdf_files = await asyncio.to_thread(adapter.list_pdfs, source_path)
    if not pdf_files:
        logger.info(f"Keine PDF-Dateien im {protocol}-Ordner '{source_path}' ({beleg_type})")
        return {"processed": 0, "skipped": 0, "errors": 0}
    logger.info(f"{len(pdf_files)} PDF-Datei(en) im {protocol}-Ordner '{source_path}' ({beleg_type})")
    for filename in pdf_files:
        file_path = _join_path(source_path, filename)
        try:
            pdf_data = await asyncio.to_thread(adapter.read_file, file_path)
            if mode == "separator":
                # Separator detection needs a local file, so spool to a temp
                # file; delete=False because it is re-opened by path below.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(pdf_data)
                    tmp_path = tmp.name
                try:
                    separator_pages = await asyncio.to_thread(detect_separator_pages, tmp_path, None)
                    documents = await asyncio.to_thread(split_pdf, tmp_path, separator_pages)
                finally:
                    os.unlink(tmp_path)
                if not documents:
                    # Nothing to forward; leave the file in the source folder.
                    skipped += 1
                    continue
                smtp_log_parts = []
                for i, doc_bytes in enumerate(documents):
                    doc_filename = f"{os.path.splitext(filename)[0]}_Teil_{i + 1}.pdf"
                    # Fix: include the actual source filename instead of the
                    # "(unknown)" placeholder so subjects/logs are traceable.
                    subject = f"{protocol}-Import: {filename} (Dokument {i + 1}/{len(documents)})"
                    msg = _build_forward_email(
                        from_addr=smtp_from,
                        to_addr=import_email,
                        original_subject=subject,
                        original_from=f"{protocol}-Import",
                        attachments=[(doc_filename, doc_bytes)],
                    )
                    smtp_log_parts.append(_send_with_log(smtp_conn, msg))
                await add_log_entry(
                    email_subject=f"{protocol}: {filename}",
                    email_from=f"{protocol}-Import",
                    attachments_count=len(documents),
                    status="success",
                    sent_to=import_email,
                    smtp_log="\n---\n".join(smtp_log_parts),
                    beleg_type=beleg_type,
                )
                logger.info(f"{protocol} verarbeitet ({beleg_type}): {filename} -> {len(documents)} Dokument(e)")
            else:
                # Forward mode: one email per PDF, file attached unchanged.
                msg = _build_forward_email(
                    from_addr=smtp_from,
                    to_addr=import_email,
                    original_subject=f"{protocol}-Import: {filename}",
                    original_from=f"{protocol}-Import",
                    attachments=[(filename, pdf_data)],
                )
                smtp_log = _send_with_log(smtp_conn, msg)
                await add_log_entry(
                    email_subject=f"{protocol}: {filename}",
                    email_from=f"{protocol}-Import",
                    attachments_count=1,
                    status="success",
                    sent_to=import_email,
                    smtp_log=smtp_log,
                    beleg_type=beleg_type,
                )
                logger.info(f"{protocol} verarbeitet ({beleg_type}): {filename}")
            # Move only after a successful send, so failures stay in source.
            await asyncio.to_thread(_move_with_dedup, adapter, file_path, processed_path, filename)
            processed += 1
        except Exception as e:
            errors += 1
            logger.error(f"Fehler bei {protocol}-Datei {filename}: {e}")
            try:
                await add_log_entry(
                    email_subject=f"{protocol}: {filename}",
                    email_from=f"{protocol}-Import",
                    attachments_count=0,
                    status="error",
                    error_message=str(e),
                    beleg_type=beleg_type,
                )
            except Exception:
                # Logging must never mask the original processing error.
                pass
    return {"processed": processed, "skipped": skipped, "errors": errors}
async def process_ftp() -> dict:
    """Process PDF files from FTP/SFTP server - main pipeline.

    Opens one adapter and one SMTP connection, processes the "eingang"
    folder pair and optionally the "ausgang" pair, and always closes both
    connections. Returns {"processed", "skipped", "errors"} totals, plus an
    "error" key when configuration is missing or the connection failed.
    """
    settings = await get_settings()
    # Feature flag: silently do nothing unless FTP import is enabled.
    if settings.get("ftp_enabled") != "true":
        return {"processed": 0, "skipped": 0, "errors": 0}
    if not settings.get("ftp_server"):
        return {"processed": 0, "skipped": 0, "errors": 0, "error": "FTP nicht konfiguriert"}
    import_email_eingang = get_import_email(settings, "eingang")
    if not import_email_eingang:
        return {"processed": 0, "skipped": 0, "errors": 0, "error": "Import-Email nicht konfiguriert"}
    mode = settings.get("ftp_mode", "forward")
    protocol = settings.get("ftp_protocol", "sftp").upper()
    total = {"processed": 0, "skipped": 0, "errors": 0}
    smtp_conn = None
    adapter = None
    try:
        # Blocking connect calls run in a worker thread.
        adapter = await asyncio.to_thread(_make_adapter, settings)
        smtp_conn = _connect_smtp(settings)
        # Eingangsbelege
        source = settings.get("ftp_source_path", "")
        processed_path = settings.get("ftp_processed_path", "Verarbeitet")
        result = await _process_ftp_folder(
            smtp_conn, settings, adapter,
            source, processed_path,
            import_email_eingang, "eingang", mode,
        )
        for k in total:
            total[k] += result[k]
        # Ausgangsbelege (optional)
        import_email_ausgang = get_import_email(settings, "ausgang")
        source_ausgang = settings.get("ftp_source_path_ausgang", "")
        processed_ausgang = settings.get("ftp_processed_path_ausgang", "")
        if import_email_ausgang and source_ausgang:
            if not processed_ausgang:
                # Default processed folder lives below the source folder.
                processed_ausgang = source_ausgang + "/Verarbeitet"
            result = await _process_ftp_folder(
                smtp_conn, settings, adapter,
                source_ausgang, processed_ausgang,
                import_email_ausgang, "ausgang", mode,
            )
            for k in total:
                total[k] += result[k]
    except Exception as e:
        # Connection-level failure: record it and report via the return value.
        logger.error(f"{protocol}-Verbindungsfehler: {e}")
        try:
            await add_log_entry(
                email_subject="",
                email_from=f"{protocol}-Import",
                attachments_count=0,
                status="error",
                error_message=f"{protocol}-Verbindungsfehler: {e}",
            )
        except Exception:
            pass
        return {**total, "errors": total["errors"] + 1, "error": str(e)}
    finally:
        # Best-effort cleanup; runs on both success and error paths.
        if adapter:
            try:
                await asyncio.to_thread(adapter.close)
            except Exception:
                pass
        if smtp_conn:
            try:
                smtp_conn.quit()
            except Exception:
                pass
    logger.info(f"{protocol} fertig: {total['processed']} verarbeitet, {total['skipped']} übersprungen, {total['errors']} Fehler")
    return total
async def test_ftp_connection() -> dict:
    """Probe the FTP/SFTP connection and return only top-level folders.

    Used by the UI connection test; deeper levels are fetched lazily via
    list_ftp_subfolders().
    """
    settings = await get_settings()
    if not settings.get("ftp_server"):
        return {"success": False, "error": "FTP-Server nicht konfiguriert", "folders": []}
    adapter = None
    try:
        adapter = await asyncio.to_thread(_make_adapter, settings)
        top_level = await asyncio.to_thread(adapter.list_dirs, "", 1)
    except Exception as exc:
        logger.error(f"FTP-Test fehlgeschlagen: {exc}")
        return {"success": False, "error": str(exc), "folders": []}
    else:
        return {"success": True, "folders": sorted(top_level)}
    finally:
        if adapter:
            try:
                await asyncio.to_thread(adapter.close)
            except Exception:
                pass
async def create_ftp_folder(folder_path: str) -> dict:
    """Create a folder (including parents) on the FTP/SFTP server."""
    settings = await get_settings()
    if not settings.get("ftp_server"):
        return {"success": False, "error": "FTP nicht konfiguriert"}
    if not folder_path or not folder_path.strip():
        return {"success": False, "error": "Ordnername darf nicht leer sein"}
    # Normalize Windows-style separators to forward slashes.
    normalized = folder_path.strip().replace("\\", "/")
    adapter = None
    try:
        adapter = await asyncio.to_thread(_make_adapter, settings)
        await asyncio.to_thread(adapter.ensure_dir, normalized)
    except Exception as exc:
        logger.error(f"FTP-Ordner erstellen fehlgeschlagen: {exc}")
        return {"success": False, "error": str(exc)}
    else:
        return {"success": True}
    finally:
        if adapter:
            try:
                await asyncio.to_thread(adapter.close)
            except Exception:
                pass
async def list_ftp_folders() -> dict:
    """Return the sorted top-level folder list from the FTP/SFTP server.

    Silent variant of test_ftp_connection(): any failure (including a
    missing configuration) yields an empty list instead of an error.
    """
    settings = await get_settings()
    if not settings.get("ftp_server"):
        return {"folders": []}
    adapter = None
    try:
        adapter = await asyncio.to_thread(_make_adapter, settings)
        top_level = await asyncio.to_thread(adapter.list_dirs, "", 1)
    except Exception:
        return {"folders": []}
    else:
        return {"folders": sorted(top_level)}
    finally:
        if adapter:
            try:
                await asyncio.to_thread(adapter.close)
            except Exception:
                pass
async def list_ftp_subfolders(parent_path: str) -> dict:
    """List direct subfolders of a path (one level deep, for lazy tree expansion).

    Returns {"success": bool, "folders": [full paths], "error": str?}; the
    adapter yields names relative to `parent_path`, which are re-prefixed
    here so the frontend receives full paths.
    """
    settings = await get_settings()
    if not settings.get("ftp_server"):
        return {"success": False, "error": "FTP nicht konfiguriert", "folders": []}
    adapter = None
    try:
        adapter = await asyncio.to_thread(_make_adapter, settings)
        rel_folders = await asyncio.to_thread(adapter.list_dirs, parent_path, 1)
        # Prefix with parent_path so the frontend has full paths.
        # Fix: strip trailing slashes first so "dir/" does not produce "dir//sub".
        if parent_path:
            base = parent_path.rstrip("/")
            folders = [f"{base}/{f}" for f in rel_folders]
        else:
            folders = rel_folders
        return {"success": True, "folders": sorted(folders)}
    except Exception as e:
        logger.error(f"FTP-Subfolder-Liste fehlgeschlagen: {e}")
        return {"success": False, "error": str(e), "folders": []}
    finally:
        if adapter:
            try:
                await asyncio.to_thread(adapter.close)
            except Exception:
                pass