1385 lines
54 KiB
Python
1385 lines
54 KiB
Python
import asyncio
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
import tempfile
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
from app.database import get_settings, save_settings, add_log_entry, is_invoice_downloaded, mark_invoice_downloaded
|
|
from app.mail_processor import _connect_smtp, _build_forward_email, _send_with_log
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
SESSION_DIR = Path(os.environ.get("AMAZON_SESSION_DIR", "/data/amazon_session"))
|
|
DEBUG_DIR = Path(os.environ.get("UPLOAD_DIR", "/data/uploads")) / "amazon_debug"
|
|
|
|
# Login state machine
|
|
_login_state = {"status": "idle", "message": ""}
|
|
_login_lock = asyncio.Lock()
|
|
_otp_future: asyncio.Future | None = None
|
|
_browser_context = None
|
|
_playwright_instance = None
|
|
|
|
# Process lock to prevent concurrent runs
|
|
_process_lock = asyncio.Lock()
|
|
# Flag: True while process_amazon is actively working (page consumed but session valid)
|
|
_processing_active = False
|
|
|
|
# Interactive login session (browser page kept alive for user interaction)
|
|
_interactive_page = None
|
|
|
|
|
|
async def _human_delay(min_s: float = 1.0, max_s: float = 3.0):
    """Pause for a random number of seconds drawn from [min_s, max_s].

    Spaces out browser actions so they look less machine-generated.
    """
    pause_seconds = random.uniform(min_s, max_s)
    await asyncio.sleep(pause_seconds)
|
|
|
|
|
|
async def _apply_stealth_to_context(context):
    """Install playwright-stealth on the whole browser context.

    Best effort: a missing package or any failure while applying stealth is
    logged as a warning and otherwise ignored.
    """
    try:
        # Imported lazily so the module still loads without the package.
        from playwright_stealth import Stealth

        await Stealth().apply_stealth_async(context)
        logger.info("Stealth erfolgreich auf Browser-Kontext angewendet")
    except ImportError:
        logger.warning("playwright-stealth nicht installiert, überspringe")
    except Exception as exc:
        logger.warning(f"Stealth konnte nicht angewendet werden: {exc}")
|
|
|
|
|
|
async def _add_virtual_authenticator(page):
    """Register a virtual WebAuthn authenticator on ``page`` via CDP.

    The intent is to keep passkey dialogs from appearing. Failures are only
    logged at debug level; the caller continues either way.
    """
    authenticator_options = {
        "protocol": "ctap2",
        "transport": "internal",
        "hasResidentKey": True,
        "hasUserVerification": True,
        "isUserVerified": True,
        "automaticPresenceSimulation": True,
    }
    try:
        cdp = await page.context.new_cdp_session(page)
        await cdp.send("WebAuthn.enable")
        await cdp.send(
            "WebAuthn.addVirtualAuthenticator",
            {"options": authenticator_options},
        )
        logger.debug("Virtueller WebAuthn-Authenticator hinzugefügt")
    except Exception as exc:
        logger.debug(f"Virtueller Authenticator fehlgeschlagen: {exc}")
|
|
|
|
|
|
async def _get_browser_context():
    """Return the shared persistent Chromium context, launching it on first use.

    The context stores its profile in ``SESSION_DIR`` so cookies survive
    restarts. Before launching, leftover Chromium singleton lock entries are
    removed so a profile left behind by a previous run can be reopened.

    Returns:
        The module-wide Playwright browser context.
    """
    global _browser_context, _playwright_instance

    if _browser_context is not None:
        try:
            # Liveness probe: reuse the context unless accessing it fails.
            # NOTE(review): confirm that `.pages` really raises once the
            # context has been closed; otherwise this probe never trips.
            _ = _browser_context.pages
            return _browser_context
        except Exception:
            _browser_context = None

    from playwright.async_api import async_playwright

    SESSION_DIR.mkdir(parents=True, exist_ok=True)

    # Clean up stale Chromium lock files from previous container runs.
    # Unlink unconditionally instead of testing `exists()` first: these
    # entries may be symlinks whose target no longer exists, and
    # Path.exists() reports False for a dangling symlink, which would leave
    # the stale lock in place.
    for lock_name in ("SingletonLock", "SingletonSocket", "SingletonCookie"):
        try:
            (SESSION_DIR / lock_name).unlink()
        except FileNotFoundError:
            continue
        except OSError as e:
            logger.warning(f"Lock-File {lock_name} konnte nicht entfernt werden: {e}")
            continue
        logger.info(f"Stale Lock-File entfernt: {lock_name}")

    if _playwright_instance is None:
        _playwright_instance = await async_playwright().start()

    _browser_context = await _playwright_instance.chromium.launch_persistent_context(
        user_data_dir=str(SESSION_DIR),
        headless=True,
        locale="de-DE",
        user_agent=(
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        ),
        viewport={"width": 1280, "height": 800},
        args=[
            "--disable-blink-features=AutomationControlled",
            "--disable-gpu",
            "--disable-dev-shm-usage",
            "--disable-extensions",
            "--disable-background-networking",
            "--disable-translate",
            "--no-first-run",
            "--no-sandbox",
        ],
    )
    await _apply_stealth_to_context(_browser_context)
    return _browser_context
|
|
|
|
|
|
async def close_browser_context():
    """Close the shared browser context and stop the Playwright driver.

    Both steps are best effort: a failure is logged at debug level and the
    corresponding module reference is cleared regardless, so the next call to
    ``_get_browser_context`` starts from scratch.
    """
    global _browser_context, _playwright_instance

    if _browser_context is not None:
        try:
            await _browser_context.close()
        except Exception as e:
            # Shutdown must not fail, but leave a trace for troubleshooting.
            logger.debug(f"Browser-Kontext konnte nicht geschlossen werden: {e}")
        _browser_context = None

    if _playwright_instance is not None:
        try:
            await _playwright_instance.stop()
        except Exception as e:
            logger.debug(f"Playwright konnte nicht gestoppt werden: {e}")
        _playwright_instance = None
|
|
|
|
|
|
def get_login_state() -> dict:
    """Return a shallow copy of the login state for polling clients."""
    return _login_state.copy()
|
|
|
|
|
|
async def _save_debug(page, name: str):
    """Dump the page's HTML and a full-page screenshot into ``DEBUG_DIR``.

    Before writing, the oldest entries are deleted until at most 48 remain,
    leaving room for the two new files. If preparing the directory fails,
    nothing is written. The HTML dump and the screenshot are attempted
    independently of each other.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    try:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)
        # Oldest first, so the head of the list is what gets pruned.
        by_age = sorted(DEBUG_DIR.iterdir(), key=lambda entry: entry.stat().st_mtime)
        surplus = max(0, len(by_age) - 48)
        for stale in by_age[:surplus]:
            stale.unlink()
    except Exception as exc:
        logger.error(f"Amazon Debug-Verzeichnis Fehler: {exc}")
        return

    # HTML first (most reliable).
    try:
        html_target = DEBUG_DIR / f"{name}_{stamp}.html"
        markup = await page.content()
        html_target.write_text(markup, encoding="utf-8")
        logger.info(f"Amazon Debug-HTML gespeichert: {html_target} ({len(markup)} Bytes)")
    except Exception as exc:
        logger.error(f"Amazon Debug-HTML fehlgeschlagen: {exc}")

    # Then the screenshot.
    try:
        png_target = DEBUG_DIR / f"{name}_{stamp}.png"
        await page.screenshot(path=str(png_target), full_page=True)
        logger.info(f"Amazon Debug-Screenshot gespeichert: {png_target}")
    except Exception as exc:
        logger.error(f"Amazon Debug-Screenshot fehlgeschlagen: {exc}")
|
|
|
|
|
|
async def check_session_valid() -> bool:
    """Report whether an Amazon session is currently considered active.

    True when an interactive page is held, when a login is in progress, or
    when an import run is currently using the page.
    """
    page_held = _interactive_page is not None
    login_running = _login_state.get("status") in ("interactive", "logging_in")
    logger.info(
        f"Amazon Session-Check: has_page={page_held}, "
        f"login_active={login_running}, processing={_processing_active}"
    )
    return page_held or login_running or _processing_active
|
|
|
|
|
|
def is_interactive_login_active() -> bool:
    """Tell whether the interactive login is currently in use by the user.

    That is the case while the status is ``interactive`` or ``logging_in``,
    and also after a successful login for as long as the page is still held.
    """
    current = _login_state.get("status", "idle")
    if current == "logged_in":
        # Logged in, but only "active" while the page is still held.
        return _interactive_page is not None
    return current in ("interactive", "logging_in")
|
|
|
|
|
|
async def clear_session():
    """Discard the browser session: close page and context, wipe the profile.

    Afterwards ``SESSION_DIR`` exists again (empty if removal succeeded) and
    the login state is back to idle.
    """
    global _login_state

    await close_interactive_login(force_close=True)
    await close_browser_context()

    # Remove the stored profile; a failure is only logged.
    if SESSION_DIR.exists():
        import shutil

        try:
            shutil.rmtree(SESSION_DIR)
        except Exception as exc:
            logger.warning(f"Session-Verzeichnis konnte nicht gelöscht werden: {exc}")

    SESSION_DIR.mkdir(parents=True, exist_ok=True)
    _login_state = {"status": "idle", "message": ""}
|
|
|
|
|
|
# --- Interactive Login (user solves CAPTCHAs via screenshot/click/type) ---
|
|
|
|
|
|
async def start_interactive_login():
    """Open the Amazon order history in a page kept alive for user interaction.

    Amazon redirects to its sign-in flow when the session is not
    authenticated. The resulting page is stored in ``_interactive_page`` and
    ``_login_state`` is set to ``logged_in`` or ``interactive`` depending on
    where the navigation ended up. Does nothing when a page is already held;
    reports ``login_failed`` when an import run is in progress or the browser
    cannot be started.
    """
    global _login_state, _interactive_page

    if _interactive_page is not None:
        # Already have an interactive session
        return

    if _process_lock.locked():
        _login_state = {"status": "login_failed", "message": "Amazon-Abruf läuft gerade. Bitte warten bis der Abruf fertig ist."}
        return

    _login_state = {"status": "interactive", "message": "Browser wird gestartet..."}

    # Tracked locally so a failure before the page is published can still
    # close it; otherwise the page would leak inside the shared context.
    page = None
    try:
        settings = await get_settings()
        domain = settings.get("amazon_domain", "amazon.de")
        ctx = await _get_browser_context()
        page = await ctx.new_page()
        # Stealth is applied at context level
        await _add_virtual_authenticator(page)

        # Navigate to order history - Amazon redirects to login if not authenticated
        await page.goto(
            f"https://www.{domain}/gp/css/order-history",
            wait_until="domcontentloaded",
            timeout=60000,
        )
        # Wait a bit for page to settle
        await asyncio.sleep(2)

        _interactive_page = page

        # Check if already logged in (not on a login/auth page)
        url = page.url
        is_login = "signin" in url or "/ap/" in url or "/auth/" in url
        if not is_login and "amazon." in url:
            _login_state = {"status": "logged_in", "message": "Bereits angemeldet"}
        else:
            _login_state = {"status": "interactive", "message": "Bitte im Browser anmelden"}

        logger.info(f"Interaktive Login-Session gestartet, URL: {url}")

    except Exception as e:
        logger.error(f"Interaktive Login-Session fehlgeschlagen: {e}")
        _login_state = {"status": "login_failed", "message": f"Browser konnte nicht gestartet werden: {e}"}
        if page is not None:
            try:
                await page.close()
            except Exception:
                pass
            # Only unpublish the page this call created; a page installed by
            # another caller must stay untouched.
            if _interactive_page is page:
                _interactive_page = None
|
|
|
|
|
|
async def get_browser_screenshot() -> bytes | None:
    """Return a PNG screenshot of the interactive page.

    Returns None when no page is held or the capture fails.
    """
    page = _interactive_page
    if page is None:
        return None

    try:
        png = await page.screenshot(type="png")
    except Exception as exc:
        logger.error(f"Screenshot fehlgeschlagen: {exc}")
        return None
    return png
|
|
|
|
|
|
async def send_browser_click(x: int, y: int):
    """Click at viewport position (x, y) on the interactive page.

    After a short pause the login status is re-evaluated, since a click may
    have completed the login. No-op when no page is held; errors are logged.
    """
    if _interactive_page is None:
        return

    try:
        await _interactive_page.mouse.click(x, y)
        await asyncio.sleep(0.3)
        # The click may have finished the login flow.
        await _check_interactive_login_complete()
    except Exception as exc:
        logger.error(f"Browser-Klick fehlgeschlagen: {exc}")
|
|
|
|
|
|
async def send_browser_type(text: str):
    """Type ``text`` into whatever element has focus on the interactive page.

    Keystrokes are sent with a 50 ms delay each. No-op when no page is held;
    errors are logged.
    """
    if _interactive_page is None:
        return

    try:
        await _interactive_page.keyboard.type(text, delay=50)
        await asyncio.sleep(0.2)
    except Exception as exc:
        logger.error(f"Browser-Texteingabe fehlgeschlagen: {exc}")
|
|
|
|
|
|
async def send_browser_key(key: str):
    """Press a special key (e.g. Enter, Tab, Backspace, Escape) on the page.

    After a short pause the login status is re-evaluated, since e.g. Enter on
    the password field may have completed the login. No-op when no page is
    held; errors are logged.
    """
    if _interactive_page is None:
        return

    try:
        await _interactive_page.keyboard.press(key)
        await asyncio.sleep(0.5)
        # The key press may have finished the login flow.
        await _check_interactive_login_complete()
    except Exception as exc:
        logger.error(f"Browser-Taste fehlgeschlagen: {exc}")
|
|
|
|
|
|
async def _check_interactive_login_complete():
    """Update ``_login_state`` from the interactive page's URL and title.

    Precedence: an error/blocked page first, then a CAPTCHA URL, then a
    non-login Amazon URL (treated as a successful login). When none applies
    the state is left as it is. Any error during the check is ignored.
    """
    global _login_state

    page = _interactive_page
    if page is None:
        return

    error_markers = (
        "tut uns leid", "sorry", "fehler", "error",
        "problem", "bot", "automated", "unusual",
    )

    try:
        url = page.url
        on_login_flow = "signin" in url or "/ap/" in url or "/auth/" in url
        on_captcha = "captcha" in url.lower()

        # The title is only a hint; fall back to an empty one if unavailable.
        try:
            title = await page.title()
        except Exception:
            title = ""

        lowered_title = title.lower()
        title_signals_error = any(marker in lowered_title for marker in error_markers)
        url_signals_block = "errors" in url or "/hz/approvalrequest" in url

        if title_signals_error or url_signals_block:
            _login_state = {"status": "interactive", "message": "Amazon blockiert den Zugriff. Versuchen Sie es erneut oder lösen Sie die Sicherheitsabfrage."}
            logger.warning(f"Interaktiver Login: Error-Seite erkannt. URL: {url}, Titel: {title}")
        elif on_captcha:
            _login_state = {"status": "interactive", "message": "Bitte CAPTCHA lösen"}
        elif not on_login_flow and "amazon." in url:
            _login_state = {"status": "logged_in", "message": "Erfolgreich angemeldet"}
            logger.info(f"Interaktiver Login erfolgreich! URL: {url}")
    except Exception:
        pass
|
|
|
|
|
|
async def close_interactive_login(force_close: bool = False):
    """Dismiss the interactive login and reset the login state to idle.

    By default the page is kept so an import run can reuse it.

    Args:
        force_close: If True, the page itself is closed and dropped as well
            (e.g. on error or explicit logout).
    """
    global _interactive_page, _login_state

    if force_close and _interactive_page is not None:
        try:
            await _interactive_page.close()
        except Exception:
            pass
        _interactive_page = None

    # The state is reset in every case; only the page's fate differs.
    _login_state = {"status": "idle", "message": ""}
    page_fate = 'behalten' if _interactive_page else 'geschlossen'
    logger.info(f"Interactive Login Modal geschlossen (page={page_fate})")
|
|
|
|
|
|
async def start_login():
    """Run the automated credential login, one attempt at a time.

    Returns immediately when another attempt holds the login lock. Progress
    and the outcome are published through ``_login_state``; the temporary
    page is always closed afterwards.
    """
    global _login_state, _otp_future

    if _login_lock.locked():
        return

    async with _login_lock:
        _login_state = {"status": "logging_in", "message": "Browser wird gestartet..."}
        _otp_future = None

        try:
            cfg = await get_settings()
            user = cfg.get("amazon_email", "")
            secret = cfg.get("amazon_password", "")
            domain = cfg.get("amazon_domain", "amazon.de")

            if not (user and secret):
                _login_state = {
                    "status": "login_failed",
                    "message": "Amazon E-Mail oder Passwort nicht konfiguriert",
                }
                return

            context = await _get_browser_context()
            login_page = await context.new_page()
            try:
                await _do_login(login_page, domain, user, secret)
            finally:
                await login_page.close()

        except Exception as exc:
            logger.error(f"Amazon-Login fehlgeschlagen: {exc}")
            _login_state = {
                "status": "login_failed",
                "message": f"Login fehlgeschlagen: {exc}",
            }
|
|
|
|
|
|
async def _login_first_visible(candidates):
    """Return the first match of the first candidate locator that is visible.

    Candidates are tried in order; a locator that raises while being probed
    is skipped. Returns None when no candidate is present and visible.
    """
    for candidate in candidates:
        try:
            if await candidate.count() > 0 and await candidate.first.is_visible():
                return candidate.first
        except Exception:
            continue
    return None


async def _login_captcha_detected(page) -> bool:
    """Tell whether the page shows a CAPTCHA or identity-verification challenge."""
    challenge = page.locator(
        "#auth-captcha-image, #captchacharacters, #cvf-aamation-container, "
        "#captcha-container, #aa-challenge-whole-page-iframe"
    )
    if await challenge.count() > 0:
        return True
    # Compare case-insensitively so a capitalised title is recognised too.
    title = (await page.title()).lower()
    return "captcha" in title or "bestätige deine identität" in title


async def _do_login(page, domain, email, password):
    """Drive Amazon's sign-in flow on ``page`` and publish the outcome.

    Steps: open the order history (which redirects to sign-in when needed),
    enter the e-mail, enter the password, wait for an OTP code supplied via
    ``submit_otp`` if requested, wait for device approval if requested, then
    verify the result. Progress and the final result are written to
    ``_login_state``; debug dumps are saved along the way.

    Args:
        page: Playwright page to drive; the caller owns and closes it.
        domain: Amazon domain, e.g. ``amazon.de``.
        email: Account e-mail address.
        password: Account password.
    """
    global _login_state, _otp_future

    _login_state = {"status": "logging_in", "message": "Navigiere zu Amazon..."}

    # Stealth is applied at context level; only the authenticator is per page.
    await _add_virtual_authenticator(page)

    # Navigate to order history - Amazon will redirect to login if not authenticated
    await page.goto(
        f"https://www.{domain}/gp/css/order-history",
        wait_until="networkidle",
        timeout=60000,
    )

    await _save_debug(page, "login_start")

    # Check if we're already logged in (no redirect to login page)
    url = page.url
    if ("order-history" in url or "your-orders" in url) and "signin" not in url and "/ap/" not in url:
        logger.info("Amazon Login: Bereits eingeloggt!")
        _login_state = {"status": "logged_in", "message": "Bereits angemeldet"}
        return

    await _human_delay()

    # --- Step 1: Enter email ---
    _login_state = {"status": "logging_in", "message": "E-Mail wird eingegeben..."}

    email_field = await _login_first_visible([
        page.locator("#ap_email_login"),
        page.locator("#ap_email"),
        page.locator("input[name='email']"),
        page.locator("input[type='email']"),
        page.get_by_label("Mobiltelefonnummer oder E-Mail-Adresse eingeben"),
        page.get_by_label("E-Mail"),
        page.get_by_label("E-Mail-Adresse"),
        page.get_by_label("Email"),
    ])

    if email_field is None:
        await _save_debug(page, "login_no_email_field")
        _login_state = {"status": "login_failed", "message": "Email-Feld nicht gefunden"}
        return
    logger.info("Amazon Login: Email-Feld gefunden")

    await email_field.fill(email)
    await _human_delay(0.5, 1.5)

    # Click continue button
    continue_btn = await _login_first_visible([
        page.get_by_role("button", name="Weiter"),
        page.get_by_role("button", name="Continue"),
        page.locator("#continue"),
        page.locator("input[type='submit']"),
    ])

    if continue_btn is not None:
        logger.info("Amazon Login: Weiter-Button geklickt")
        await continue_btn.click()
        await page.wait_for_load_state("networkidle")
        await _human_delay()

    await _save_debug(page, "login_after_email")

    # Check for CAPTCHA
    if await _login_captcha_detected(page):
        _login_state = {"status": "login_failed", "message": "CAPTCHA/Sicherheitsabfrage erkannt. Bitte über den interaktiven Browser anmelden."}
        await _save_debug(page, "login_captcha")
        return

    # --- Step 2: Enter password ---
    _login_state = {"status": "logging_in", "message": "Passwort wird eingegeben..."}

    pw_field = await _login_first_visible([
        page.get_by_label("Passwort"),
        page.get_by_label("Password"),
        page.locator("#ap_password"),
        page.locator("input[name='password']"),
        page.locator("input[type='password']"),
    ])

    if pw_field is None:
        # No password prompt: fall through to the verification at the end.
        logger.info("Amazon Login: Kein Passwort-Feld sichtbar, prüfe ob bereits eingeloggt...")
        await _save_debug(page, "login_no_password_field")
    else:
        logger.info("Amazon Login: Passwort-Feld gefunden")
        await pw_field.fill(password)
        await _human_delay(0.5, 1.5)

        # Click sign-in button
        signin_btn = await _login_first_visible([
            page.get_by_role("button", name="Anmelden"),
            page.get_by_role("button", name="Sign in"),
            page.locator("#signInSubmit"),
            page.locator("#auth-signin-button"),
            page.locator("input[type='submit']"),
        ])

        if signin_btn is not None:
            logger.info("Amazon Login: Anmelden-Button geklickt")
            await signin_btn.click()
            await page.wait_for_load_state("networkidle")
            await _human_delay(1.5, 3.0)

    await _save_debug(page, "login_after_password")

    # Check for CAPTCHA again
    if await _login_captcha_detected(page):
        _login_state = {"status": "login_failed", "message": "CAPTCHA/Sicherheitsabfrage erkannt. Bitte über den interaktiven Browser anmelden."}
        await _save_debug(page, "login_captcha")
        return

    # --- Step 3: Handle 2FA/OTP ---
    otp_field = page.locator("#auth-mfa-otpcode, input[name='otpCode'], #ap_dcq_hint")
    if await otp_field.count() > 0:
        _login_state = {
            "status": "awaiting_otp",
            "message": "Bitte geben Sie den Bestätigungscode ein",
        }
        # We are inside a coroutine, so the running loop is the right owner
        # of the future that submit_otp() resolves.
        _otp_future = asyncio.get_running_loop().create_future()
        try:
            otp_code = await asyncio.wait_for(_otp_future, timeout=300)
        except asyncio.TimeoutError:
            _login_state = {"status": "login_failed", "message": "OTP-Zeitüberschreitung (5 Minuten)"}
            return
        finally:
            _otp_future = None

        _login_state = {"status": "logging_in", "message": "OTP wird eingegeben..."}

        for sel in ["#auth-mfa-otpcode", "input[name='otpCode']"]:
            field = page.locator(sel)
            if await field.count() > 0:
                await field.first.fill(otp_code)
                break

        for sel in ["#auth-signin-button", "input[type='submit']", "#submitButton"]:
            btn = page.locator(sel)
            if await btn.count() > 0:
                await btn.first.click()
                break

        await page.wait_for_load_state("networkidle")
        await _human_delay(1.5, 3.0)

    # --- Step 4: Handle device approval ---
    approval = page.locator("#auth-approve-form, .cvf-widget-form-approve")
    if await approval.count() > 0:
        _login_state = {
            "status": "awaiting_otp",
            "message": "Bitte bestätigen Sie die Anmeldung auf Ihrem Gerät",
        }
        # Poll every 2 s, up to 60 times, until the page leaves the sign-in
        # flow or the approval form disappears.
        for _ in range(60):
            await asyncio.sleep(2)
            url = page.url
            if ("signin" not in url and "/ap/" not in url) or domain + "/?ref" in url:
                break
            if await approval.count() == 0:
                break

    # --- Verify login success ---
    url = page.url
    is_login_page = "signin" in url or "/ap/" in url
    page_content = await page.content()
    content_len = len(page_content)
    is_error_page = "Suchen Sie etwas" in page_content or "Seite wurde nicht gefunden" in page_content
    is_order_page = "order-history" in url or "your-orders" in url or "Meine Bestellungen" in page_content
    is_success = not is_login_page and not is_error_page and domain in url and (is_order_page or content_len > 10000)
    logger.info(f"Amazon Login: URL={url}, is_login_page={is_login_page}, is_error_page={is_error_page}, is_order_page={is_order_page}, content_len={content_len}, success={is_success}")
    await _save_debug(page, "login_result")

    if is_success:
        _login_state = {"status": "logged_in", "message": "Erfolgreich angemeldet"}
        logger.info("Amazon-Login erfolgreich")
    else:
        error_el = page.locator("#auth-error-message-box, .a-alert-content")
        error_msg = ""
        if await error_el.count() > 0:
            error_msg = await error_el.first.inner_text()
        if is_error_page:
            error_msg = "Amazon hat den Zugriff blockiert (Fehlerseite). Bitte später erneut versuchen."
        _login_state = {
            "status": "login_failed",
            "message": f"Login fehlgeschlagen. {error_msg}".strip(),
        }
|
|
|
|
|
|
async def submit_otp(code: str) -> bool:
    """Hand an OTP code from the web UI to the waiting login flow.

    Returns True when a login was waiting for the code, otherwise False.
    """
    pending = _otp_future
    if pending is None or pending.done():
        return False
    pending.set_result(code)
    return True
|
|
|
|
|
|
async def process_amazon() -> dict:
    """Entry point: fetch Amazon invoices and forward them by e-mail.

    Refuses to start while another run holds the process lock or while the
    user is in the middle of an interactive login; otherwise runs the import
    under the process lock and returns its result dict.
    """
    if _process_lock.locked():
        logger.info("Amazon-Import: Läuft bereits, überspringe")
        return {"processed": 0, "errors": 0, "error": "Amazon-Abruf läuft bereits"}

    # Starting now would take the browser away from the user mid-login.
    if is_interactive_login_active():
        logger.info("Amazon-Import: Interaktiver Login läuft, überspringe")
        return {"processed": 0, "errors": 0, "error": "Bitte zuerst den Login abschließen"}

    async with _process_lock:
        outcome = await _process_amazon_inner()
    return outcome
|
|
|
|
|
|
async def _process_amazon_inner() -> dict:
    """Run one import: take the interactive page, process orders, hand it back.

    Must be called with ``_process_lock`` held. The interactive login page is
    borrowed for the duration of the run and always returned afterwards so
    the session stays available for the next run.

    Returns:
        Dict with ``processed`` and ``errors`` counts, ``skipped`` for
        completed runs, and ``error`` with a message when the run was refused
        or failed.
    """
    # _login_state is reassigned below, so it must be declared global here;
    # without the declaration the reset would only create a local variable.
    global _interactive_page, _processing_active, _login_state

    settings = await get_settings()

    if settings.get("amazon_enabled") != "true":
        return {"processed": 0, "errors": 0}

    # Check prerequisites
    if not settings.get("smtp_server") or not settings.get("import_email"):
        logger.warning("Amazon-Import: SMTP oder Import-Email nicht konfiguriert")
        return {"processed": 0, "errors": 0, "error": "SMTP/Import-Email nicht konfiguriert"}

    if not settings.get("amazon_email") or not settings.get("amazon_password"):
        logger.warning("Amazon-Import: Zugangsdaten nicht konfiguriert")
        return {"processed": 0, "errors": 0, "error": "Amazon-Zugangsdaten nicht konfiguriert"}

    # Without interactive login page, new pages can't authenticate (session bound to page)
    if _interactive_page is None:
        logger.info("Amazon-Import: Keine aktive Login-Session, überspringe (bitte zuerst manuell anmelden)")
        return {"processed": 0, "errors": 0, "error": "Bitte zuerst unter Plattformen bei Amazon anmelden"}

    domain = settings.get("amazon_domain", "amazon.de")
    since_str = settings.get("amazon_since_date", "")
    # Default window: the last 30 days (also used for an unparsable date).
    since_date = datetime.now() - timedelta(days=30)
    if since_str:
        try:
            since_date = datetime.strptime(since_str, "%Y-%m-%d")
        except ValueError:
            logger.warning(f"Amazon: Ungültiges Startdatum: {since_str}")

    logger.info(f"Amazon-Import gestartet: domain={domain}, seit={since_date.strftime('%Y-%m-%d')}")

    processed = 0
    skipped = 0
    errors = 0

    # Take ownership of the interactive page. It is guaranteed to exist here:
    # the guard above returned otherwise and nothing has awaited since.
    page = _interactive_page
    _interactive_page = None
    _processing_active = True  # Signal that session is still valid while processing
    _login_state = {"status": "idle", "message": ""}  # Reset login state
    logger.info("Amazon: Verwende interaktive Login-Page für Abruf")

    smtp_conn = None

    try:
        logger.info("Amazon: SMTP-Verbindung wird hergestellt...")
        smtp_conn = _connect_smtp(settings)
        logger.info("Amazon: SMTP-Verbindung OK, verarbeite Bestellungen seitenweise...")
        import_email = settings.get("import_email_eingang") or settings.get("import_email", "")

        # Process orders PAGE BY PAGE (collect + process on same page so buttons are visible)
        result = await _collect_and_process_orders(
            page, domain, since_date, smtp_conn, settings, import_email
        )
        if result is None:
            error_detail = "Amazon-Sitzung abgelaufen. Bitte manuell unter Plattformen neu anmelden."
            logger.warning(f"Amazon-Import: {error_detail}")
            await add_log_entry(
                email_subject="Amazon-Import",
                email_from="Amazon",
                attachments_count=0,
                status="error",
                error_message=error_detail,
            )
            return {"processed": 0, "errors": 0, "error": error_detail}

        processed, skipped, errors = result["processed"], result["skipped"], result["errors"]

        # Update last sync date
        await save_settings({"amazon_last_sync": datetime.now().strftime("%Y-%m-%d %H:%M")})

        # Log summary if nothing was processed
        if processed == 0 and errors == 0:
            if skipped > 0:
                summary = f"Alle Rechnungen bereits importiert ({skipped} übersprungen)"
            else:
                summary = "Keine neuen Rechnungen gefunden"
            await add_log_entry(
                email_subject="Amazon-Import (Zusammenfassung)",
                email_from=f"Amazon ({domain})",
                attachments_count=0,
                status="success",
                error_message=summary,
                sent_to="",
            )

    except Exception as e:
        logger.error(f"Amazon-Import Fehler: {e}")
        await add_log_entry(
            email_subject="Amazon-Import",
            email_from=f"Amazon ({domain})",
            attachments_count=0,
            status="error",
            error_message=str(e),
        )
        return {"processed": processed, "skipped": skipped, "errors": errors + 1, "error": str(e)}

    finally:
        _processing_active = False
        # Keep page alive for next run instead of closing it (preserves session)
        _interactive_page = page
        logger.info("Amazon: Page zurück in Session-Pool (Session bleibt erhalten)")
        if smtp_conn:
            try:
                smtp_conn.quit()
            except Exception:
                pass

    logger.info(f"Amazon-Import fertig: {processed} verarbeitet, {skipped} übersprungen, {errors} Fehler")
    return {"processed": processed, "skipped": skipped, "errors": errors}
|
|
|
|
|
|
async def _collect_and_process_orders(page, domain, since_date, smtp_conn, settings, import_email) -> dict | None:
    """Walk the order history page by page and forward each order's invoices.

    Orders are processed on the page where they are listed, before moving on
    to the next page, so their invoice buttons are visible when needed.

    An order is recorded as downloaded only when every one of its invoices
    was forwarded successfully; otherwise it is left unmarked so the next run
    retries it.

    Args:
        page: Playwright page holding the authenticated session.
        domain: Amazon domain, e.g. ``amazon.de``.
        since_date: Earliest order date of interest.
        smtp_conn: Open SMTP connection used for forwarding.
        settings: Application settings mapping.
        import_email: Recipient address for forwarded invoices.

    Returns:
        None if the session is invalid, otherwise a dict with
        ``processed``/``skipped``/``errors`` counts.
    """
    processed = 0
    skipped = 0
    errors = 0

    # Navigate to orders page if needed
    actual_url = page.url
    if "order-history" not in actual_url and "your-orders" not in actual_url:
        if "signin" in actual_url or "/ap/" in actual_url:
            return None
        logger.info("Amazon: Nicht auf Bestellseite, versuche Navigation über Link...")
        orders_link = page.locator("a[href*='order-history'], a[href*='your-orders']")
        if await orders_link.count() > 0:
            await orders_link.first.click()
            await asyncio.sleep(3)
            try:
                await page.wait_for_load_state("networkidle", timeout=15000)
            except Exception:
                pass
        actual_url = page.url
        if "order-history" not in actual_url and "your-orders" not in actual_url:
            return None

    # Try to set time filter: pick the option that covers since_date.
    days_back = (datetime.now() - since_date).days
    if days_back <= 30:
        desired_filter = "last30"
    elif days_back <= 90:
        desired_filter = "months-3"
    else:
        desired_filter = f"year-{since_date.year}"

    logger.info(f"Amazon: Setze Zeitfilter: {desired_filter}")
    try:
        filter_dropdown = page.locator("select[name='orderFilter'], select#orderFilter, select#time-filter")
        if await filter_dropdown.count() > 0:
            await filter_dropdown.first.select_option(desired_filter)
            await asyncio.sleep(3)
            try:
                await page.wait_for_load_state("networkidle", timeout=15000)
            except Exception:
                pass
        else:
            logger.info("Amazon: Kein Filter-Dropdown gefunden, verwende aktuelle Ansicht")
    except Exception as e:
        logger.warning(f"Amazon: Filter setzen fehlgeschlagen: {e}")

    await asyncio.sleep(2)

    seen_ids = set()
    page_num = 1
    total_orders = 0

    while True:
        logger.info(f"Amazon: Verarbeite Seite {page_num}...")

        # Check for login redirect
        if "signin" in page.url or "/ap/" in page.url:
            if total_orders > 0:
                logger.warning(f"Amazon: Login-Redirect auf Seite {page_num}, breche ab")
                break
            return None

        # Extract orders from current page
        page_orders = await _extract_orders_from_page(page, since_date)
        # Record each id as it is accepted so repeats are dropped both across
        # pages and within the same page.
        new_orders = []
        for o in page_orders:
            if o["id"] not in seen_ids:
                seen_ids.add(o["id"])
                new_orders.append(o)
        logger.info(f"Amazon: Seite {page_num}: {len(page_orders)} gefunden, {len(new_orders)} neu")
        total_orders += len(new_orders)

        # Process invoices for THIS page's orders immediately (buttons are visible now)
        for order in new_orders:
            order_id = order.get("id", "?")
            try:
                if await is_invoice_downloaded(order_id, order_id):
                    skipped += 1
                    logger.debug(f"Amazon: Bestellung {order_id} bereits importiert")
                    continue

                pdf_list = await _download_order_invoices(page, domain, order_id)
                if not pdf_list:
                    logger.debug(f"Amazon: Keine Rechnung für Bestellung {order_id}")
                    continue

                failed_invoices = 0
                for inv_idx, pdf_bytes in enumerate(pdf_list):
                    # Number the files only when an order has several invoices.
                    suffix = f"_{inv_idx+1}" if len(pdf_list) > 1 else ""
                    try:
                        filename = f"Amazon_Rechnung_{order_id}{suffix}.pdf"

                        if settings.get("debug_save_amazon_pdfs") == "true":
                            try:
                                tmp_dir = Path(os.environ.get("UPLOAD_DIR", "/data/uploads")) / "amazon_invoices"
                                tmp_dir.mkdir(parents=True, exist_ok=True)
                                (tmp_dir / filename).write_bytes(pdf_bytes)
                                logger.info(f"Amazon: Debug-PDF gespeichert: {tmp_dir / filename} ({len(pdf_bytes)} Bytes)")
                            except Exception as e:
                                logger.warning(f"Amazon: Debug-PDF speichern fehlgeschlagen: {e}")

                        forward_msg = _build_forward_email(
                            from_addr=settings.get("smtp_username", ""),
                            to_addr=import_email,
                            original_subject=f"Amazon Rechnung - Bestellung {order_id}{suffix}",
                            original_from=f"Amazon ({domain})",
                            attachments=[(filename, pdf_bytes)],
                        )
                        smtp_log = _send_with_log(smtp_conn, forward_msg)
                        processed += 1
                        logger.info(f"Amazon: Rechnung {inv_idx+1}/{len(pdf_list)} für {order_id} gesendet")
                        await add_log_entry(
                            email_subject=f"Amazon Rechnung - {order_id}{suffix}",
                            email_from=f"Amazon ({domain})",
                            attachments_count=1,
                            status="success",
                            sent_to=import_email,
                            smtp_log=smtp_log,
                        )
                    except Exception as e:
                        errors += 1
                        failed_invoices += 1
                        logger.error(f"Amazon: Fehler bei Rechnung {inv_idx+1} für {order_id}: {e}")
                        await add_log_entry(
                            email_subject=f"Amazon Rechnung - {order_id}{suffix}",
                            email_from=f"Amazon ({domain})",
                            attachments_count=0,
                            status="error",
                            error_message=str(e),
                        )

                if failed_invoices == 0:
                    await mark_invoice_downloaded(order_id, order_id)
                else:
                    # Leave the order unmarked: marking it would make every
                    # later run skip it and the failed invoice would be lost.
                    logger.warning(
                        f"Amazon: Bestellung {order_id} nicht als importiert markiert "
                        f"({failed_invoices} Rechnung(en) fehlgeschlagen), erneuter Versuch beim nächsten Abruf"
                    )
                await _human_delay(2.0, 4.0)

            except Exception as e:
                errors += 1
                logger.error(f"Amazon: Fehler bei Bestellung {order_id}: {e}")
                await add_log_entry(
                    email_subject=f"Amazon Rechnung - {order_id}",
                    email_from=f"Amazon ({domain})",
                    attachments_count=0,
                    status="error",
                    error_message=str(e),
                )

        # Navigate to next page: first scroll the link into view, then click.
        has_next = await page.evaluate("""() => {
            const nextLink = document.querySelector('.a-pagination .a-last:not(.a-disabled) a');
            if (nextLink) {
                nextLink.scrollIntoView({behavior: 'smooth', block: 'center'});
                return true;
            }
            return false;
        }""")

        # Stop when there is no next link or the current page had no orders.
        if has_next and page_orders:
            logger.info("Amazon: Klicke auf nächste Seite (JS)...")
            await asyncio.sleep(0.5)
            await page.evaluate("""() => {
                const nextLink = document.querySelector('.a-pagination .a-last:not(.a-disabled) a');
                if (nextLink) nextLink.click();
            }""")
            await asyncio.sleep(3)
            try:
                await page.wait_for_load_state("networkidle", timeout=30000)
            except Exception:
                pass
            page_num += 1
            await _human_delay(1.0, 2.0)
        else:
            break

    logger.info(f"Amazon: Gesamt {total_orders} Bestellungen auf {page_num} Seite(n)")
    return {"processed": processed, "skipped": skipped, "errors": errors}
|
|
|
|
|
|
def _order_filter_values(since_date: datetime, now: datetime) -> list[str]:
    """Return the order-history time-filter values needed to cover since_date..now.

    Up to 30 days back a single "last30" filter is used, up to 90 days a single
    "months-3" filter. Beyond that one "year-YYYY" value is returned per calendar
    year between ``since_date`` and ``now`` (newest first), because a single year
    filter would miss orders from the other years when the range crosses New Year.
    """
    days_back = (now - since_date).days
    if days_back <= 30:
        return ["last30"]
    if days_back <= 90:
        return ["months-3"]
    return [f"year-{year}" for year in range(now.year, since_date.year - 1, -1)]


async def _select_order_filter(page, filter_value: str) -> bool:
    """Select ``filter_value`` in the order-history time dropdown.

    Best effort: returns True when the dropdown was found and the option was
    selected, False when there is no dropdown or selecting failed.
    """
    logger.info(f"Amazon: Setze Zeitfilter: {filter_value}")
    try:
        filter_dropdown = page.locator("select[name='orderFilter'], select#orderFilter, select#time-filter")
        if await filter_dropdown.count() == 0:
            logger.info("Amazon: Kein Filter-Dropdown gefunden, verwende aktuelle Ansicht")
            return False
        await filter_dropdown.first.select_option(filter_value)
        await asyncio.sleep(3)
        try:
            await page.wait_for_load_state("networkidle", timeout=15000)
        except Exception:
            # Not reaching "networkidle" is tolerated; the page is parsed anyway.
            pass
        logger.info(f"Amazon: Zeitfilter '{filter_value}' gesetzt")
        return True
    except Exception as e:
        logger.warning(f"Amazon: Filter setzen fehlgeschlagen: {e}")
        return False


async def _click_next_orders_page(page) -> bool:
    """Click the pagination "next" link via JavaScript.

    Returns False when there is no enabled "next" link, True after the click
    was issued and the page was given time to load.
    """
    has_next = await page.evaluate("""() => {
        const nextLink = document.querySelector('.a-pagination .a-last:not(.a-disabled) a');
        if (nextLink) {
            nextLink.scrollIntoView({behavior: 'smooth', block: 'center'});
            return true;
        }
        return false;
    }""")
    if not has_next:
        return False

    logger.info("Amazon: Klicke auf nächste Seite (JS)...")
    await asyncio.sleep(0.5)  # Wait for scroll
    # Use JavaScript click to bypass Playwright visibility checks
    await page.evaluate("""() => {
        const nextLink = document.querySelector('.a-pagination .a-last:not(.a-disabled) a');
        if (nextLink) nextLink.click();
    }""")
    await asyncio.sleep(3)
    try:
        await page.wait_for_load_state("networkidle", timeout=30000)
    except Exception:
        # Not reaching "networkidle" is tolerated; the next page is parsed anyway.
        pass
    return True


async def _collect_orders(page, domain: str, since_date: datetime) -> list[dict] | None:
    """Collect order IDs from Amazon order history using SPA navigation (no page.goto).

    The page must already be on the Amazon orders page (from interactive login).
    Uses dropdown/click navigation to avoid losing the session.

    Args:
        page: Browser page object that is already logged in.
        domain: Amazon domain; not used by this function.
        since_date: Orders dated before this are left out (filtering happens in
            ``_extract_orders_from_page``).

    Returns:
        A list of ``{"id": ..., "date": ...}`` dicts (possibly empty), or None
        when the page is a login page / the orders page could not be reached
        and no orders had been collected yet.
    """
    orders: list[dict] = []

    actual_url = page.url
    logger.info(f"Amazon: Aktuelle Seite: {actual_url}")

    # Check if we're on the orders page or need to navigate there
    if "order-history" not in actual_url and "your-orders" not in actual_url:
        if "signin" in actual_url or "/ap/" in actual_url:
            logger.error("Amazon: Seite ist Login-Seite - Session ungültig!")
            await _save_debug(page, "orders_not_on_orders_page")
            return None
        # Try clicking the orders link within Amazon's SPA
        logger.info("Amazon: Nicht auf Bestellseite, versuche Navigation über Link...")
        orders_link = page.locator("a[href*='order-history'], a[href*='your-orders'], a:has-text('Bestellungen'), a:has-text('Meine Bestellungen')")
        if await orders_link.count() > 0:
            await orders_link.first.click()
            await asyncio.sleep(3)
            try:
                await page.wait_for_load_state("networkidle", timeout=15000)
            except Exception:
                pass
        actual_url = page.url
        if "order-history" not in actual_url and "your-orders" not in actual_url:
            logger.error(f"Amazon: Konnte nicht zur Bestellseite navigieren. URL: {actual_url}")
            await _save_debug(page, "orders_navigation_failed")
            return None

    seen_ids: set[str] = set()

    for filter_index, filter_value in enumerate(_order_filter_values(since_date, datetime.now())):
        applied = await _select_order_filter(page, filter_value)
        if filter_index > 0 and not applied:
            # Without a working filter the view would not change, so scanning
            # again would only re-read the pages already processed.
            logger.warning(f"Amazon: Zeitfilter '{filter_value}' nicht anwendbar, breche Filter-Durchlauf ab")
            break

        # Wait for content to load
        await asyncio.sleep(2)

        page_num = 1
        while True:
            logger.info(f"Amazon: Verarbeite Seite {page_num}...")

            # Check for login redirect
            if "signin" in page.url or "/ap/" in page.url:
                if orders:
                    logger.warning(f"Amazon: Login-Redirect auf Seite {page_num}, verwende {len(orders)} bereits gefundene Bestellung(en)")
                    return orders
                logger.error("Amazon: Session ungültig!")
                await _save_debug(page, "orders_redirect_login")
                return None

            page_orders = await _extract_orders_from_page(page, since_date)
            new_orders = [o for o in page_orders if o["id"] not in seen_ids]
            seen_ids.update(o["id"] for o in new_orders)
            logger.info(f"Amazon: Seite {page_num}: {len(page_orders)} gefunden, {len(new_orders)} neu")
            orders.extend(new_orders)

            # Stop when the page yielded nothing new: either everything left is
            # older than since_date, or the "next" click did not advance the
            # page (which would otherwise loop forever on the same content).
            if not new_orders:
                break
            if not await _click_next_orders_page(page):
                break
            page_num += 1
            await _human_delay(1.0, 2.0)

    if not orders:
        logger.warning("Amazon: Keine Bestellungen gefunden!")
        await _save_debug(page, "no_orders_found")

    return orders
|
|
|
|
|
|
async def _extract_orders_from_page(page, since_date: datetime) -> list[dict]:
    """Extract order data from the current page.

    Visible order cards are read via JavaScript; for each card the first
    Amazon-style order ID (``NNN-NNNNNNN-NNNNNNN``) and up to 500 characters of
    card text are returned, and the order date is parsed from that text. If no
    card is found, all order IDs in the visible page text are used instead
    (without dates, so without date filtering).

    Args:
        page: Browser page object currently showing an order list.
        since_date: Orders whose parsed date lies on an earlier calendar day
            are skipped. Orders without a parseable date are kept.

    Returns:
        List of ``{"id": str, "date": datetime | None}`` dicts in page order.
        IDs starting with ``000-`` are ignored.
    """
    orders: list[dict] = []

    title = await page.title()
    logger.info(f"Amazon: Seite analysieren: Titel='{title}', URL={page.url}")

    await _save_debug(page, "order_page")

    # Use JavaScript to extract only VISIBLE order cards (Amazon loads all in DOM, shows ~10 per page).
    # Raw string: the embedded JS regex contains "\d", which is an invalid
    # escape sequence in a normal Python string literal.
    visible_orders = await page.evaluate(r"""() => {
        const results = [];
        // Try multiple selectors
        const selectors = [
            '.order-card.js-order-card',
            '.order-card',
            '.order-info',
            '.a-box-group.order',
            '.order',
        ];
        const seen = new Set();
        for (const sel of selectors) {
            for (const el of document.querySelectorAll(sel)) {
                // Only process visible elements (offsetParent !== null or check display)
                if (el.offsetParent === null && getComputedStyle(el).position !== 'fixed') continue;
                const text = el.innerText || '';
                const idMatch = text.match(/(\d{3}-\d{7}-\d{7})/);
                if (idMatch && !seen.has(idMatch[1])) {
                    seen.add(idMatch[1]);
                    results.push({id: idMatch[1], text: text.substring(0, 500)});
                }
            }
            if (results.length > 0) break;
        }
        return results;
    }""")

    logger.info(f"Amazon: Sichtbare Order-Cards gefunden: {len(visible_orders)}")

    if not visible_orders:
        # Last resort: regex fallback on visible page text
        visible_text = await page.evaluate("() => document.body.innerText")
        order_ids = re.findall(r"\b(\d{3}-\d{7}-\d{7})\b", visible_text or "")
        # dict.fromkeys de-duplicates while keeping the order of appearance,
        # so orders are processed in a deterministic (page) order.
        unique_ids = [oid for oid in dict.fromkeys(order_ids) if not oid.startswith("000-")]
        logger.info(f"Amazon: Keine Order-Cards, Fallback-Regex: {len(unique_ids)} Bestell-ID(s) im sichtbaren Text")
        if not unique_ids:
            logger.warning(f"Amazon: Seite hat keine Bestell-IDs. Titel: '{title}', URL: {page.url}")
        orders.extend({"id": oid, "date": None} for oid in unique_ids)
        return orders

    # Parsed order dates have no time of day (midnight), while since_date may
    # carry one; comparing calendar dates keeps orders placed on the cut-off
    # day itself instead of dropping them.
    cutoff_day = since_date.date()

    for vo in visible_orders:
        order_id = vo["id"]
        if order_id.startswith("000-"):
            continue

        order_date = _parse_german_date(vo["text"])
        if order_date and order_date.date() < cutoff_day:
            logger.debug(f"Amazon: Bestellung {order_id} übersprungen (Datum {order_date.strftime('%Y-%m-%d')} < {since_date.strftime('%Y-%m-%d')})")
            continue

        logger.debug(f"Amazon: Bestellung gefunden: {order_id}, Datum: {order_date}")
        orders.append({"id": order_id, "date": order_date})

    return orders
|
|
|
|
|
|
def _parse_german_date(text: str) -> datetime | None:
|
|
"""Parse German date formats from order text."""
|
|
months_de = {
|
|
"Januar": 1, "Februar": 2, "März": 3, "April": 4,
|
|
"Mai": 5, "Juni": 6, "Juli": 7, "August": 8,
|
|
"September": 9, "Oktober": 10, "November": 11, "Dezember": 12,
|
|
}
|
|
pattern = r"(\d{1,2})\.\s*(" + "|".join(months_de.keys()) + r")\s+(\d{4})"
|
|
match = re.search(pattern, text)
|
|
if match:
|
|
day = int(match.group(1))
|
|
month = months_de[match.group(2)]
|
|
year = int(match.group(3))
|
|
try:
|
|
return datetime(year, month, day)
|
|
except ValueError:
|
|
pass
|
|
|
|
match = re.search(r"(\d{2})\.(\d{2})\.(\d{4})", text)
|
|
if match:
|
|
try:
|
|
return datetime(int(match.group(3)), int(match.group(2)), int(match.group(1)))
|
|
except ValueError:
|
|
pass
|
|
|
|
return None
|
|
|
|
|
|
async def _close_all_popovers(page):
    """Close all open Amazon popovers reliably.

    IMPORTANT: Do NOT set display:none - Amazon recycles popover containers,
    so hiding them prevents future popovers from appearing.

    Three things are tried inside the page, in this order: clicking the
    popovers' close buttons, calling ``A.popover.close(id)`` through Amazon's
    ``P.when('A')`` loader when it is present, and finally clicking the
    document body. The call is best effort: any error is logged at debug level
    and never raised to the caller.
    """
    try:
        await page.evaluate("""() => {
            // Close via close buttons
            document.querySelectorAll('.a-popover-footer button, .a-popover .a-button-close, .a-popover-close').forEach(b => {
                try { b.click(); } catch(e) {}
            });
            // Use Amazon's own popover API to close if available
            if (window.P && window.P.when) {
                try {
                    window.P.when('A').execute(function(A) {
                        if (A && A.popover) {
                            document.querySelectorAll('.a-popover:not(.a-popover-hidden)').forEach(p => {
                                const id = p.getAttribute('data-a-popover-id');
                                if (id) try { A.popover.close(id); } catch(e) {}
                            });
                        }
                    });
                } catch(e) {}
            }
            // Click outside to dismiss any remaining popovers
            document.body.click();
        }""")
        # Give the page a moment to process the close actions.
        await asyncio.sleep(0.5)
    except Exception as e:
        # Deliberately non-fatal, but leave a trace instead of failing silently.
        logger.debug(f"Amazon: Popover schließen fehlgeschlagen: {e}")
|
|
|
|
|
|
# In-page script: locate the invoice popover URL for the order ID passed as
# argument, load the popover HTML via XMLHttpRequest and return the PDF links
# found in it. The order ID is passed as an argument (not interpolated into the
# script text), so no value ever becomes part of the JavaScript source.
_INVOICE_LINKS_JS = """async (orderId) => {
    // Find the order card containing this order ID
    const cards = document.querySelectorAll('.order-card, .order-info, .a-box-group, div');
    let popoverUrl = null;

    for (const card of cards) {
        if (!card.innerText || !card.innerText.includes(orderId)) continue;

        // Find the popover trigger with invoice URL
        const triggers = card.querySelectorAll('[data-a-popover*="invoice"]');
        for (const trigger of triggers) {
            try {
                const config = JSON.parse(trigger.getAttribute('data-a-popover'));
                if (config && config.url && config.url.includes(orderId)) {
                    popoverUrl = config.url;
                    break;
                }
            } catch(e) {}
        }
        if (popoverUrl) break;
    }

    if (!popoverUrl) {
        return { found: false, error: 'Kein Popover-URL gefunden' };
    }

    // Step 2: Make XMLHttpRequest with proper Amazon headers
    try {
        const response = await new Promise((resolve, reject) => {
            const xhr = new XMLHttpRequest();
            xhr.open('GET', popoverUrl, true);
            xhr.setRequestHeader('X-Requested-With', 'XMLHttpRequest');
            xhr.setRequestHeader('Accept', 'text/html,*/*');
            xhr.onload = function() {
                resolve({ status: xhr.status, html: xhr.responseText });
            };
            xhr.onerror = function() {
                reject(new Error('XHR failed'));
            };
            xhr.send();
        });

        if (response.status !== 200) {
            return { found: false, error: 'HTTP ' + response.status, url: popoverUrl };
        }

        const html = response.html;

        // Check if response is a login page
        if (html.includes('ap_signin') || html.includes('ap_error') || html.includes('/ap/')) {
            return { found: false, error: 'Login-Seite erhalten', url: popoverUrl, isLogin: true };
        }

        // Extract PDF download links from the response HTML
        const parser = new DOMParser();
        const doc = parser.parseFromString(html, 'text/html');
        const links = doc.querySelectorAll('a[href]');
        const pdfLinks = [];

        for (const link of links) {
            const href = link.getAttribute('href') || '';
            const text = (link.innerText || '').trim();
            if (href.includes('/ap/') || href.includes('openid')) continue;
            if (href.includes('contact.html') || href.includes('help/contact')) continue;
            if (text.toLowerCase().includes('anfordern')) continue;
            if (
                href.includes('.pdf') ||
                href.includes('documents/download') ||
                href.includes('/document/') ||
                href.includes('invoice/download') ||
                href.includes('generated_invoices')
            ) {
                pdfLinks.push({ href: href, text: text.substring(0, 100) });
            }
        }

        return { found: true, url: popoverUrl, links: pdfLinks, htmlSize: html.length };

    } catch(e) {
        return { found: false, error: e.message, url: popoverUrl };
    }
}"""

# In-page script: download the URL passed as argument via XMLHttpRequest and
# return its bytes base64-encoded together with status, size and content type.
_INVOICE_PDF_FETCH_JS = """async (url) => {
    try {
        const resp = await new Promise((resolve, reject) => {
            const xhr = new XMLHttpRequest();
            xhr.open('GET', url, true);
            xhr.responseType = 'arraybuffer';
            xhr.onload = function() {
                const bytes = new Uint8Array(xhr.response);
                let binary = '';
                for (let i = 0; i < bytes.length; i++) {
                    binary += String.fromCharCode(bytes[i]);
                }
                resolve({
                    ok: xhr.status === 200,
                    status: xhr.status,
                    data: btoa(binary),
                    size: bytes.length,
                    contentType: xhr.getResponseHeader('content-type') || ''
                });
            };
            xhr.onerror = function() { reject(new Error('XHR failed')); };
            xhr.send();
        });
        return resp;
    } catch(e) {
        return { ok: false, error: e.message };
    }
}"""


async def _download_order_invoices(page, domain: str, order_id: str) -> list[bytes]:
    """Download invoice PDFs for an order.

    Strategy: extract the popover AJAX URL from the ``data-a-popover``
    attribute of the order's card, load the popover HTML with an in-page
    XMLHttpRequest (``X-Requested-With`` and ``Accept`` headers set), collect
    the PDF links from it and download each one in-page as base64.

    Args:
        page: Browser page object showing the order list that contains the order.
        domain: Amazon domain; not used by this function.
        order_id: Amazon order ID whose invoices should be downloaded.

    Returns:
        The downloaded PDF files as bytes, in link order. Empty when no popover
        URL or link was found, when a login page was returned instead, or when
        no download produced a PDF. Download errors are logged, not raised.
    """
    import base64
    from urllib.parse import urlparse

    pdfs: list[bytes] = []

    logger.info(f"Amazon: Hole Rechnungs-Links für {order_id}")

    # Step 1: Extract the popover AJAX URL and download links via XMLHttpRequest
    invoice_result = await page.evaluate(_INVOICE_LINKS_JS, order_id)

    logger.info(f"Amazon: Invoice-Ergebnis für {order_id}: found={invoice_result.get('found')}, "
                f"links={invoice_result.get('links', [])}, error={invoice_result.get('error', '')}")

    if not invoice_result.get("found") or not invoice_result.get("links"):
        if invoice_result.get("isLogin"):
            logger.warning(f"Amazon: Session abgelaufen beim Rechnungsabruf für {order_id}")
        return pdfs

    # Step 3: Download each PDF via XMLHttpRequest as base64
    fetched_hrefs: set[str] = set()
    for link_info in invoice_result["links"]:
        href = link_info["href"]
        text = link_info.get("text", "")

        # Reduce the link to a path on the current origin: absolute URLs lose
        # their scheme/host, bare relative links are anchored at the root.
        if href.startswith("/"):
            fetch_href = href
        elif href.startswith("http"):
            parsed = urlparse(href)
            fetch_href = parsed.path + ("?" + parsed.query if parsed.query else "")
        else:
            fetch_href = "/" + href

        # The same document can be linked more than once in the popover;
        # download (and later forward) it only once.
        if fetch_href in fetched_hrefs:
            continue
        fetched_hrefs.add(fetch_href)

        logger.info(f"Amazon: Lade PDF '{text}' -> {fetch_href[:100]}")

        try:
            # The URL comes from scraped HTML, so it is handed over as an
            # argument rather than embedded in the script text.
            pdf_result = await page.evaluate(_INVOICE_PDF_FETCH_JS, fetch_href)

            if pdf_result and pdf_result.get("ok") and pdf_result.get("size", 0) > 500:
                pdf_bytes = base64.b64decode(pdf_result["data"])
                content_type = pdf_result.get("contentType", "")
                # Accept the download when it starts with the PDF magic bytes
                # or the server labelled it as a PDF.
                if pdf_bytes[:5] == b"%PDF-" or "pdf" in content_type.lower():
                    logger.info(f"Amazon: PDF heruntergeladen für {order_id}: {len(pdf_bytes)} Bytes")
                    pdfs.append(pdf_bytes)
                else:
                    logger.debug(f"Amazon: Download kein PDF für {order_id} (type: {content_type}, size: {len(pdf_bytes)})")
            elif pdf_result:
                logger.debug(f"Amazon: PDF-Download fehlgeschlagen für {order_id}: {pdf_result.get('error', 'status=' + str(pdf_result.get('status')))}")

        except Exception as e:
            logger.warning(f"Amazon: PDF-Download Exception für {order_id}: {e}")

    if not pdfs:
        logger.info(f"Amazon: Keine PDFs für {order_id}")

    return pdfs
|