proxmox-cluster-network-cha.../main.py

#!/usr/bin/env python3
"""
Proxmox Cluster Network Changer

Migriert ein Proxmox-Cluster (inkl. Ceph) von einem Netzwerk in ein anderes.
Behandelt Corosync, Ceph, /etc/network/interfaces und /etc/hosts.

Kann auch mit gebrochenem Quorum umgehen (z.B. wenn ein Node bereits
manuell geändert wurde).

Muss als root auf einem Proxmox-Node ausgeführt werden.

Verwendung:
    python3 main.py              # Interaktiver Modus
    python3 main.py --dry-run    # Nur anzeigen, nichts ändern
    python3 main.py --discover   # Nur Discovery, keine Migration
"""

import argparse
import os
import sys

from ssh_manager import SSHManager
from discovery import Discovery
from planner import Planner
from backup import Backup
from migrator import Migrator
from verifier import Verifier
from rescue import RescueNetwork


def check_prerequisites():
    """Verify root privileges and sanity-check that this host is a Proxmox node.

    Exits with status 1 when the effective user ID is not 0. When neither
    ``/etc/pve`` nor ``/etc/corosync`` exists, a warning is printed and the
    user is asked whether to continue; any reply other than j/ja/y/yes exits
    with status 0.
    """
    running_as_root = os.geteuid() == 0
    if not running_as_root:
        print("FEHLER: Dieses Tool muss als root ausgeführt werden!")
        print("Bitte mit 'sudo python3 main.py' starten.")
        sys.exit(1)

    # One of the two directories is enough to accept the host without asking.
    if os.path.exists("/etc/pve") or os.path.exists("/etc/corosync"):
        return

    print("WARNUNG: Dies scheint kein Proxmox-Node zu sein.")
    print("         /etc/pve und /etc/corosync nicht gefunden.")
    reply = input("Trotzdem fortfahren? [j/N]: ").strip().lower()
    if reply in ('j', 'ja', 'y', 'yes'):
        return
    sys.exit(0)


def _build_arg_parser():
    """Create the argument parser holding every command-line option."""
    parser = argparse.ArgumentParser(
        description="Proxmox Cluster Network Changer - "
                    "Migriert Cluster + Ceph in ein neues Netzwerk"
    )
    # Table-driven registration; the order determines the --help output.
    options = (
        ("--dry-run", {
            "action": "store_true",
            "help": "Nur anzeigen was geändert würde, nichts ändern",
        }),
        ("--discover", {
            "action": "store_true",
            "help": "Nur Discovery durchführen, keine Migration",
        }),
        ("--ssh-key", {
            "type": str,
            "default": None,
            "help": "Pfad zum SSH-Key (Standard: Default SSH-Key)",
        }),
        ("--ssh-port", {
            "type": int,
            "default": 22,
            "help": "SSH-Port (Standard: 22)",
        }),
        ("--rescue", {
            "action": "store_true",
            "help": "Rescue-Modus: Emergency-Netzwerk einrichten wenn Nodes "
                    "sich nicht erreichen können",
        }),
        ("--rescue-commands", {
            "type": str,
            "metavar": "SUBNET",
            "help": "Nur Rescue-Befehle ausgeben ohne Migration "
                    "(z.B. --rescue-commands 10.99.99.0/24)",
        }),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser


def _print_banner(*lines):
    """Print the given lines framed by two 60-character '=' rules."""
    rule = "=" * 60
    print(rule)
    for line in lines:
        print(line)
    print(rule)


def _release_rescue(rescue, nodes):
    """Call ``rescue.cleanup(nodes)`` if the rescue network is marked active."""
    if rescue.active:
        rescue.cleanup(nodes)


def _show_rescue_commands(ssh, rescue, subnet):
    """Handle ``--rescue-commands``: print the rescue commands, then exit.

    Reads the corosync configuration, asks for the bridge (falling back to
    ``rescue.bridge`` on empty input) and prints, for every entry returned by
    ``rescue.get_rescue_commands``, its setup command followed by a list of
    the removal commands. Exits with status 1 if the corosync configuration
    could not be read, otherwise with status 0.
    """
    discovery = Discovery(ssh)
    print("\n[Corosync]")
    corosync = discovery.discover_corosync()
    if not corosync:
        print("\nFEHLER: Konnte Cluster-Konfiguration nicht lesen.")
        sys.exit(1)

    bridge = input(f"Bridge [{rescue.bridge}]: ").strip() or rescue.bridge
    commands = rescue.get_rescue_commands(corosync, subnet, bridge)

    print()
    _print_banner(
        "  RESCUE BEFEHLE",
        f"  Subnetz: {subnet} | Bridge: {bridge}",
    )
    print()
    for entry in commands:
        print(f"  {entry['name']} ({entry['current_ip']}):")
        print(f"    {entry['command']}")
        print()
    print("  Zum Entfernen:")
    for entry in commands:
        print(f"    {entry['remove_command']}  # {entry['name']}")
    print()
    sys.exit(0)


def main():
    """Command-line entry point driving the migration workflow.

    Sequence: parse arguments, check prerequisites, then either print rescue
    commands (``--rescue-commands``) or run discovery, optional rescue-network
    setup, planning, backup, migration and verification. With ``--dry-run``
    the backup and verification phases are skipped. Whenever the rescue
    network is active it is cleaned up before the function exits or, on the
    success path, before verification.
    """
    args = _build_arg_parser().parse_args()

    _print_banner("  Proxmox Cluster Network Changer")

    check_prerequisites()

    # Shared SSH manager, used by every later phase.
    ssh = SSHManager(ssh_key=args.ssh_key, ssh_port=args.ssh_port)
    rescue = RescueNetwork(ssh)

    # Quick mode: only print the rescue commands; this call never returns.
    if args.rescue_commands:
        _show_rescue_commands(ssh, rescue, args.rescue_commands)

    # Phase 1: Discovery
    discovery = Discovery(ssh)
    corosync, ceph, nodes, has_quorum = discovery.run()
    if not corosync:
        print("\nFEHLER: Konnte Cluster-Konfiguration nicht lesen. Abbruch.")
        sys.exit(1)

    # Offer rescue mode when remote nodes are unreachable and it was not
    # requested explicitly on the command line.
    offline = [
        node for node in nodes
        if not (node.is_reachable or node.is_local)
    ]
    want_rescue = args.rescue
    if offline and not want_rescue:
        print(f"\n  {len(offline)} Node(s) nicht erreichbar.")
        reply = input("  Rescue-Netzwerk einrichten? [J/n]: ").strip().lower()
        # Default is "yes": only an explicit no declines.
        want_rescue = reply not in ('n', 'nein', 'no')

    if want_rescue:
        rescue_nodes = rescue.setup_interactive(corosync)
        if not rescue_nodes:
            sys.exit(1)
        # Repeat discovery via the rescue IPs so configs of all nodes are read.
        print("\n  [Rescue] Lese Konfigurationen über Rescue-Netzwerk...")
        nodes = discovery.discover_nodes_with_overrides(corosync, rescue_nodes)
        has_quorum = discovery.check_quorum()
        ceph = discovery.discover_ceph()

    if args.discover:
        _release_rescue(rescue, nodes)
        print("\n--- Discovery abgeschlossen (--discover Modus) ---")
        sys.exit(0)

    # Phase 2: Planning
    planner = Planner()
    plan = planner.plan(nodes, corosync, ceph, has_quorum)
    if not plan:
        _release_rescue(rescue, nodes)
        sys.exit(0)

    plan.dry_run = args.dry_run

    # Generate all new config files
    configs = planner.generate_new_configs(plan)

    # Phase 3: Backup (skipped in dry-run)
    if args.dry_run:
        print("\n=== Phase 3: Backup (übersprungen im Dry-Run) ===")
    elif not Backup(ssh).run(plan):
        print("\nBackup fehlgeschlagen! Trotzdem fortfahren?")
        reply = input("[j/N]: ").strip().lower()
        if reply not in ('j', 'ja', 'y', 'yes'):
            _release_rescue(rescue, nodes)
            sys.exit(1)

    # Phase 4: Migration
    migration_ok = Migrator(ssh).run(plan, configs, dry_run=args.dry_run)
    if not migration_ok:
        print("\n[!] Migration hatte Fehler!")
        if not args.dry_run:
            print("    Prüfe Backups in /root/network-migration-backup-*/")
        _release_rescue(rescue, nodes)
        sys.exit(1)

    # Drop the rescue network before verification so that verification
    # checks real connectivity; in dry-run it is simply torn down here.
    _release_rescue(rescue, nodes)

    # Phase 5: Verification (skipped in dry-run)
    if args.dry_run:
        print("\n=== Phase 5: Verifikation (übersprungen im Dry-Run) ===")
        print("\nDry-Run abgeschlossen. Keine Änderungen vorgenommen.")
    else:
        Verifier(ssh).run(plan)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()