commit bb7c1d5c3f403ee72d16eba4c761ecaaafe66854 Author: Stefan Hacker Date: Wed Mar 4 21:55:49 2026 +0100 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..262601d --- /dev/null +++ b/README.md @@ -0,0 +1,245 @@ +# Proxmox Cluster Network Changer + +Migriert ein komplettes Proxmox-Cluster (inkl. Ceph) von einem Netzwerk in ein anderes. + +**Problem:** Wenn man bei einem Proxmox-Cluster die IPs ändert, verliert man das Quorum und `/etc/pve` wird read-only — dann kann man weder Corosync noch Ceph über das Cluster-Dateisystem konfigurieren. Dieses Tool löst das Problem durch eine koordinierte Migration aller Nodes. + +## Features + +- Automatische Erkennung aller Nodes, IPs und Konfigurationen +- Koordinierte Migration aller Nodes in einem Durchgang +- Ceph-Support (Public Network, Cluster Network, MON-Adressen) +- Funktioniert auch bei **gebrochenem Quorum** (z.B. wenn ein Node bereits manuell geändert wurde) +- Automatische Backups aller Konfigurationen vor der Migration +- Dry-Run-Modus zum gefahrlosen Testen +- Verifikation nach der Migration + +## Voraussetzungen + +- Python 3.9+ (auf Proxmox standardmäßig vorhanden) +- Root-Zugriff auf dem Node, auf dem das Tool läuft +- SSH-Zugriff (Key-basiert) zu allen anderen Cluster-Nodes +- Keine externen Python-Pakete nötig (nur stdlib) + +## Installation + +```bash +# Auf einen Proxmox-Node kopieren +scp -r proxmox-cluster-network-changer/ root@pve1:/root/ + +# Oder direkt klonen +cd /root +git clone proxmox-cluster-network-changer +``` + +## Verwendung + +### Aktuelle Konfiguration anzeigen (Discovery) + +```bash +python3 main.py --discover +``` + +Zeigt an: +- Alle Cluster-Nodes mit IPs +- Corosync-Konfiguration +- Ceph-Netzwerke und MON-Hosts +- Quorum-Status +- Welche Nodes erreichbar sind + +### Dry-Run (nichts wird geändert) + +```bash +python3 main.py --dry-run +``` + +Durchläuft den kompletten Prozess, zeigt alle geplanten Änderungen an, schreibt aber nichts. 
+ +### Migration durchführen + +```bash +python3 main.py +``` + +Das Tool führt interaktiv durch den Prozess: + +``` +=== Phase 1: Discovery === + +[Corosync] + Cluster: mycluster + Nodes gefunden: 4 + - pve1 (ID: 1) -> 192.168.0.101 + - pve2 (ID: 2) -> 192.168.0.102 + - pve3 (ID: 3) -> 192.168.0.103 + - pve4 (ID: 4) -> 192.168.0.104 + +[Ceph] + Public Network: 192.168.0.0/24 + Cluster Network: 192.168.0.0/24 + +=== Phase 2: Migration planen === + +Neues Netzwerk (z.B. 172.0.2.0/16): 172.0.2.0/16 +Neues Gateway [172.0.0.1]: 172.0.2.1 + +[IP-Mapping] + pve1: 192.168.0.101 -> [172.0.2.101]: + pve2: 192.168.0.102 -> [172.0.2.102]: + pve3: 192.168.0.103 -> [172.0.2.103]: + pve4: 192.168.0.104 -> [172.0.2.104]: + +Migration durchführen? [j/N]: j +``` + +### Optionen + +| Option | Beschreibung | +|---|---| +| `--dry-run` | Nur anzeigen, nichts ändern | +| `--discover` | Nur aktuelle Config anzeigen | +| `--rescue` | Rescue-Modus: Emergency-Netzwerk einrichten | +| `--rescue-commands SUBNET` | Nur Rescue-Befehle ausgeben (z.B. `10.99.99.0/24`) | +| `--ssh-key PFAD` | Pfad zum SSH-Key (Standard: Default-Key) | +| `--ssh-port PORT` | SSH-Port (Standard: 22) | + +## Was wird geändert? + +| Datei | Wo | Was | +|---|---|---| +| `/etc/network/interfaces` | Jeder Node | Bridge-IP, Gateway | +| `/etc/hosts` | Jeder Node | Hostname-zu-IP-Zuordnung | +| `/etc/corosync/corosync.conf` | Jeder Node | Corosync Ring-Adressen | +| `/etc/pve/ceph.conf` | Cluster-FS | public_network, cluster_network, MON-Adressen | + +## Migrationsablauf (Phase 4) + +1. Neue Konfigurationen werden auf alle Nodes verteilt (Staging) +2. Corosync wird auf allen Nodes gestoppt +3. pve-cluster (pmxcfs) wird gestoppt +4. Corosync-Config wird direkt geschrieben (`/etc/corosync/corosync.conf`) +5. `/etc/hosts` wird aktualisiert +6. `/etc/network/interfaces` wird aktualisiert + Netzwerk-Reload (`ifreload -a`) +7. 
Services werden gestartet, Quorum abgewartet, Ceph aktualisiert + +## Rescue-Netzwerk (Emergency Mode) + +**Szenario:** PVE01 hat bereits eine neue IP, PVE02-04 sind noch im alten Netz. Kein Node kann die anderen erreichen. + +### Schnell: Nur Befehle anzeigen + +```bash +python3 main.py --rescue-commands 10.99.99.0/24 +``` + +Ausgabe: +``` + RESCUE BEFEHLE + Subnetz: 10.99.99.0/24 | Bridge: vmbr0 + + pve1 (192.168.0.101): + ip addr add 10.99.99.1/24 dev vmbr0 + + pve2 (192.168.0.102): + ip addr add 10.99.99.2/24 dev vmbr0 + + pve3 (192.168.0.103): + ip addr add 10.99.99.3/24 dev vmbr0 + + pve4 (192.168.0.104): + ip addr add 10.99.99.4/24 dev vmbr0 + + Zum Entfernen: + ip addr del 10.99.99.1/24 dev vmbr0 # pve1 + ip addr del 10.99.99.2/24 dev vmbr0 # pve2 + ip addr del 10.99.99.3/24 dev vmbr0 # pve3 + ip addr del 10.99.99.4/24 dev vmbr0 # pve4 +``` + +Diese Befehle über IPMI/iLO/iDRAC/KVM-Konsole auf jedem Node ausführen. + +### Interaktiv: Rescue + Migration + +```bash +python3 main.py --rescue +``` + +oder einfach starten — wenn Nodes nicht erreichbar sind, wird automatisch gefragt: + +```bash +python3 main.py +``` + +``` + 3 Node(s) nicht erreichbar. + Rescue-Netzwerk einrichten? [J/n]: j +``` + +Ablauf: +1. Du gibst ein freies Subnetz an (z.B. `10.99.99.0/24`) +2. Das Tool zeigt für jeden Node den `ip addr add` Befehl +3. Auf dem lokalen Node wird die IP automatisch gesetzt +4. Du führst die Befehle auf den anderen Nodes per Konsole aus +5. Das Tool testet die Verbindung und liest die Configs +6. Danach läuft die normale Migration +7. Am Ende werden die Emergency-IPs automatisch entfernt + +### Wann brauche ich das? 
+ +- Ein oder mehrere Nodes haben bereits manuell eine neue IP bekommen +- Die Nodes liegen in verschiedenen Subnetzen +- SSH zwischen den Nodes funktioniert nicht mehr +- Du hast aber noch Zugriff auf die Konsolen (IPMI/iLO/iDRAC/KVM) + +## Gebrochenes Quorum + +Wenn bereits ein Node manuell geändert wurde und das Quorum verloren ist: + +- Das Tool erkennt den Zustand automatisch in der Discovery-Phase +- Nicht erreichbare Nodes werden per Hostname gesucht +- Configs werden direkt geschrieben (nicht über `/etc/pve/`) +- Nach dem Netzwerk-Reload wird `pvecm expected 1` genutzt, um Quorum zu erzwingen +- Danach wird Ceph über das Cluster-Dateisystem aktualisiert + +## Backups + +Vor der Migration werden automatisch Backups erstellt: + +``` +/root/network-migration-backup-20260304_143022/ +├── etc_network_interfaces +├── etc_hosts +├── etc_corosync_corosync.conf +├── etc_ceph_ceph.conf +├── etc_pve_corosync.conf +└── etc_pve_ceph.conf +``` + +### Restore (manuell) + +```bash +# Beispiel: Netzwerk-Config wiederherstellen +cp /root/network-migration-backup-*/etc_network_interfaces /etc/network/interfaces +ifreload -a + +# Corosync wiederherstellen +cp /root/network-migration-backup-*/etc_corosync_corosync.conf /etc/corosync/corosync.conf +systemctl restart corosync +``` + +## Empfohlene Reihenfolge bei Problemen + +1. `pvecm status` — Cluster-Status prüfen +2. `pvecm expected 1` — Quorum erzwingen (Notfall) +3. `ceph -s` — Ceph-Status prüfen +4. `ceph -w` — Ceph-Recovery beobachten +5. `journalctl -u corosync` — Corosync-Logs prüfen +6. 
class Backup:
    """Creates backups of all config files on each node.

    Per-node files (BACKUP_FILES) are copied on every reachable node into a
    timestamped directory under /root; cluster-filesystem files
    (CLUSTER_BACKUP_FILES) are copied only from the local node, since
    /etc/pve is shared across the whole cluster.
    """

    def __init__(self, ssh: SSHManager):
        self.ssh = ssh

    def run(self, plan: MigrationPlan) -> bool:
        """Create backups on all reachable nodes.

        Returns True if all backups succeeded. Missing source files are
        tolerated (reported as "nicht vorhanden"); a failure to create the
        backup directory itself marks the run as failed.
        """
        print("\n=== Phase 3: Backup ===\n")

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_dir = f"/root/network-migration-backup-{timestamp}"
        all_ok = True

        for node in plan.nodes:
            if not node.is_reachable:
                print(f" [{node.name}] Übersprungen (nicht erreichbar)")
                continue

            print(f" [{node.name}] Erstelle Backup in {backup_dir}/")

            # Create backup directory
            rc, _, err = self.ssh.run_on_node(
                node.ssh_host, f"mkdir -p {backup_dir}", node.is_local
            )
            if rc != 0:
                print(f" [!] Fehler beim Erstellen des Backup-Verzeichnisses: {err}")
                all_ok = False
                continue

            # Backup per-node files
            for filepath in BACKUP_FILES:
                # e.g. /etc/network/interfaces -> etc_network_interfaces
                filename = filepath.replace("/", "_").lstrip("_")
                # BUG FIX: the copy destination previously contained the
                # stray literal "(unknown)" instead of {filename}, so every
                # backup copy failed with a shell error.
                rc, _, _ = self.ssh.run_on_node(
                    node.ssh_host,
                    f"cp {filepath} {backup_dir}/{filename} 2>/dev/null",
                    node.is_local,
                )
                if rc == 0:
                    print(f" OK: {filepath}")
                else:
                    print(f" --: {filepath} (nicht vorhanden)")

            # Backup cluster files (only from local node since they're shared)
            if node.is_local:
                for filepath in CLUSTER_BACKUP_FILES:
                    filename = filepath.replace("/", "_").lstrip("_")
                    # Same fix as above: use the computed filename as the
                    # destination inside the backup directory.
                    rc, _, _ = self.ssh.run_on_node(
                        node.ssh_host,
                        f"cp {filepath} {backup_dir}/{filename} 2>/dev/null",
                        node.is_local,
                    )
                    if rc == 0:
                        print(f" OK: {filepath} (cluster)")
                    else:
                        print(f" --: {filepath} (nicht vorhanden)")

        if all_ok:
            print(f"\n Backup erfolgreich in {backup_dir}/")
        else:
            print("\n [!] Einige Backups sind fehlgeschlagen!")

        return all_ok
def generate_corosync_conf(config: CorosyncConfig, ip_mapping: dict[str, str]) -> str:
    """Generate new corosync.conf content with updated IP addresses.

    Args:
        config: Parsed corosync config carrying the original raw_content.
        ip_mapping: old_ip -> new_ip for every node address to rewrite.

    Returns:
        The rewritten config text with config_version incremented
        (corosync only accepts a replacement config with a higher version).
    """
    new_content = config.raw_content

    if ip_mapping:
        # Replace all IPs in a single regex pass instead of chained
        # str.replace calls: sequential replaces can corrupt the result when
        # one old IP is a prefix of another (e.g. x.x.x.1 vs x.x.x.101), and
        # the [\d.] guards keep partial-IP substrings from being rewritten.
        ip_pattern = re.compile(
            r'(?<![\d.])(?:'
            + '|'.join(re.escape(ip) for ip in sorted(ip_mapping, key=len, reverse=True))
            + r')(?![\d.])'
        )
        new_content = ip_pattern.sub(lambda m: ip_mapping[m.group(0)], new_content)

    # Increment config_version
    m = re.search(r'config_version:\s*(\d+)', new_content)
    if m:
        old_version = int(m.group(1))
        new_content = new_content.replace(
            f'config_version: {old_version}',
            f'config_version: {old_version + 1}'
        )

    return new_content
def generate_ceph_conf(config: CephConfig, ip_mapping: dict[str, str],
                       new_public_network: str, new_cluster_network: str) -> str:
    """Generate new ceph.conf content with updated networks and MON IPs.

    Args:
        config: Parsed ceph config carrying the original raw_content.
        ip_mapping: old_ip -> new_ip for all node/MON addresses.
        new_public_network: New CIDR value for public_network.
        new_cluster_network: New CIDR value for cluster_network.
    """
    new_content = config.raw_content

    # Rewrite the network keys on their own lines rather than blindly
    # replacing the first occurrence of the subnet string: public_network and
    # cluster_network frequently hold the *same* subnet (see README example),
    # in which case two sequential replace(..., 1) calls silently depend on
    # the order the lines happen to appear in.
    if config.public_network:
        new_content = re.sub(
            r'(public[ _]network\s*=\s*)\S+',
            lambda m: m.group(1) + new_public_network,
            new_content, count=1,
        )
    if config.cluster_network:
        new_content = re.sub(
            r'(cluster[ _]network\s*=\s*)\S+',
            lambda m: m.group(1) + new_cluster_network,
            new_content, count=1,
        )

    # Replace all node/MON IPs in one bounded pass so that mapped IPs which
    # are prefixes of each other (e.g. x.x.x.1 vs x.x.x.101) cannot corrupt
    # one another, and partial-IP substrings are never rewritten.
    if ip_mapping:
        ip_pattern = re.compile(
            r'(?<![\d.])(?:'
            + '|'.join(re.escape(ip) for ip in sorted(ip_mapping, key=len, reverse=True))
            + r')(?![\d.])'
        )
        new_content = ip_pattern.sub(lambda m: ip_mapping[m.group(0)], new_content)

    return new_content
NetworkInterface: + """Build a NetworkInterface from parsed lines.""" + raw = '\n'.join(lines) + address = "" + netmask = "" + cidr = 0 + gateway = None + bridge_ports = None + + for line in lines: + stripped = line.strip() + # address with CIDR notation: address 192.168.0.1/24 + m = re.match(r'address\s+(\d+\.\d+\.\d+\.\d+)/(\d+)', stripped) + if m: + address = m.group(1) + cidr = int(m.group(2)) + netmask = cidr_to_netmask(cidr) + continue + # address without CIDR + m = re.match(r'address\s+(\d+\.\d+\.\d+\.\d+)', stripped) + if m: + address = m.group(1) + continue + m = re.match(r'netmask\s+(\S+)', stripped) + if m: + netmask = m.group(1) + cidr = netmask_to_cidr(netmask) + continue + m = re.match(r'gateway\s+(\S+)', stripped) + if m: + gateway = m.group(1) + continue + m = re.match(r'bridge[_-]ports\s+(\S+)', stripped) + if m: + bridge_ports = m.group(1) + continue + + return NetworkInterface( + name=name, + address=address, + netmask=netmask, + cidr=cidr, + gateway=gateway, + bridge_ports=bridge_ports, + raw_config=raw, + ) + + +def generate_network_interfaces(content: str, old_ip: str, new_ip: str, + new_cidr: int, new_gateway: str | None = None, + old_gateway: str | None = None) -> str: + """Update /etc/network/interfaces with new IP, keeping everything else.""" + new_content = content + + # Replace IP in address lines (with and without CIDR) + # address 192.168.0.101/24 -> address 172.0.2.101/16 + new_content = re.sub( + rf'(address\s+){re.escape(old_ip)}/\d+', + rf'\g<1>{new_ip}/{new_cidr}', + new_content + ) + # address 192.168.0.101 (without CIDR) + new_content = re.sub( + rf'(address\s+){re.escape(old_ip)}(\s)', + rf'\g<1>{new_ip}\2', + new_content + ) + + # Replace gateway if provided + if new_gateway and old_gateway: + new_content = new_content.replace( + f'gateway {old_gateway}', + f'gateway {new_gateway}' + ) + + return new_content + + +def generate_hosts(content: str, ip_mapping: dict[str, str]) -> str: + """Update /etc/hosts with new IPs.""" + 
def cidr_to_netmask(cidr: int) -> str:
    """Convert a CIDR prefix length (e.g. 24) to a dotted netmask (e.g. 255.255.255.0)."""
    # Build the 32-bit mask, then peel it into four octets, high byte first.
    mask = (0xFFFFFFFF << (32 - cidr)) & 0xFFFFFFFF
    octets = [(mask >> shift) & 0xFF for shift in (24, 16, 8, 0)]
    return ".".join(str(octet) for octet in octets)
    def discover_ceph(self) -> CephConfig | None:
        """Read and parse ceph.conf.

        Prefers the cluster-filesystem copy (/etc/pve/ceph.conf) and falls
        back to the node-local /etc/ceph/ceph.conf. Returns None when
        neither file is readable (Ceph is probably not installed).
        """
        ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf")
        if not ok:
            # Fallback: the node-local copy outside the (possibly read-only)
            # cluster filesystem.
            ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf")
            if not ok:
                print(" [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)")
                return None

        config = parse_ceph_conf(content)
        # Print a summary so the operator can sanity-check what was discovered.
        print(f" FSID: {config.fsid}")
        print(f" Public Network: {config.public_network}")
        print(f" Cluster Network: {config.cluster_network}")
        if config.mon_hosts:
            print(f" MON Hosts: {', '.join(config.mon_hosts)}")
        if config.mon_sections:
            print(f" MON Sections: {', '.join(config.mon_sections.keys())}")
        return config
    def check_quorum(self) -> bool:
        """Check if the cluster currently has quorum.

        Uses `pvecm status` as a first gate, then verifies that /etc/pve is
        actually writable (pmxcfs turns the cluster filesystem read-only
        without quorum). The write probe is the authoritative check.
        """
        rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null")
        if rc != 0:
            print(" [!] pvecm status fehlgeschlagen - kein Quorum oder kein Cluster")
            return False

        # NOTE(review): with `or`, this gate passes whenever "Activity blocked"
        # is absent from the output, even when the status says "Quorate: No".
        # The touch probe below is what actually decides the result, so the
        # end behavior is correct, but confirm `and` was not intended here.
        if "Quorate: Yes" in stdout or "Activity blocked" not in stdout:
            # Also check if /etc/pve is writable
            rc2, _, _ = self.ssh.execute_local(
                "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test"
            )
            if rc2 == 0:
                print(" Quorum: JA (/etc/pve ist beschreibbar)")
                return True

        print(" Quorum: NEIN (/etc/pve ist read-only!)")
        return False
def check_prerequisites():
    """Check that we're running as root on a Proxmox node.

    Exits with status 1 when not running as root. When the host does not
    look like a Proxmox node (neither /etc/pve nor /etc/corosync exists),
    asks the operator and exits 0 unless they explicitly confirm.
    """
    # Root is required: the tool rewrites /etc/network/interfaces,
    # /etc/hosts and corosync configs, and restarts system services.
    if os.geteuid() != 0:
        print("FEHLER: Dieses Tool muss als root ausgeführt werden!")
        print("Bitte mit 'sudo python3 main.py' starten.")
        sys.exit(1)

    # Heuristic Proxmox detection: both directories missing strongly
    # suggests this is the wrong machine.
    if not os.path.exists("/etc/pve") and not os.path.exists("/etc/corosync"):
        print("WARNUNG: Dies scheint kein Proxmox-Node zu sein.")
        print(" /etc/pve und /etc/corosync nicht gefunden.")
        answer = input("Trotzdem fortfahren? [j/N]: ").strip().lower()
        if answer not in ('j', 'ja', 'y', 'yes'):
            sys.exit(0)
{cmd_info['command']}") + print() + print(" Zum Entfernen:") + for cmd_info in commands: + print(f" {cmd_info['remove_command']} # {cmd_info['name']}") + print() + sys.exit(0) + + # Phase 1: Discovery + discovery = Discovery(ssh) + corosync, ceph, nodes, has_quorum = discovery.run() + + if not corosync: + print("\nFEHLER: Konnte Cluster-Konfiguration nicht lesen. Abbruch.") + sys.exit(1) + + # Check if rescue mode is needed + unreachable = [n for n in nodes if not n.is_reachable and not n.is_local] + use_rescue = args.rescue + + if unreachable and not use_rescue: + print(f"\n {len(unreachable)} Node(s) nicht erreichbar.") + answer = input(" Rescue-Netzwerk einrichten? [J/n]: ").strip().lower() + if answer not in ('n', 'nein', 'no'): + use_rescue = True + + if use_rescue: + rescue_nodes = rescue.setup_interactive(corosync) + if not rescue_nodes: + sys.exit(1) + # Re-run discovery with rescue IPs to read configs from all nodes + print("\n [Rescue] Lese Konfigurationen über Rescue-Netzwerk...") + nodes = discovery.discover_nodes_with_overrides( + corosync, rescue_nodes + ) + # Re-check quorum + has_quorum = discovery.check_quorum() + # Re-read ceph + ceph = discovery.discover_ceph() + + if args.discover: + if rescue.active: + rescue.cleanup(nodes) + print("\n--- Discovery abgeschlossen (--discover Modus) ---") + sys.exit(0) + + # Phase 2: Planning + planner = Planner() + plan = planner.plan(nodes, corosync, ceph, has_quorum) + + if not plan: + if rescue.active: + rescue.cleanup(nodes) + sys.exit(0) + + plan.dry_run = args.dry_run + + # Generate all new config files + configs = planner.generate_new_configs(plan) + + # Phase 3: Backup (skip in dry-run) + if not args.dry_run: + backup = Backup(ssh) + if not backup.run(plan): + print("\nBackup fehlgeschlagen! 
    def run(self, plan: MigrationPlan, configs: dict, dry_run: bool = False) -> bool:
        """Execute the migration in seven ordered steps.

        Order matters: configs are staged first, then corosync and pmxcfs
        are stopped (releasing /etc/pve), configs are written directly,
        the network is reloaded, and finally services are restarted.

        Args:
            plan: The migration plan
            configs: Generated configs from Planner.generate_new_configs()
            dry_run: If True, only show what would be done

        Returns:
            True when all steps completed (or were simulated) without error;
            False aborts the sequence at the first failing step.
        """
        print("\n=== Phase 4: Migration ===\n")

        if dry_run:
            print(" *** DRY RUN - Es werden keine Änderungen vorgenommen ***\n")

        # old_ip -> new_ip for every node that receives a new address.
        # NOTE(review): ip_mapping is not referenced again in this method —
        # the step helpers take plan/configs directly; confirm whether this
        # local can be removed.
        ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip}
        reachable_nodes = [n for n in plan.nodes if n.is_reachable]

        if not reachable_nodes:
            print(" FEHLER: Keine Nodes erreichbar!")
            return False

        # Step 1: Write new configs to all nodes (but don't activate yet)
        print("[1/7] Neue Konfigurationen verteilen...")
        if not self._distribute_configs(plan, configs, dry_run):
            return False

        # Step 2: Stop Corosync on all nodes
        print("\n[2/7] Corosync stoppen auf allen Nodes...")
        if not self._stop_corosync(reachable_nodes, dry_run):
            return False

        # Step 3: Stop pve-cluster (pmxcfs) to release corosync.conf
        print("\n[3/7] pve-cluster stoppen...")
        if not self._stop_pve_cluster(reachable_nodes, dry_run):
            return False

        # Step 4: Write corosync config directly
        print("\n[4/7] Corosync-Konfiguration aktualisieren...")
        if not self._update_corosync(reachable_nodes, configs, dry_run):
            return False

        # Step 5: Update /etc/hosts on all nodes
        print("\n[5/7] /etc/hosts aktualisieren...")
        if not self._update_hosts(plan, configs, dry_run):
            return False

        # Step 6: Update network interfaces and restart networking
        print("\n[6/7] Netzwerk-Interfaces aktualisieren und Netzwerk neu starten...")
        if not self._update_network(plan, configs, dry_run):
            return False

        # Step 7: Start services back up
        print("\n[7/7] Services starten...")
        if not self._start_services(plan, configs, dry_run):
            return False

        return True
node.name not in configs['nodes']: + continue + + node_configs = configs['nodes'][node.name] + staging_dir = "/root/.network-migration-staged" + + if dry_run: + print(f" [{node.name}] Würde Configs nach {staging_dir}/ schreiben") + continue + + # Create staging directory + self.ssh.run_on_node( + node.ssh_host, f"mkdir -p {staging_dir}", node.is_local + ) + + # Stage network interfaces + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/interfaces", + node_configs['interfaces'], + node.is_local, + ) + if ok: + print(f" [{node.name}] interfaces staged") + else: + print(f" [{node.name}] FEHLER interfaces: {msg}") + return False + + # Stage hosts + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/hosts", + node_configs['hosts'], + node.is_local, + ) + if ok: + print(f" [{node.name}] hosts staged") + else: + print(f" [{node.name}] FEHLER hosts: {msg}") + return False + + # Stage corosync config + if configs['corosync']: + for node in plan.nodes: + if not node.is_reachable: + continue + staging_dir = "/root/.network-migration-staged" + if dry_run: + print(f" [{node.name}] Würde corosync.conf stagen") + continue + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/corosync.conf", + configs['corosync'], + node.is_local, + ) + if ok: + print(f" [{node.name}] corosync.conf staged") + else: + print(f" [{node.name}] FEHLER corosync.conf: {msg}") + return False + + # Stage ceph config + if configs['ceph']: + for node in plan.nodes: + if not node.is_reachable: + continue + staging_dir = "/root/.network-migration-staged" + if dry_run: + print(f" [{node.name}] Würde ceph.conf stagen") + continue + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/ceph.conf", + configs['ceph'], + node.is_local, + ) + if ok: + print(f" [{node.name}] ceph.conf staged") + else: + print(f" [{node.name}] FEHLER ceph.conf: {msg}") + return False + + return True + + def _stop_corosync(self, nodes: list, dry_run: bool) -> 
bool: + """Stop corosync on all nodes.""" + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde corosync stoppen") + continue + rc, _, err = self.ssh.run_on_node( + node.ssh_host, "systemctl stop corosync", node.is_local + ) + if rc == 0: + print(f" [{node.name}] corosync gestoppt") + else: + print(f" [{node.name}] WARNUNG beim Stoppen: {err}") + return True + + def _stop_pve_cluster(self, nodes: list, dry_run: bool) -> bool: + """Stop pve-cluster service to unmount /etc/pve.""" + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde pve-cluster stoppen") + continue + rc, _, err = self.ssh.run_on_node( + node.ssh_host, "systemctl stop pve-cluster", node.is_local + ) + if rc == 0: + print(f" [{node.name}] pve-cluster gestoppt") + else: + print(f" [{node.name}] WARNUNG: {err}") + return True + + def _update_corosync(self, nodes: list, configs: dict, + dry_run: bool) -> bool: + """Write new corosync.conf directly to /etc/corosync/.""" + if not configs['corosync']: + print(" Keine Corosync-Änderungen") + return True + + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde /etc/corosync/corosync.conf schreiben") + continue + + staging = "/root/.network-migration-staged/corosync.conf" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/corosync/corosync.conf", + node.is_local, + ) + if rc == 0: + print(f" [{node.name}] corosync.conf aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + return True + + def _update_hosts(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Update /etc/hosts on all nodes.""" + for node in plan.nodes: + if not node.is_reachable or node.name not in configs['nodes']: + continue + + if dry_run: + print(f" [{node.name}] Würde /etc/hosts aktualisieren") + continue + + staging = "/root/.network-migration-staged/hosts" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/hosts", + node.is_local, + ) + if rc == 0: + 
print(f" [{node.name}] /etc/hosts aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + return True + + def _update_network(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Update /etc/network/interfaces and restart networking.""" + for node in plan.nodes: + if not node.is_reachable or node.name not in configs['nodes']: + continue + + if dry_run: + print(f" [{node.name}] Würde /etc/network/interfaces aktualisieren") + print(f" [{node.name}] Würde 'ifreload -a' ausführen") + continue + + staging = "/root/.network-migration-staged/interfaces" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/network/interfaces", + node.is_local, + ) + if rc == 0: + print(f" [{node.name}] /etc/network/interfaces aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + # Reload network - use ifreload if available, otherwise ifdown/ifup + rc, _, _ = self.ssh.run_on_node( + node.ssh_host, "which ifreload", node.is_local + ) + if rc == 0: + reload_cmd = "ifreload -a" + else: + reload_cmd = f"ifdown {plan.bridge_name} && ifup {plan.bridge_name}" + + print(f" [{node.name}] Netzwerk wird neu geladen ({reload_cmd})...") + rc, _, err = self.ssh.run_on_node( + node.ssh_host, reload_cmd, node.is_local, timeout=60 + ) + if rc == 0: + print(f" [{node.name}] Netzwerk neu geladen") + else: + print(f" [{node.name}] WARNUNG beim Netzwerk-Reload: {err}") + # Don't fail here - the node might just be unreachable on old IP now + + return True + + def _start_services(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Start pve-cluster and corosync, then handle Ceph.""" + # Now we need to reach nodes on their NEW IPs + for node in plan.nodes: + if not node.is_reachable: + continue + + new_host = node.new_ip if not node.is_local else node.ssh_host + is_local = node.is_local + + # Start pve-cluster + if dry_run: + print(f" [{node.name}] Würde pve-cluster starten") + print(f" 
[{node.name}] Würde corosync starten") + continue + + print(f" [{node.name}] Starte pve-cluster...") + rc, _, err = self.ssh.run_on_node( + new_host, "systemctl start pve-cluster", is_local, timeout=30 + ) + if rc == 0: + print(f" [{node.name}] pve-cluster gestartet") + else: + print(f" [{node.name}] WARNUNG pve-cluster: {err}") + + print(f" [{node.name}] Starte corosync...") + rc, _, err = self.ssh.run_on_node( + new_host, "systemctl start corosync", is_local, timeout=30 + ) + if rc == 0: + print(f" [{node.name}] corosync gestartet") + else: + print(f" [{node.name}] WARNUNG corosync: {err}") + + if dry_run: + print("\n Würde auf Quorum warten...") + return True + + # Wait for quorum + print("\n Warte auf Quorum...") + if not self._wait_for_quorum(timeout=60): + print(" [!] Quorum nicht erreicht! Versuche 'pvecm expected 1'...") + rc, _, _ = self.ssh.execute_local("pvecm expected 1") + if rc == 0: + print(" Quorum erzwungen mit 'pvecm expected 1'") + time.sleep(5) + else: + print(" [!] Konnte Quorum nicht erzwingen!") + + # Update Ceph config via cluster FS if possible + if configs.get('ceph'): + self._update_ceph(plan, configs) + + # Cleanup staging directories + print("\n Staging-Verzeichnisse aufräumen...") + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + self.ssh.run_on_node( + new_host, + "rm -rf /root/.network-migration-staged", + node.is_local, + ) + + return True + + def _wait_for_quorum(self, timeout: int = 60) -> bool: + """Wait for cluster quorum to be established.""" + start = time.time() + while time.time() - start < timeout: + rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null") + if rc == 0 and "Quorate: Yes" in stdout: + print(" Quorum erreicht!") + return True + print(" ... 
warte auf Quorum ...") + time.sleep(5) + return False + + def _update_ceph(self, plan: MigrationPlan, configs: dict): + """Update Ceph configuration after quorum is available.""" + print("\n [Ceph] Konfiguration aktualisieren...") + + # Try to write via /etc/pve/ceph.conf first + rc, _, _ = self.ssh.execute_local( + "touch /etc/pve/.ceph_test && rm -f /etc/pve/.ceph_test" + ) + if rc == 0: + # /etc/pve is writable - use cluster filesystem + ok, msg = self.ssh.write_local_file("/etc/pve/ceph.conf", configs['ceph']) + if ok: + print(" [Ceph] /etc/pve/ceph.conf aktualisiert (via Cluster-FS)") + else: + print(f" [Ceph] FEHLER /etc/pve/ceph.conf: {msg}") + self._update_ceph_direct(plan, configs) + else: + # /etc/pve not writable - write directly on each node + print(" [Ceph] /etc/pve nicht beschreibbar, schreibe direkt...") + self._update_ceph_direct(plan, configs) + + # Restart Ceph services + print(" [Ceph] Services neu starten...") + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + + # Restart MON + self.ssh.run_on_node( + new_host, + f"systemctl restart ceph-mon@{node.name} 2>/dev/null", + node.is_local, timeout=30, + ) + # Restart MGR + self.ssh.run_on_node( + new_host, + f"systemctl restart ceph-mgr@{node.name} 2>/dev/null", + node.is_local, timeout=30, + ) + # Restart all OSDs on this node + self.ssh.run_on_node( + new_host, + "systemctl restart ceph-osd.target 2>/dev/null", + node.is_local, timeout=60, + ) + print(f" [{node.name}] Ceph-Services neu gestartet") + + def _update_ceph_direct(self, plan: MigrationPlan, configs: dict): + """Write ceph.conf directly on each node (fallback when no quorum).""" + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + + ok, msg = self.ssh.write_node_file( + new_host, "/etc/ceph/ceph.conf", + configs['ceph'], node.is_local, + ) + if ok: + print(f" [{node.name}] 
/etc/ceph/ceph.conf direkt geschrieben") + else: + print(f" [{node.name}] FEHLER /etc/ceph/ceph.conf: {msg}") + + def _update_ceph_mon_map(self, plan: MigrationPlan): + """Update Ceph MON map with new addresses. + + This is needed when MON IPs change. + """ + ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + new_ip = node.new_ip + + # Extract monmap, modify, and reinject + cmds = [ + f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap", + # Remove old entries and add new ones + ] + # This is complex - for now we rely on the ceph.conf update + # and let Ceph handle the MON map update on restart + print(f" [{node.name}] MON-Map wird beim Neustart aktualisiert") diff --git a/models.py b/models.py new file mode 100644 index 0000000..6de6d3e --- /dev/null +++ b/models.py @@ -0,0 +1,76 @@ +"""Data models for the Proxmox Cluster Network Changer.""" + +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class NetworkInterface: + """Represents a network interface configuration.""" + name: str # e.g. vmbr0 + address: str # e.g. 192.168.0.101 + netmask: str # e.g. 255.255.255.0 + cidr: int # e.g. 24 + gateway: Optional[str] = None + bridge_ports: Optional[str] = None + raw_config: str = "" + + +@dataclass +class NodeInfo: + """Represents a single Proxmox node.""" + name: str # e.g. 
"""Data models for the Proxmox Cluster Network Changer."""

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class NetworkInterface:
    """One network interface stanza from /etc/network/interfaces."""
    name: str                           # interface name, e.g. vmbr0
    address: str                        # IPv4 address, e.g. 192.168.0.101
    netmask: str                        # dotted netmask, e.g. 255.255.255.0
    cidr: int                           # prefix length, e.g. 24
    gateway: Optional[str] = None       # default gateway, if any
    bridge_ports: Optional[str] = None  # bridge member ports, if a bridge
    raw_config: str = ""                # original stanza text


@dataclass
class NodeInfo:
    """State of a single Proxmox node during discovery and migration."""
    name: str                           # hostname, e.g. pve1
    current_ip: str                     # IP the node has right now
    new_ip: Optional[str] = None        # IP it will get after migration
    ssh_host: Optional[str] = None      # address used to reach it via SSH
    is_local: bool = False              # True for the node running the tool
    is_reachable: bool = False          # SSH connectivity confirmed
    interfaces: list[NetworkInterface] = field(default_factory=list)
    hosts_content: str = ""             # raw /etc/hosts content
    network_interfaces_content: str = ""  # raw /etc/network/interfaces content


@dataclass
class CorosyncNode:
    """One node {} entry from corosync.conf."""
    nodeid: int
    name: str
    ring0_addr: str
    ring1_addr: Optional[str] = None    # only set for dual-ring setups


@dataclass
class CorosyncConfig:
    """Parsed corosync.conf."""
    nodes: list[CorosyncNode] = field(default_factory=list)
    config_version: int = 1             # must be bumped on every change
    cluster_name: str = ""
    transport: str = "knet"
    raw_content: str = ""               # original file text


@dataclass
class CephConfig:
    """Parsed ceph.conf."""
    fsid: str = ""
    public_network: str = ""            # e.g. 192.168.0.0/24
    cluster_network: str = ""           # e.g. 192.168.0.0/24
    mon_hosts: list[str] = field(default_factory=list)
    # per-MON sections: "[mon.pve1]" -> {key: value}
    mon_sections: dict[str, dict[str, str]] = field(default_factory=dict)
    raw_content: str = ""               # original file text


@dataclass
class MigrationPlan:
    """Complete migration plan mapping the old network onto the new one."""
    nodes: list[NodeInfo] = field(default_factory=list)
    old_network: str = ""               # e.g. 192.168.0.0/24
    new_network: str = ""               # e.g. 172.0.2.0/16
    new_gateway: Optional[str] = None
    ceph_new_public_network: str = ""
    ceph_new_cluster_network: str = ""
    corosync_config: Optional[CorosyncConfig] = None
    ceph_config: Optional[CephConfig] = None
    dry_run: bool = False
    quorum_available: bool = True
    bridge_name: str = "vmbr0"          # which bridge carries the cluster IP
"""Phase 2: Plan the migration - IP mapping and config generation."""

import ipaddress

from models import NodeInfo, CorosyncConfig, CephConfig, MigrationPlan
from config_parser import (
    generate_corosync_conf, generate_ceph_conf,
    generate_network_interfaces, generate_hosts,
)


class Planner:
    """Plans the network migration with user input."""

    def plan(self, nodes: list[NodeInfo], corosync: CorosyncConfig,
             ceph: CephConfig | None, has_quorum: bool) -> MigrationPlan | None:
        """Interactive planning with the user.

        Asks for the new network and gateway, suggests a new IP per node
        (overridable), plans the Ceph networks, shows a preview and asks
        for confirmation.

        Returns:
            A filled MigrationPlan, or None if the user aborted.
        """
        plan = MigrationPlan(
            nodes=nodes,
            corosync_config=corosync,
            ceph_config=ceph,
            quorum_available=has_quorum,
        )

        print("\n=== Phase 2: Migration planen ===\n")

        # Get new network
        plan.new_network = self._ask_new_network()
        if not plan.new_network:
            return None

        new_net = ipaddress.ip_network(plan.new_network, strict=False)
        plan.new_gateway = self._ask_gateway(new_net)

        # Detect old network and bridge from the first node's matching interface.
        if nodes:
            old_ip = ipaddress.ip_address(nodes[0].current_ip)
            for iface in nodes[0].interfaces:
                if iface.address == str(old_ip):
                    plan.old_network = str(ipaddress.ip_network(
                        f"{iface.address}/{iface.cidr}", strict=False
                    ))
                    plan.bridge_name = iface.name
                    break

        # Generate IP mapping suggestions
        print("\n[IP-Mapping]")
        print("Für jeden Node wird eine neue IP benötigt.\n")

        for node in nodes:
            suggested_ip = self._suggest_new_ip(node.current_ip, plan.new_network)
            print(f"  {node.name}: {node.current_ip} -> ", end="")
            user_input = input(f"[{suggested_ip}]: ").strip()
            node.new_ip = user_input or suggested_ip
            print(f"    => {node.new_ip}")

        # Ceph network planning
        if ceph:
            print("\n[Ceph Netzwerke]")
            print(f"  Aktuelles Public Network: {ceph.public_network}")
            print(f"  Aktuelles Cluster Network: {ceph.cluster_network}")

            default_ceph_net = plan.new_network
            user_input = input(
                f"\n  Neues Ceph Public Network [{default_ceph_net}]: "
            ).strip()
            plan.ceph_new_public_network = user_input or default_ceph_net

            user_input = input(
                f"  Neues Ceph Cluster Network [{plan.ceph_new_public_network}]: "
            ).strip()
            plan.ceph_new_cluster_network = user_input or plan.ceph_new_public_network

        # Which bridge to modify
        print("\n[Bridge]")
        user_input = input(
            f"  Welche Bridge soll geändert werden? [{plan.bridge_name}]: "
        ).strip()
        if user_input:
            plan.bridge_name = user_input

        self._show_preview(plan)

        confirm = input("\nMigration durchführen? [j/N]: ").strip().lower()
        if confirm not in ('j', 'ja', 'y', 'yes'):
            print("Abgebrochen.")
            return None

        return plan

    def _ask_new_network(self) -> str | None:
        """Ask for the new network until it parses, or None on empty input."""
        while True:
            network = input("Neues Netzwerk (z.B. 172.0.2.0/16): ").strip()
            if not network:
                print("Abgebrochen.")
                return None
            try:
                ipaddress.ip_network(network, strict=False)
                return network
            except ValueError as e:
                print(f"  Ungültiges Netzwerk: {e}")

    def _ask_gateway(self, network: ipaddress.IPv4Network) -> str:
        """Ask for the gateway in the new network.

        Suggests the first usable host address. next() avoids materializing
        the full host list, which can be millions of entries for wide
        prefixes like /8 or /16.
        """
        suggested = str(next(network.hosts()))
        user_input = input(f"Neues Gateway [{suggested}]: ").strip()
        return user_input or suggested

    def _suggest_new_ip(self, old_ip: str, new_network: str) -> str:
        """Suggest a new IP by keeping the host part of the old IP.

        For prefixes of /16 or wider the last two octets are kept,
        otherwise only the last octet.
        NOTE(review): for prefixes between /17 and /23 only the last octet
        is kept, so nodes that differ only in the third octet would get
        colliding suggestions - the user can override interactively.
        """
        old = ipaddress.ip_address(old_ip)
        new_net = ipaddress.ip_network(new_network, strict=False)

        old_host = int(old) & 0xFF  # last octet
        if new_net.prefixlen <= 16:
            # For /16 or bigger, keep the last two octets.
            old_host = int(old) & 0xFFFF

        new_ip = ipaddress.ip_address(int(new_net.network_address) | old_host)
        return str(new_ip)

    def _show_preview(self, plan: MigrationPlan):
        """Print a preview of all planned changes."""
        print("\n" + "=" * 60)
        print("  MIGRATION PREVIEW")
        print("=" * 60)

        ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip}

        print(f"\n  Netzwerk: {plan.old_network} -> {plan.new_network}")
        print(f"  Gateway: {plan.new_gateway}")
        print(f"  Bridge: {plan.bridge_name}")
        print(f"  Quorum verfügbar: {'Ja' if plan.quorum_available else 'NEIN'}")

        print("\n  [Node IP-Mapping]")
        for node in plan.nodes:
            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            print(f"    {node.name}: {node.current_ip} -> {node.new_ip} ({status})")

        if plan.ceph_config:
            print("\n  [Ceph Netzwerke]")
            print(f"    Public: {plan.ceph_config.public_network} -> {plan.ceph_new_public_network}")
            print(f"    Cluster: {plan.ceph_config.cluster_network} -> {plan.ceph_new_cluster_network}")
            if plan.ceph_config.mon_hosts:
                print(f"    MON Hosts: {', '.join(plan.ceph_config.mon_hosts)}")
                new_mons = [ip_mapping.get(h, h) for h in plan.ceph_config.mon_hosts]
                print(f"           -> {', '.join(new_mons)}")

        print("\n  [Dateien die geändert werden]")
        print("    - /etc/network/interfaces (auf jedem Node)")
        print("    - /etc/hosts (auf jedem Node)")
        print("    - /etc/corosync/corosync.conf (auf jedem Node)")
        if plan.ceph_config:
            if plan.quorum_available:
                print("    - /etc/pve/ceph.conf (über Cluster-FS)")
            else:
                print("    - /etc/ceph/ceph.conf (direkt, da kein Quorum)")

        if not plan.quorum_available:
            print("\n  [!] WARNUNG: Kein Quorum verfügbar!")
            print("      Es wird 'pvecm expected 1' verwendet um Quorum zu erzwingen.")
            print("      Ceph-Config wird direkt auf jedem Node geschrieben.")

        print("\n" + "=" * 60)

    def generate_new_configs(self, plan: MigrationPlan) -> dict:
        """Generate all new configuration file contents.

        Returns dict with:
            'corosync': new corosync.conf content (or None)
            'ceph': new ceph.conf content (or None)
            'nodes': {node_name: {'interfaces': content, 'hosts': content}}
        """
        ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip}

        configs = {
            'corosync': None,
            'ceph': None,
            'nodes': {},
        }

        if plan.corosync_config:
            configs['corosync'] = generate_corosync_conf(
                plan.corosync_config, ip_mapping
            )

        if plan.ceph_config:
            configs['ceph'] = generate_ceph_conf(
                plan.ceph_config, ip_mapping,
                plan.ceph_new_public_network,
                plan.ceph_new_cluster_network,
            )

        # Per-node configs
        new_cidr = ipaddress.ip_network(plan.new_network, strict=False).prefixlen

        # Detect the old gateway from the first node that has one on the bridge.
        old_gateway = None
        for node in plan.nodes:
            for iface in node.interfaces:
                if iface.name == plan.bridge_name and iface.gateway:
                    old_gateway = iface.gateway
                    break
            if old_gateway:
                break

        for node in plan.nodes:
            if not node.new_ip or not node.network_interfaces_content:
                continue

            node_configs = {}

            node_configs['interfaces'] = generate_network_interfaces(
                node.network_interfaces_content,
                node.current_ip, node.new_ip,
                new_cidr, plan.new_gateway, old_gateway,
            )

            node_configs['hosts'] = generate_hosts(
                node.hosts_content, ip_mapping
            )

            configs['nodes'][node.name] = node_configs

        return configs
"""Emergency/rescue network - temporary network for SSH communication.

When nodes end up in different subnets and can no longer reach each
other, a temporary emergency network is set up:
- every node gets an additional IP on the bridge (e.g. vmbr0)
- the tool can then work over this network via SSH
- after the migration the emergency IPs are removed again
"""

import ipaddress
import socket

from models import NodeInfo, CorosyncConfig
from ssh_manager import SSHManager


class RescueNetwork:
    """Manages an emergency network for broken clusters."""

    def __init__(self, ssh: SSHManager):
        self.ssh = ssh
        self.rescue_subnet: str = ""
        self.rescue_ips: dict[str, str] = {}  # node_name -> rescue_ip
        self.bridge: str = "vmbr0"
        self.active: bool = False

    def setup_interactive(self, corosync: CorosyncConfig) -> list[NodeInfo] | None:
        """Interactively set up the rescue network.

        Returns the node list with rescue IPs as ssh_host, or None on abort.
        """
        print("\n" + "=" * 60)
        print("  RESCUE NETZWERK")
        print("=" * 60)
        print()
        print("  Dieses Feature richtet ein temporäres Netzwerk ein,")
        print("  damit alle Nodes sich wieder per SSH erreichen können.")
        print()
        print("  Ablauf:")
        print("  1. Du gibst ein freies Subnetz an (z.B. 10.99.99.0/24)")
        print("  2. Das Tool zeigt dir für jeden Node den Befehl an")
        print("  3. Du führst die Befehle manuell auf jedem Node aus")
        print("     (z.B. über IPMI/iLO/iDRAC/KVM-Konsole)")
        print("  4. Danach kann das Tool alle Nodes per SSH erreichen")
        print()

        # Ask for bridge
        user_input = input(f"  Bridge für Emergency-IPs [{self.bridge}]: ").strip()
        if user_input:
            self.bridge = user_input

        # Ask for rescue subnet until it parses; empty input aborts.
        while True:
            subnet_input = input("  Emergency Subnetz (z.B. 10.99.99.0/24): ").strip()
            if not subnet_input:
                print("  Abgebrochen.")
                return None
            try:
                subnet = ipaddress.ip_network(subnet_input, strict=False)
                self.rescue_subnet = str(subnet)
                break
            except ValueError as e:
                print(f"  Ungültiges Subnetz: {e}")

        # Assign one rescue IP per corosync node, in subnet order.
        hosts = list(subnet.hosts())
        print()
        print("  " + "-" * 56)
        print(f"  Emergency Subnetz: {self.rescue_subnet}")
        print(f"  Bridge: {self.bridge}")
        print("  " + "-" * 56)
        print()

        nodes = []
        for i, cs_node in enumerate(corosync.nodes):
            if i >= len(hosts):
                print("  [!] FEHLER: Nicht genug IPs im Subnetz für alle Nodes!")
                return None

            rescue_ip = str(hosts[i])
            self.rescue_ips[cs_node.name] = rescue_ip
            cidr = subnet.prefixlen

            node = NodeInfo(
                name=cs_node.name,
                current_ip=cs_node.ring0_addr,
                ssh_host=rescue_ip,  # use rescue IP for SSH
            )
            nodes.append(node)

            # Show the command the admin must run on this node.
            cmd = f"ip addr add {rescue_ip}/{cidr} dev {self.bridge}"
            print(f"  {cs_node.name} ({cs_node.ring0_addr}):")
            print(f"    Rescue-IP: {rescue_ip}/{cidr}")
            print(f"    Befehl:    {cmd}")
            print()

        print("  " + "-" * 56)
        print()

        # Find the node we are running on and apply its rescue IP locally.
        local_hostname = socket.gethostname()
        local_node = None
        for node in nodes:
            if node.name == local_hostname:
                local_node = node
                node.is_local = True
                break

        if local_node and local_node.name in self.rescue_ips:
            rescue_ip = self.rescue_ips[local_node.name]
            cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen
            print(f"  Lokaler Node erkannt: {local_node.name}")
            answer = input(
                f"  Emergency-IP {rescue_ip}/{cidr} auf {self.bridge} "
                f"automatisch setzen? [J/n]: "
            ).strip().lower()

            if answer not in ('n', 'nein', 'no'):
                # Best effort: '; echo ok' forces rc 0 so an already-present
                # address does not abort the setup.
                rc, _, err = self.ssh.execute_local(
                    f"ip addr add {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null; echo ok"
                )
                if rc == 0:
                    print(f"  -> {rescue_ip}/{cidr} auf {self.bridge} gesetzt")
                else:
                    print(f"  -> WARNUNG: {err}")
            # The local node is always usable, with or without the extra IP.
            local_node.is_reachable = True

        # Wait for the admin to configure the other nodes out of band.
        print()
        print("  " + "=" * 56)
        print("  Bitte führe jetzt die oben genannten Befehle auf den")
        print("  anderen Nodes aus (IPMI/iLO/iDRAC/KVM-Konsole).")
        print("  " + "=" * 56)
        print()
        input("  Drücke ENTER wenn alle Nodes konfiguriert sind...")

        # Test connectivity over the rescue IPs.
        print()
        print("  [Verbindungstest]")
        all_ok = True
        for node in nodes:
            if node.is_local:
                print(f"  {node.name}: OK (lokal)")
                continue

            rescue_ip = self.rescue_ips[node.name]
            if self.ssh.is_reachable(rescue_ip):
                print(f"  {node.name} ({rescue_ip}): OK")
                node.is_reachable = True
            else:
                print(f"  {node.name} ({rescue_ip}): NICHT ERREICHBAR")
                all_ok = False

        if not all_ok:
            print()
            print("  [!] Nicht alle Nodes erreichbar!")
            answer = input("  Trotzdem fortfahren? [j/N]: ").strip().lower()
            if answer not in ('j', 'ja', 'y', 'yes'):
                self.cleanup(nodes)
                return None

        self.active = True
        print()
        print("  Rescue-Netzwerk aktiv. Migration kann starten.")
        return nodes

    def cleanup(self, nodes: list[NodeInfo]):
        """Remove the emergency IPs from all nodes (best effort)."""
        if not self.active and not self.rescue_ips:
            return

        print("\n  [Rescue] Emergency-IPs entfernen...")
        cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen

        for node in nodes:
            if node.name not in self.rescue_ips:
                continue

            rescue_ip = self.rescue_ips[node.name]
            cmd = f"ip addr del {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null"

            if node.is_local:
                rc, _, _ = self.ssh.execute_local(cmd)
            elif node.is_reachable:
                # Try the new IP first (valid after migration), else rescue IP.
                target = node.new_ip or rescue_ip
                rc, _, _ = self.ssh.execute(target, cmd)
            else:
                # No way to reach the node - don't claim success.
                print(f"    {node.name}: NICHT ERREICHBAR - "
                      f"{rescue_ip}/{cidr} bitte manuell entfernen!")
                continue

            status = "entfernt" if rc == 0 else "FEHLER"
            print(f"    {node.name}: {rescue_ip}/{cidr} {status}")

        self.active = False
        print("  [Rescue] Emergency-IPs entfernt.")

    def get_rescue_commands(self, corosync: CorosyncConfig,
                            subnet: str, bridge: str = "vmbr0") -> list[dict]:
        """Generate rescue commands without interactive prompts.

        Returns a list of
        {name, current_ip, ip, cidr, command, remove_command};
        truncated if the subnet has fewer hosts than the cluster has nodes.
        """
        network = ipaddress.ip_network(subnet, strict=False)
        hosts = list(network.hosts())
        commands = []

        for i, cs_node in enumerate(corosync.nodes):
            if i >= len(hosts):
                break
            rescue_ip = str(hosts[i])
            cidr = network.prefixlen
            commands.append({
                'name': cs_node.name,
                'current_ip': cs_node.ring0_addr,
                'ip': rescue_ip,
                'cidr': cidr,
                'command': f"ip addr add {rescue_ip}/{cidr} dev {bridge}",
                'remove_command': f"ip addr del {rescue_ip}/{cidr} dev {bridge}",
            })

        return commands
"""SSH connection manager for remote Proxmox nodes."""

import shlex
import subprocess
from typing import Optional


class SSHManager:
    """Manages SSH connections to Proxmox nodes using the system ssh binary."""

    def __init__(self, ssh_user: str = "root", ssh_key: Optional[str] = None,
                 ssh_port: int = 22):
        self.ssh_user = ssh_user
        self.ssh_key = ssh_key
        self.ssh_port = ssh_port

    def _build_ssh_cmd(self, host: str, command: str) -> list[str]:
        """Build the ssh argv list for running *command* on *host*."""
        cmd = [
            "ssh",
            # Host keys change when nodes get new IPs, so don't pin them.
            "-o", "StrictHostKeyChecking=no",
            "-o", "ConnectTimeout=10",
            # Fail instead of prompting for a password.
            "-o", "BatchMode=yes",
            "-p", str(self.ssh_port),
        ]
        if self.ssh_key:
            cmd.extend(["-i", self.ssh_key])
        cmd.append(f"{self.ssh_user}@{host}")
        cmd.append(command)
        return cmd

    def execute(self, host: str, command: str, timeout: int = 30) -> tuple[int, str, str]:
        """Execute a command on a remote host via SSH.

        Returns: (return_code, stdout, stderr); rc -1 on timeout/error.
        """
        cmd = self._build_ssh_cmd(host, command)
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            return result.returncode, result.stdout, result.stderr
        except subprocess.TimeoutExpired:
            return -1, "", f"SSH command timed out after {timeout}s"
        except Exception as e:
            return -1, "", str(e)

    def read_file(self, host: str, path: str) -> tuple[bool, str]:
        """Read a file from a remote host.

        Returns: (success, content-or-error)
        """
        # Quote the path so spaces/metacharacters can't break the command.
        rc, stdout, stderr = self.execute(host, f"cat {shlex.quote(path)}")
        if rc == 0:
            return True, stdout
        return False, stderr

    def write_file(self, host: str, path: str, content: str) -> tuple[bool, str]:
        """Write content to a file on a remote host.

        The content is piped through SSH's stdin into 'cat > path', so
        arbitrary content is safe - no heredoc delimiter collisions and no
        shell quoting of the payload.

        Returns: (success, message)
        """
        cmd = self._build_ssh_cmd(host, f"cat > {shlex.quote(path)}")
        try:
            result = subprocess.run(
                cmd,
                input=content,
                capture_output=True,
                text=True,
                timeout=30,
            )
            if result.returncode == 0:
                return True, "OK"
            return False, result.stderr
        except subprocess.TimeoutExpired:
            return False, "SSH write timed out after 30s"
        except Exception as e:
            return False, str(e)

    def is_reachable(self, host: str) -> bool:
        """Check if a host is reachable via SSH."""
        rc, _, _ = self.execute(host, "echo ok", timeout=10)
        return rc == 0

    def execute_local(self, command: str, timeout: int = 30) -> tuple[int, str, str]:
        """Execute a command locally through the shell.

        Returns: (return_code, stdout, stderr); rc -1 on timeout/error.
        """
        try:
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            return result.returncode, result.stdout, result.stderr
        except subprocess.TimeoutExpired:
            return -1, "", f"Command timed out after {timeout}s"
        except Exception as e:
            return -1, "", str(e)

    def read_local_file(self, path: str) -> tuple[bool, str]:
        """Read a local file. Returns (success, content-or-error)."""
        try:
            with open(path, 'r') as f:
                return True, f.read()
        except Exception as e:
            return False, str(e)

    def write_local_file(self, path: str, content: str) -> tuple[bool, str]:
        """Write a local file. Returns (success, message)."""
        try:
            with open(path, 'w') as f:
                f.write(content)
            return True, "OK"
        except Exception as e:
            return False, str(e)

    def run_on_node(self, host: str, command: str, is_local: bool = False,
                    timeout: int = 30) -> tuple[int, str, str]:
        """Run a command on a node, dispatching to local or remote execution."""
        if is_local:
            return self.execute_local(command, timeout)
        return self.execute(host, command, timeout)

    def read_node_file(self, host: str, path: str, is_local: bool = False) -> tuple[bool, str]:
        """Read a file from a node (local or remote)."""
        if is_local:
            return self.read_local_file(path)
        return self.read_file(host, path)

    def write_node_file(self, host: str, path: str, content: str,
                        is_local: bool = False) -> tuple[bool, str]:
        """Write a file to a node (local or remote)."""
        if is_local:
            return self.write_local_file(path, content)
        return self.write_file(host, path, content)
"""Phase 5: Verify the migration was successful."""

from models import MigrationPlan
from ssh_manager import SSHManager


class Verifier:
    """Verifies the cluster state after migration."""

    def __init__(self, ssh: SSHManager):
        self.ssh = ssh

    def run(self, plan: MigrationPlan) -> bool:
        """Run all verification checks.

        Checks node reachability on the new IPs, cluster quorum, corosync
        membership and - if configured - Ceph health, then prints a summary
        and recommended next steps.

        Returns:
            True if every check passed, False if anything needs attention.
        """
        print("\n=== Phase 5: Verifikation ===\n")

        all_ok = True

        # Check node reachability on new IPs
        print("[Node-Erreichbarkeit (neue IPs)]")
        for node in plan.nodes:
            if not node.new_ip:
                continue

            if node.is_local:
                # -F: fixed string (dots stay literal); -w: whole word, so
                # 10.0.0.1 no longer matches 10.0.0.10.
                rc, _, _ = self.ssh.execute_local(
                    f"ip addr show | grep -qwF '{node.new_ip}'"
                )
                reachable = rc == 0
            else:
                reachable = self.ssh.is_reachable(node.new_ip)

            status = "OK" if reachable else "FEHLER"
            print(f"  {node.name} ({node.new_ip}): {status}")
            if not reachable:
                all_ok = False

        # Check cluster status
        print("\n[Cluster Status]")
        rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null")
        if rc == 0:
            # Show only the relevant summary lines.
            for line in stdout.split('\n'):
                line = line.strip()
                if any(k in line for k in ['Quorate:', 'Nodes:', 'Node name',
                                           'Total votes', 'Expected votes']):
                    print(f"  {line}")
            if "Quorate: Yes" not in stdout:
                print("  [!] WARNUNG: Cluster hat KEIN Quorum!")
                all_ok = False
        else:
            print("  [!] pvecm status fehlgeschlagen")
            all_ok = False

        # Check corosync members
        print("\n[Corosync Members]")
        rc, stdout, _ = self.ssh.execute_local(
            "corosync-cmapctl 2>/dev/null | grep 'ip(' || true"
        )
        if rc == 0 and stdout.strip():
            for line in stdout.strip().split('\n'):
                print(f"  {line.strip()}")
        else:
            print("  Keine Corosync-Member-Info verfügbar")

        # Check Ceph if it was configured
        if plan.ceph_config:
            print("\n[Ceph Status]")
            rc, stdout, _ = self.ssh.execute_local("ceph -s 2>/dev/null")
            if rc == 0:
                for line in stdout.split('\n'):
                    line = line.strip()
                    if line:
                        print(f"  {line}")
            else:
                print("  [!] ceph -s fehlgeschlagen")
                all_ok = False

            print("\n[Ceph MON Status]")
            rc, stdout, _ = self.ssh.execute_local("ceph mon stat 2>/dev/null")
            if rc == 0:
                print(f"  {stdout.strip()}")
            else:
                print("  [!] ceph mon stat fehlgeschlagen")

            print("\n[Ceph OSD Status]")
            rc, stdout, _ = self.ssh.execute_local("ceph osd tree 2>/dev/null")
            if rc == 0:
                for line in stdout.split('\n')[:20]:  # first 20 lines only
                    if line.strip():
                        print(f"  {line}")

        # Summary
        print("\n" + "=" * 60)
        if all_ok:
            print("  MIGRATION ERFOLGREICH!")
            print("  Alle Checks bestanden.")
        else:
            print("  MIGRATION MIT WARNUNGEN ABGESCHLOSSEN")
            print("  Einige Checks sind fehlgeschlagen. Bitte manuell prüfen!")
        print("=" * 60)

        # Suggest next steps
        print("\n[Empfohlene nächste Schritte]")
        print("  1. VMs/CTs auf allen Nodes prüfen: qm list / pct list")
        print("  2. Live-Migration testen: qm migrate ")
        print("  3. Ceph Recovery abwarten: ceph -w")
        if not all_ok:
            print("  4. Bei Problemen Backup wiederherstellen:")
            print("     ls /root/network-migration-backup-*/")

        return all_ok