From bb7c1d5c3f403ee72d16eba4c761ecaaafe66854 Mon Sep 17 00:00:00 2001 From: Stefan Hacker Date: Wed, 4 Mar 2026 21:55:49 +0100 Subject: [PATCH] first commit --- README.md | 245 ++++++++++++++++++++++++++ backup.py | 86 +++++++++ config_parser.py | 271 ++++++++++++++++++++++++++++ discovery.py | 189 ++++++++++++++++++++ main.py | 212 ++++++++++++++++++++++ migrator.py | 450 +++++++++++++++++++++++++++++++++++++++++++++++ models.py | 76 ++++++++ planner.py | 236 +++++++++++++++++++++++++ requirements.txt | 2 + rescue.py | 228 ++++++++++++++++++++++++ ssh_manager.py | 140 +++++++++++++++ verifier.py | 112 ++++++++++++ 12 files changed, 2247 insertions(+) create mode 100644 README.md create mode 100644 backup.py create mode 100644 config_parser.py create mode 100644 discovery.py create mode 100644 main.py create mode 100644 migrator.py create mode 100644 models.py create mode 100644 planner.py create mode 100644 requirements.txt create mode 100644 rescue.py create mode 100644 ssh_manager.py create mode 100644 verifier.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..262601d --- /dev/null +++ b/README.md @@ -0,0 +1,245 @@ +# Proxmox Cluster Network Changer + +Migriert ein komplettes Proxmox-Cluster (inkl. Ceph) von einem Netzwerk in ein anderes. + +**Problem:** Wenn man bei einem Proxmox-Cluster die IPs ändert, verliert man das Quorum und `/etc/pve` wird read-only — dann kann man weder Corosync noch Ceph über das Cluster-Dateisystem konfigurieren. Dieses Tool löst das Problem durch eine koordinierte Migration aller Nodes. + +## Features + +- Automatische Erkennung aller Nodes, IPs und Konfigurationen +- Koordinierte Migration aller Nodes in einem Durchgang +- Ceph-Support (Public Network, Cluster Network, MON-Adressen) +- Funktioniert auch bei **gebrochenem Quorum** (z.B. 
wenn ein Node bereits manuell geändert wurde) +- Automatische Backups aller Konfigurationen vor der Migration +- Dry-Run-Modus zum gefahrlosen Testen +- Verifikation nach der Migration + +## Voraussetzungen + +- Python 3.10+ (auf Proxmox standardmäßig vorhanden) +- Root-Zugriff auf dem Node, auf dem das Tool läuft +- SSH-Zugriff (Key-basiert) zu allen anderen Cluster-Nodes +- Keine externen Python-Pakete nötig (nur stdlib) + +## Installation + +```bash +# Auf einen Proxmox-Node kopieren +scp -r proxmox-cluster-network-changer/ root@pve1:/root/ + +# Oder direkt klonen +cd /root +git clone <REPO-URL> proxmox-cluster-network-changer +``` + +## Verwendung + +### Aktuelle Konfiguration anzeigen (Discovery) + +```bash +python3 main.py --discover +``` + +Zeigt an: +- Alle Cluster-Nodes mit IPs +- Corosync-Konfiguration +- Ceph-Netzwerke und MON-Hosts +- Quorum-Status +- Welche Nodes erreichbar sind + +### Dry-Run (nichts wird geändert) + +```bash +python3 main.py --dry-run +``` + +Durchläuft den kompletten Prozess, zeigt alle geplanten Änderungen an, schreibt aber nichts. + +### Migration durchführen + +```bash +python3 main.py +``` + +Das Tool führt interaktiv durch den Prozess: + +``` +=== Phase 1: Discovery === + +[Corosync] + Cluster: mycluster + Nodes gefunden: 4 + - pve1 (ID: 1) -> 192.168.0.101 + - pve2 (ID: 2) -> 192.168.0.102 + - pve3 (ID: 3) -> 192.168.0.103 + - pve4 (ID: 4) -> 192.168.0.104 + +[Ceph] + Public Network: 192.168.0.0/24 + Cluster Network: 192.168.0.0/24 + +=== Phase 2: Migration planen === + +Neues Netzwerk (z.B. 172.0.2.0/16): 172.0.2.0/16 +Neues Gateway [172.0.0.1]: 172.0.2.1 + +[IP-Mapping] + pve1: 192.168.0.101 -> [172.0.2.101]: + pve2: 192.168.0.102 -> [172.0.2.102]: + pve3: 192.168.0.103 -> [172.0.2.103]: + pve4: 192.168.0.104 -> [172.0.2.104]: + +Migration durchführen? 
[j/N]: j +``` + +### Optionen + +| Option | Beschreibung | +|---|---| +| `--dry-run` | Nur anzeigen, nichts ändern | +| `--discover` | Nur aktuelle Config anzeigen | +| `--rescue` | Rescue-Modus: Emergency-Netzwerk einrichten | +| `--rescue-commands SUBNET` | Nur Rescue-Befehle ausgeben (z.B. `10.99.99.0/24`) | +| `--ssh-key PFAD` | Pfad zum SSH-Key (Standard: Default-Key) | +| `--ssh-port PORT` | SSH-Port (Standard: 22) | + +## Was wird geändert? + +| Datei | Wo | Was | +|---|---|---| +| `/etc/network/interfaces` | Jeder Node | Bridge-IP, Gateway | +| `/etc/hosts` | Jeder Node | Hostname-zu-IP-Zuordnung | +| `/etc/corosync/corosync.conf` | Jeder Node | Corosync Ring-Adressen | +| `/etc/pve/ceph.conf` | Cluster-FS | public_network, cluster_network, MON-Adressen | + +## Migrationsablauf (Phase 4) + +1. Neue Konfigurationen werden auf alle Nodes verteilt (Staging) +2. Corosync wird auf allen Nodes gestoppt +3. pve-cluster (pmxcfs) wird gestoppt +4. Corosync-Config wird direkt geschrieben (`/etc/corosync/corosync.conf`) +5. `/etc/hosts` wird aktualisiert +6. `/etc/network/interfaces` wird aktualisiert + Netzwerk-Reload (`ifreload -a`) +7. Services werden gestartet, Quorum abgewartet, Ceph aktualisiert + +## Rescue-Netzwerk (Emergency Mode) + +**Szenario:** PVE01 hat bereits eine neue IP, PVE02-04 sind noch im alten Netz. Kein Node kann die anderen erreichen. 
+ +### Schnell: Nur Befehle anzeigen + +```bash +python3 main.py --rescue-commands 10.99.99.0/24 +``` + +Ausgabe: +``` + RESCUE BEFEHLE + Subnetz: 10.99.99.0/24 | Bridge: vmbr0 + + pve1 (192.168.0.101): + ip addr add 10.99.99.1/24 dev vmbr0 + + pve2 (192.168.0.102): + ip addr add 10.99.99.2/24 dev vmbr0 + + pve3 (192.168.0.103): + ip addr add 10.99.99.3/24 dev vmbr0 + + pve4 (192.168.0.104): + ip addr add 10.99.99.4/24 dev vmbr0 + + Zum Entfernen: + ip addr del 10.99.99.1/24 dev vmbr0 # pve1 + ip addr del 10.99.99.2/24 dev vmbr0 # pve2 + ip addr del 10.99.99.3/24 dev vmbr0 # pve3 + ip addr del 10.99.99.4/24 dev vmbr0 # pve4 +``` + +Diese Befehle über IPMI/iLO/iDRAC/KVM-Konsole auf jedem Node ausführen. + +### Interaktiv: Rescue + Migration + +```bash +python3 main.py --rescue +``` + +oder einfach starten — wenn Nodes nicht erreichbar sind, wird automatisch gefragt: + +```bash +python3 main.py +``` + +``` + 3 Node(s) nicht erreichbar. + Rescue-Netzwerk einrichten? [J/n]: j +``` + +Ablauf: +1. Du gibst ein freies Subnetz an (z.B. `10.99.99.0/24`) +2. Das Tool zeigt für jeden Node den `ip addr add` Befehl +3. Auf dem lokalen Node wird die IP automatisch gesetzt +4. Du führst die Befehle auf den anderen Nodes per Konsole aus +5. Das Tool testet die Verbindung und liest die Configs +6. Danach läuft die normale Migration +7. Am Ende werden die Emergency-IPs automatisch entfernt + +### Wann brauche ich das? 
+ +- Ein oder mehrere Nodes haben bereits manuell eine neue IP bekommen +- Die Nodes liegen in verschiedenen Subnetzen +- SSH zwischen den Nodes funktioniert nicht mehr +- Du hast aber noch Zugriff auf die Konsolen (IPMI/iLO/iDRAC/KVM) + +## Gebrochenes Quorum + +Wenn bereits ein Node manuell geändert wurde und das Quorum verloren ist: + +- Das Tool erkennt den Zustand automatisch in der Discovery-Phase +- Nicht erreichbare Nodes werden per Hostname gesucht +- Configs werden direkt geschrieben (nicht über `/etc/pve/`) +- Nach dem Netzwerk-Reload wird `pvecm expected 1` genutzt, um Quorum zu erzwingen +- Danach wird Ceph über das Cluster-Dateisystem aktualisiert + +## Backups + +Vor der Migration werden automatisch Backups erstellt: + +``` +/root/network-migration-backup-20260304_143022/ +├── etc_network_interfaces +├── etc_hosts +├── etc_corosync_corosync.conf +├── etc_ceph_ceph.conf +├── etc_pve_corosync.conf +└── etc_pve_ceph.conf +``` + +### Restore (manuell) + +```bash +# Beispiel: Netzwerk-Config wiederherstellen +cp /root/network-migration-backup-*/etc_network_interfaces /etc/network/interfaces +ifreload -a + +# Corosync wiederherstellen +cp /root/network-migration-backup-*/etc_corosync_corosync.conf /etc/corosync/corosync.conf +systemctl restart corosync +``` + +## Empfohlene Reihenfolge bei Problemen + +1. `pvecm status` — Cluster-Status prüfen +2. `pvecm expected 1` — Quorum erzwingen (Notfall) +3. `ceph -s` — Ceph-Status prüfen +4. `ceph -w` — Ceph-Recovery beobachten +5. `journalctl -u corosync` — Corosync-Logs prüfen +6. 
`journalctl -u pve-cluster` — pmxcfs-Logs prüfen + +## Hinweise + +- Das Tool muss als **root** ausgeführt werden +- SSH-Keys müssen **vorher** zwischen den Nodes eingerichtet sein (bei Proxmox-Clustern standardmäßig der Fall) +- VMs/CTs werden **nicht** automatisch migriert oder gestoppt — das Netzwerk wird im laufenden Betrieb geändert +- Nach der Migration sollten VM-Netzwerke (Bridges in VM-Configs) geprüft werden, falls diese sich auf spezifische IPs beziehen +- Getestet mit Proxmox VE 7.x und 8.x diff --git a/backup.py b/backup.py new file mode 100644 index 0000000..c751cc9 --- /dev/null +++ b/backup.py @@ -0,0 +1,86 @@ +"""Phase 3: Backup all configuration files before migration.""" + +import datetime +from models import MigrationPlan +from ssh_manager import SSHManager + + +BACKUP_FILES = [ + "/etc/network/interfaces", + "/etc/hosts", + "/etc/corosync/corosync.conf", + "/etc/ceph/ceph.conf", +] + +CLUSTER_BACKUP_FILES = [ + "/etc/pve/corosync.conf", + "/etc/pve/ceph.conf", +] + + +class Backup: + """Creates backups of all config files on each node.""" + + def __init__(self, ssh: SSHManager): + self.ssh = ssh + + def run(self, plan: MigrationPlan) -> bool: + """Create backups on all reachable nodes. + + Returns True if all backups succeeded. + """ + print("\n=== Phase 3: Backup ===\n") + + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + backup_dir = f"/root/network-migration-backup-{timestamp}" + all_ok = True + + for node in plan.nodes: + if not node.is_reachable: + print(f" [{node.name}] Übersprungen (nicht erreichbar)") + continue + + print(f" [{node.name}] Erstelle Backup in {backup_dir}/") + + # Create backup directory + rc, _, err = self.ssh.run_on_node( + node.ssh_host, f"mkdir -p {backup_dir}", node.is_local + ) + if rc != 0: + print(f" [!] 
Fehler beim Erstellen des Backup-Verzeichnisses: {err}") + all_ok = False + continue + + # Backup per-node files + for filepath in BACKUP_FILES: + filename = filepath.replace("/", "_").lstrip("_") + rc, _, _ = self.ssh.run_on_node( + node.ssh_host, + f"cp {filepath} {backup_dir}/{filename} 2>/dev/null", + node.is_local, + ) + if rc == 0: + print(f" OK: {filepath}") + else: + print(f" --: {filepath} (nicht vorhanden)") + + # Backup cluster files (only from local node since they're shared) + if node.is_local: + for filepath in CLUSTER_BACKUP_FILES: + filename = filepath.replace("/", "_").lstrip("_") + rc, _, _ = self.ssh.run_on_node( + node.ssh_host, + f"cp {filepath} {backup_dir}/{filename} 2>/dev/null", + node.is_local, + ) + if rc == 0: + print(f" OK: {filepath} (cluster)") + else: + print(f" --: {filepath} (nicht vorhanden)") + + if all_ok: + print(f"\n Backup erfolgreich in {backup_dir}/") + else: + print("\n [!] Einige Backups sind fehlgeschlagen!") + + return all_ok diff --git a/config_parser.py b/config_parser.py new file mode 100644 index 0000000..d332227 --- /dev/null +++ b/config_parser.py @@ -0,0 +1,271 @@ +"""Parsers for Proxmox configuration files (Corosync, Ceph, /etc/network/interfaces).""" + +import re +from models import ( + CorosyncConfig, CorosyncNode, CephConfig, NetworkInterface, +) + + +def parse_corosync_conf(content: str) -> CorosyncConfig: + """Parse corosync.conf and extract node information.""" + config = CorosyncConfig(raw_content=content) + + # Extract config_version + m = re.search(r'config_version:\s*(\d+)', content) + if m: + config.config_version = int(m.group(1)) + + # Extract cluster_name + m = re.search(r'cluster_name:\s*(\S+)', content) + if m: + config.cluster_name = m.group(1) + + # Extract transport + m = re.search(r'transport:\s*(\S+)', content) + if m: + config.transport = m.group(1) + + # Extract nodes from nodelist section + nodelist_match = re.search(r'nodelist\s*\{(.*?)\n\}', content, re.DOTALL) + if nodelist_match: + 
def generate_corosync_conf(config: "CorosyncConfig", ip_mapping: dict[str, str]) -> str:
    """Return the corosync.conf text with addresses swapped and the version bumped.

    Args:
        config: Parsed corosync configuration. Only ``config.raw_content``
            (the original file text) is read.
        ip_mapping: Maps each old address to the address that replaces it.

    Returns:
        The rewritten configuration text. Every whole-token occurrence of an
        old address is replaced, and the first ``config_version`` value found
        is incremented by one. Text without a ``config_version`` entry is
        returned with only the addresses changed.
    """
    new_content = config.raw_content

    # Longest keys first so the regex alternation prefers the longest match.
    old_ips = sorted((ip for ip in ip_mapping if ip), key=len, reverse=True)
    if old_ips:
        # All addresses are replaced in ONE pass and only as whole tokens:
        #  - chained str.replace() calls would rewrite "192.168.0.1" inside
        #    "192.168.0.10" and corrupt the neighbouring node's address;
        #  - they would also re-replace a value that was just written when a
        #    new IP happens to equal another node's old IP.
        alternatives = "|".join(re.escape(ip) for ip in old_ips)
        whole_address = re.compile(
            rf"(?<!\d)(?<!\d\.)(?:{alternatives})(?!\d)(?!\.\d)"
        )
        # A function replacement inserts the new value literally
        # (no backslash/group-reference processing).
        new_content = whole_address.sub(
            lambda match: ip_mapping[match.group(0)], new_content
        )

    # Bump config_version in place, keeping whatever whitespace follows the
    # colon (the search pattern used by the parser allows any amount).
    new_content = re.sub(
        r"(config_version:\s*)(\d+)",
        lambda match: f"{match.group(1)}{int(match.group(2)) + 1}",
        new_content,
        count=1,
    )

    return new_content
def generate_ceph_conf(config: "CephConfig", ip_mapping: dict[str, str],
                       new_public_network: str, new_cluster_network: str) -> str:
    """Return the ceph.conf text with new networks and node addresses.

    Args:
        config: Parsed Ceph configuration. ``raw_content`` is the text that is
            rewritten; ``public_network`` / ``cluster_network`` hold the old
            values and decide whether the respective entry is touched at all.
        ip_mapping: Maps each old address to the address that replaces it.
        new_public_network: Value written to the public network entry.
        new_cluster_network: Value written to the cluster network entry.

    Returns:
        The rewritten configuration text.
    """
    new_content = config.raw_content

    def set_network(text, key_pattern, old_value, new_value):
        # Address the entry by its KEY. Replacing "the first occurrence of the
        # old value" writes to the wrong line when both networks share the
        # same old value (the two new values end up swapped or doubled).
        updated, hits = re.subn(
            rf"^([ \t]*{key_pattern}[ \t]*=[ \t]*)\S+",
            lambda match: match.group(1) + new_value,
            text,
            count=1,
            flags=re.MULTILINE,
        )
        if hits:
            return updated
        # Key not found at the start of a line: fall back to replacing the
        # first occurrence of the old value.
        return text.replace(old_value, new_value, 1)

    # Replace network definitions ("." accepts "_" or " " between the words,
    # matching the pattern the parser uses).
    if config.public_network:
        new_content = set_network(
            new_content, r"public.network",
            config.public_network, new_public_network,
        )
    if config.cluster_network:
        new_content = set_network(
            new_content, r"cluster.network",
            config.cluster_network, new_cluster_network,
        )

    # Replace all node addresses in one pass and only as whole tokens, so
    # "192.168.0.1" is never rewritten inside "192.168.0.10" and a freshly
    # written address is never replaced a second time.
    old_ips = sorted((ip for ip in ip_mapping if ip), key=len, reverse=True)
    if old_ips:
        alternatives = "|".join(re.escape(ip) for ip in old_ips)
        whole_address = re.compile(
            rf"(?<!\d)(?<!\d\.)(?:{alternatives})(?!\d)(?!\.\d)"
        )
        new_content = whole_address.sub(
            lambda match: ip_mapping[match.group(0)], new_content
        )

    return new_content


def parse_network_interfaces(content: str) -> "list[NetworkInterface]":
    """Split /etc/network/interfaces text into one entry per ``iface ... inet`` stanza.

    A stanza starts at a line matching ``iface <name> inet <method>`` and
    collects every following non-blank line until the next such line or an
    ``auto`` / ``source`` line. Each collected stanza is handed to
    ``_build_interface`` together with the interface name.

    Args:
        content: Full text of the interfaces file.

    Returns:
        The objects returned by ``_build_interface``, in file order.
    """
    interfaces = []
    current_iface = None
    current_lines = []

    for line in content.split('\n'):
        stripped = line.strip()

        # New iface block
        m = re.match(r'iface\s+(\S+)\s+inet\s+(\S+)', stripped)
        if m:
            # Save the stanza collected so far before starting the next one
            if current_iface:
                interfaces.append(_build_interface(current_iface, current_lines))
            current_iface = m.group(1)
            current_lines = [line]
            continue

        # An auto or source line ends the current stanza
        if stripped.startswith(('auto ', 'source ')):
            if current_iface:
                interfaces.append(_build_interface(current_iface, current_lines))
            current_iface = None
            current_lines = []
            continue

        # Blank lines and lines outside any stanza are dropped
        if current_iface and stripped:
            current_lines.append(line)

    # Flush the stanza that runs to the end of the file
    if current_iface:
        interfaces.append(_build_interface(current_iface, current_lines))

    return interfaces
def generate_network_interfaces(content: str, old_ip: str, new_ip: str,
                                new_cidr: int, new_gateway: "str | None" = None,
                                old_gateway: "str | None" = None) -> str:
    """Return /etc/network/interfaces text with the node's address migrated.

    Everything that does not refer to ``old_ip`` / ``old_gateway`` is kept.

    Args:
        content: Current text of the interfaces file.
        old_ip: Address to look for in ``address`` lines.
        new_ip: Address written in its place.
        new_cidr: New prefix length. Written as ``/<new_cidr>`` for addresses
            in CIDR notation, and as a dotted netmask for stanzas that use a
            separate ``netmask`` line.
        new_gateway: New gateway; only applied together with ``old_gateway``.
        old_gateway: Gateway address to replace.

    Returns:
        The rewritten file text.

    Raises:
        ValueError: If a ``netmask`` line has to be rewritten and ``new_cidr``
            is not a valid IPv4 prefix length.
    """
    # The two optional annotations are quoted so the definition also loads on
    # Python 3.9, where ``str | None`` cannot be evaluated at runtime.
    import ipaddress

    old_ip_re = re.escape(old_ip)
    stanza_start = re.compile(r"\s*iface\s")
    plain_address = re.compile(rf"^\s*address\s+{old_ip_re}\s*$")
    netmask_line = re.compile(r"^(\s*netmask\s+)\S+")

    # Pass 1: find netmask lines that belong to a stanza whose address is
    # written WITHOUT a prefix length. Those would otherwise keep the old mask
    # and put the new address into the wrong network.
    lines = content.split("\n")
    stale_netmask_rows = []
    stanza_netmask_rows = []
    stanza_uses_plain_address = False
    # The trailing sentinel closes the last stanza of the file.
    for row, line in enumerate(lines + ["iface "]):
        if stanza_start.match(line):
            if stanza_uses_plain_address:
                stale_netmask_rows.extend(stanza_netmask_rows)
            stanza_netmask_rows = []
            stanza_uses_plain_address = False
        elif plain_address.match(line):
            stanza_uses_plain_address = True
        elif netmask_line.match(line):
            stanza_netmask_rows.append(row)

    if stale_netmask_rows:
        new_mask = str(ipaddress.ip_network(f"0.0.0.0/{new_cidr}").netmask)
        for row in stale_netmask_rows:
            lines[row] = netmask_line.sub(
                lambda match: match.group(1) + new_mask, lines[row], count=1
            )
    new_content = "\n".join(lines)

    # Pass 2: replace the address itself (with and without CIDR).
    # address 192.168.0.101/24 -> address 172.0.2.101/16
    new_content = re.sub(
        rf"(address\s+){old_ip_re}/\d+",
        lambda match: f"{match.group(1)}{new_ip}/{new_cidr}",
        new_content,
    )
    # address 192.168.0.101 (without CIDR); the lookahead also accepts the
    # end of the file, so a missing final newline does not hide the line.
    new_content = re.sub(
        rf"(address\s+){old_ip_re}(?=\s|$)",
        lambda match: f"{match.group(1)}{new_ip}",
        new_content,
    )

    # Replace the gateway as a whole token, with any whitespace after the
    # keyword, so "192.168.0.1" never rewrites the start of "192.168.0.10".
    if new_gateway and old_gateway:
        new_content = re.sub(
            rf"(gateway\s+){re.escape(old_gateway)}(?=\s|$)",
            lambda match: match.group(1) + new_gateway,
            new_content,
        )

    return new_content
def cidr_to_netmask(cidr: int) -> str:
    """Convert an IPv4 prefix length to a dotted-decimal netmask.

    Args:
        cidr: Prefix length, 0 to 32 inclusive.

    Returns:
        The netmask, e.g. ``"255.255.255.0"`` for ``24``.

    Raises:
        ValueError: If ``cidr`` is outside 0..32. Without this check a
            negative value would silently produce ``"0.0.0.0"`` and a value
            above 32 would fail with an unrelated shift error.
    """
    if not 0 <= cidr <= 32:
        raise ValueError(f"CIDR prefix length must be between 0 and 32, got {cidr}")
    # Set the top `cidr` bits of a 32-bit word, then emit it octet by octet.
    bits = (0xFFFFFFFF << (32 - cidr)) & 0xFFFFFFFF
    return ".".join(str((bits >> shift) & 0xFF) for shift in (24, 16, 8, 0))


def netmask_to_cidr(netmask: str) -> int:
    """Convert a dotted-decimal netmask to a prefix length.

    The result is the number of set bits across all dot-separated parts; the
    mask is not checked for contiguity or for having exactly four octets.

    Args:
        netmask: Netmask such as ``"255.255.255.0"``.

    Returns:
        The number of one-bits in the mask, e.g. ``24``.

    Raises:
        ValueError: If a part of ``netmask`` is not an integer.
    """
    return sum(bin(int(octet)).count("1") for octet in netmask.split("."))
Corosync config nicht gefunden: {content}") + return None + + config = parse_corosync_conf(content) + print(f" Cluster: {config.cluster_name}") + print(f" Transport: {config.transport}") + print(f" Config Version: {config.config_version}") + print(f" Nodes gefunden: {len(config.nodes)}") + for node in config.nodes: + print(f" - {node.name} (ID: {node.nodeid}) -> {node.ring0_addr}") + return config + + def discover_ceph(self) -> CephConfig | None: + """Read and parse ceph.conf.""" + ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf") + if not ok: + ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf") + if not ok: + print(" [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)") + return None + + config = parse_ceph_conf(content) + print(f" FSID: {config.fsid}") + print(f" Public Network: {config.public_network}") + print(f" Cluster Network: {config.cluster_network}") + if config.mon_hosts: + print(f" MON Hosts: {', '.join(config.mon_hosts)}") + if config.mon_sections: + print(f" MON Sections: {', '.join(config.mon_sections.keys())}") + return config + + def discover_nodes(self, corosync: CorosyncConfig) -> list[NodeInfo]: + """Build node list from corosync config and check reachability.""" + nodes = [] + for cs_node in corosync.nodes: + is_local = (cs_node.name == self.local_hostname) + node = NodeInfo( + name=cs_node.name, + current_ip=cs_node.ring0_addr, + ssh_host=cs_node.ring0_addr, + is_local=is_local, + ) + + # Check reachability + if is_local: + node.is_reachable = True + else: + node.is_reachable = self.ssh.is_reachable(cs_node.ring0_addr) + + # Try to reach by hostname if IP doesn't work + if not node.is_reachable and not is_local: + if self.ssh.is_reachable(cs_node.name): + node.is_reachable = True + node.ssh_host = cs_node.name + + if node.is_reachable: + self._read_node_configs(node) + + status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR" + local_tag = " (lokal)" if is_local else "" + print(f" {node.name}: 
{node.current_ip} - {status}{local_tag}") + + nodes.append(node) + + return nodes + + def discover_nodes_with_overrides(self, corosync: CorosyncConfig, + override_nodes: list[NodeInfo]) -> list[NodeInfo]: + """Re-discover nodes using override SSH hosts (e.g. rescue IPs). + + Takes pre-configured nodes (with rescue IPs as ssh_host) and + reads their configs. + """ + print("\n[Nodes - via Rescue-Netzwerk]") + for node in override_nodes: + if node.is_reachable: + self._read_node_configs(node) + + status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR" + local_tag = " (lokal)" if node.is_local else "" + via = f" via {node.ssh_host}" if not node.is_local else "" + print(f" {node.name}: {node.current_ip}{via} - {status}{local_tag}") + + return override_nodes + + def _read_node_configs(self, node: NodeInfo): + """Read network interfaces and hosts from a node.""" + # Read /etc/network/interfaces + ok, content = self.ssh.read_node_file( + node.ssh_host, "/etc/network/interfaces", node.is_local + ) + if ok: + node.network_interfaces_content = content + node.interfaces = parse_network_interfaces(content) + + # Read /etc/hosts + ok, content = self.ssh.read_node_file( + node.ssh_host, "/etc/hosts", node.is_local + ) + if ok: + node.hosts_content = content + + def check_quorum(self) -> bool: + """Check if the cluster currently has quorum.""" + rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null") + if rc != 0: + print(" [!] 
pvecm status fehlgeschlagen - kein Quorum oder kein Cluster") + return False + + if "Quorate: Yes" in stdout or "Activity blocked" not in stdout: + # Also check if /etc/pve is writable + rc2, _, _ = self.ssh.execute_local( + "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test" + ) + if rc2 == 0: + print(" Quorum: JA (/etc/pve ist beschreibbar)") + return True + + print(" Quorum: NEIN (/etc/pve ist read-only!)") + return False + + def check_ceph_health(self) -> str | None: + """Get current Ceph health status.""" + rc, stdout, _ = self.ssh.execute_local("ceph health 2>/dev/null") + if rc == 0: + status = stdout.strip() + print(f" Ceph Health: {status}") + return status + return None + + def run(self) -> tuple[CorosyncConfig | None, CephConfig | None, + list[NodeInfo], bool]: + """Run full discovery. + + Returns: (corosync_config, ceph_config, nodes, has_quorum) + """ + print("\n=== Phase 1: Discovery ===\n") + + print("[Corosync]") + corosync = self.discover_corosync() + if not corosync or not corosync.nodes: + print("FEHLER: Konnte keine Corosync-Konfiguration lesen!") + return None, None, [], False + + print("\n[Ceph]") + ceph = self.discover_ceph() + + print("\n[Nodes]") + nodes = self.discover_nodes(corosync) + + print("\n[Cluster Status]") + has_quorum = self.check_quorum() + + if ceph: + print("\n[Ceph Health]") + self.check_ceph_health() + + unreachable = [n for n in nodes if not n.is_reachable] + if unreachable: + print(f"\n[!] 
WARNUNG: {len(unreachable)} Node(s) nicht erreichbar:") + for n in unreachable: + print(f" - {n.name} ({n.current_ip})") + print(" Diese Nodes wurden möglicherweise bereits manuell geändert.") + print(" Das Tool wird versuchen, sie über ihren Hostnamen zu erreichen.") + + return corosync, ceph, nodes, has_quorum diff --git a/main.py b/main.py new file mode 100644 index 0000000..39cec39 --- /dev/null +++ b/main.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +Proxmox Cluster Network Changer + +Migriert ein Proxmox-Cluster (inkl. Ceph) von einem Netzwerk in ein anderes. +Behandelt Corosync, Ceph, /etc/network/interfaces und /etc/hosts. + +Kann auch mit gebrochenem Quorum umgehen (z.B. wenn ein Node bereits +manuell geändert wurde). + +Muss als root auf einem Proxmox-Node ausgeführt werden. + +Verwendung: + python3 main.py # Interaktiver Modus + python3 main.py --dry-run # Nur anzeigen, nichts ändern + python3 main.py --discover # Nur Discovery, keine Migration +""" + +import argparse +import os +import sys + +from ssh_manager import SSHManager +from discovery import Discovery +from planner import Planner +from backup import Backup +from migrator import Migrator +from verifier import Verifier +from rescue import RescueNetwork + + +def check_prerequisites(): + """Check that we're running as root on a Proxmox node.""" + if os.geteuid() != 0: + print("FEHLER: Dieses Tool muss als root ausgeführt werden!") + print("Bitte mit 'sudo python3 main.py' starten.") + sys.exit(1) + + if not os.path.exists("/etc/pve") and not os.path.exists("/etc/corosync"): + print("WARNUNG: Dies scheint kein Proxmox-Node zu sein.") + print(" /etc/pve und /etc/corosync nicht gefunden.") + answer = input("Trotzdem fortfahren? 
[j/N]: ").strip().lower() + if answer not in ('j', 'ja', 'y', 'yes'): + sys.exit(0) + + +def main(): + parser = argparse.ArgumentParser( + description="Proxmox Cluster Network Changer - " + "Migriert Cluster + Ceph in ein neues Netzwerk" + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Nur anzeigen was geändert würde, nichts ändern" + ) + parser.add_argument( + "--discover", action="store_true", + help="Nur Discovery durchführen, keine Migration" + ) + parser.add_argument( + "--ssh-key", type=str, default=None, + help="Pfad zum SSH-Key (Standard: Default SSH-Key)" + ) + parser.add_argument( + "--ssh-port", type=int, default=22, + help="SSH-Port (Standard: 22)" + ) + parser.add_argument( + "--rescue", action="store_true", + help="Rescue-Modus: Emergency-Netzwerk einrichten wenn Nodes " + "sich nicht erreichen können" + ) + parser.add_argument( + "--rescue-commands", type=str, metavar="SUBNET", + help="Nur Rescue-Befehle ausgeben ohne Migration " + "(z.B. --rescue-commands 10.99.99.0/24)" + ) + args = parser.parse_args() + + print("=" * 60) + print(" Proxmox Cluster Network Changer") + print("=" * 60) + + check_prerequisites() + + # Initialize SSH manager + ssh = SSHManager(ssh_key=args.ssh_key, ssh_port=args.ssh_port) + rescue = RescueNetwork(ssh) + + # Quick mode: just print rescue commands and exit + if args.rescue_commands: + discovery = Discovery(ssh) + print("\n[Corosync]") + corosync = discovery.discover_corosync() + if not corosync: + print("\nFEHLER: Konnte Cluster-Konfiguration nicht lesen.") + sys.exit(1) + + bridge_input = input(f"Bridge [{rescue.bridge}]: ").strip() + bridge = bridge_input or rescue.bridge + + commands = rescue.get_rescue_commands(corosync, args.rescue_commands, bridge) + print() + print("=" * 60) + print(" RESCUE BEFEHLE") + print(f" Subnetz: {args.rescue_commands} | Bridge: {bridge}") + print("=" * 60) + print() + for cmd_info in commands: + print(f" {cmd_info['name']} ({cmd_info['current_ip']}):") + print(f" 
{cmd_info['command']}") + print() + print(" Zum Entfernen:") + for cmd_info in commands: + print(f" {cmd_info['remove_command']} # {cmd_info['name']}") + print() + sys.exit(0) + + # Phase 1: Discovery + discovery = Discovery(ssh) + corosync, ceph, nodes, has_quorum = discovery.run() + + if not corosync: + print("\nFEHLER: Konnte Cluster-Konfiguration nicht lesen. Abbruch.") + sys.exit(1) + + # Check if rescue mode is needed + unreachable = [n for n in nodes if not n.is_reachable and not n.is_local] + use_rescue = args.rescue + + if unreachable and not use_rescue: + print(f"\n {len(unreachable)} Node(s) nicht erreichbar.") + answer = input(" Rescue-Netzwerk einrichten? [J/n]: ").strip().lower() + if answer not in ('n', 'nein', 'no'): + use_rescue = True + + if use_rescue: + rescue_nodes = rescue.setup_interactive(corosync) + if not rescue_nodes: + sys.exit(1) + # Re-run discovery with rescue IPs to read configs from all nodes + print("\n [Rescue] Lese Konfigurationen über Rescue-Netzwerk...") + nodes = discovery.discover_nodes_with_overrides( + corosync, rescue_nodes + ) + # Re-check quorum + has_quorum = discovery.check_quorum() + # Re-read ceph + ceph = discovery.discover_ceph() + + if args.discover: + if rescue.active: + rescue.cleanup(nodes) + print("\n--- Discovery abgeschlossen (--discover Modus) ---") + sys.exit(0) + + # Phase 2: Planning + planner = Planner() + plan = planner.plan(nodes, corosync, ceph, has_quorum) + + if not plan: + if rescue.active: + rescue.cleanup(nodes) + sys.exit(0) + + plan.dry_run = args.dry_run + + # Generate all new config files + configs = planner.generate_new_configs(plan) + + # Phase 3: Backup (skip in dry-run) + if not args.dry_run: + backup = Backup(ssh) + if not backup.run(plan): + print("\nBackup fehlgeschlagen! 
Trotzdem fortfahren?") + answer = input("[j/N]: ").strip().lower() + if answer not in ('j', 'ja', 'y', 'yes'): + if rescue.active: + rescue.cleanup(nodes) + sys.exit(1) + else: + print("\n=== Phase 3: Backup (übersprungen im Dry-Run) ===") + + # Phase 4: Migration + migrator = Migrator(ssh) + success = migrator.run(plan, configs, dry_run=args.dry_run) + + if not success: + print("\n[!] Migration hatte Fehler!") + if not args.dry_run: + print(" Prüfe Backups in /root/network-migration-backup-*/") + if rescue.active: + rescue.cleanup(nodes) + sys.exit(1) + + # Cleanup rescue network (before verification, so we verify real connectivity) + if rescue.active and not args.dry_run: + rescue.cleanup(nodes) + + # Phase 5: Verification (skip in dry-run) + if not args.dry_run: + verifier = Verifier(ssh) + verifier.run(plan) + else: + if rescue.active: + rescue.cleanup(nodes) + print("\n=== Phase 5: Verifikation (übersprungen im Dry-Run) ===") + print("\nDry-Run abgeschlossen. Keine Änderungen vorgenommen.") + + +if __name__ == "__main__": + main() diff --git a/migrator.py b/migrator.py new file mode 100644 index 0000000..8cf472e --- /dev/null +++ b/migrator.py @@ -0,0 +1,450 @@ +"""Phase 4: Execute the network migration.""" + +import time +from models import MigrationPlan +from ssh_manager import SSHManager + + +class Migrator: + """Executes the actual network migration across all nodes.""" + + def __init__(self, ssh: SSHManager): + self.ssh = ssh + + def run(self, plan: MigrationPlan, configs: dict, dry_run: bool = False) -> bool: + """Execute the migration. 
+ + Args: + plan: The migration plan + configs: Generated configs from Planner.generate_new_configs() + dry_run: If True, only show what would be done + """ + print("\n=== Phase 4: Migration ===\n") + + if dry_run: + print(" *** DRY RUN - Es werden keine Änderungen vorgenommen ***\n") + + ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + reachable_nodes = [n for n in plan.nodes if n.is_reachable] + + if not reachable_nodes: + print(" FEHLER: Keine Nodes erreichbar!") + return False + + # Step 1: Write new configs to all nodes (but don't activate yet) + print("[1/7] Neue Konfigurationen verteilen...") + if not self._distribute_configs(plan, configs, dry_run): + return False + + # Step 2: Stop Corosync on all nodes + print("\n[2/7] Corosync stoppen auf allen Nodes...") + if not self._stop_corosync(reachable_nodes, dry_run): + return False + + # Step 3: Stop pve-cluster (pmxcfs) to release corosync.conf + print("\n[3/7] pve-cluster stoppen...") + if not self._stop_pve_cluster(reachable_nodes, dry_run): + return False + + # Step 4: Write corosync config directly + print("\n[4/7] Corosync-Konfiguration aktualisieren...") + if not self._update_corosync(reachable_nodes, configs, dry_run): + return False + + # Step 5: Update /etc/hosts on all nodes + print("\n[5/7] /etc/hosts aktualisieren...") + if not self._update_hosts(plan, configs, dry_run): + return False + + # Step 6: Update network interfaces and restart networking + print("\n[6/7] Netzwerk-Interfaces aktualisieren und Netzwerk neu starten...") + if not self._update_network(plan, configs, dry_run): + return False + + # Step 7: Start services back up + print("\n[7/7] Services starten...") + if not self._start_services(plan, configs, dry_run): + return False + + return True + + def _distribute_configs(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Write prepared configs as staged files (not yet active).""" + for node in plan.nodes: + if not node.is_reachable or 
node.name not in configs['nodes']: + continue + + node_configs = configs['nodes'][node.name] + staging_dir = "/root/.network-migration-staged" + + if dry_run: + print(f" [{node.name}] Würde Configs nach {staging_dir}/ schreiben") + continue + + # Create staging directory + self.ssh.run_on_node( + node.ssh_host, f"mkdir -p {staging_dir}", node.is_local + ) + + # Stage network interfaces + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/interfaces", + node_configs['interfaces'], + node.is_local, + ) + if ok: + print(f" [{node.name}] interfaces staged") + else: + print(f" [{node.name}] FEHLER interfaces: {msg}") + return False + + # Stage hosts + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/hosts", + node_configs['hosts'], + node.is_local, + ) + if ok: + print(f" [{node.name}] hosts staged") + else: + print(f" [{node.name}] FEHLER hosts: {msg}") + return False + + # Stage corosync config + if configs['corosync']: + for node in plan.nodes: + if not node.is_reachable: + continue + staging_dir = "/root/.network-migration-staged" + if dry_run: + print(f" [{node.name}] Würde corosync.conf stagen") + continue + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/corosync.conf", + configs['corosync'], + node.is_local, + ) + if ok: + print(f" [{node.name}] corosync.conf staged") + else: + print(f" [{node.name}] FEHLER corosync.conf: {msg}") + return False + + # Stage ceph config + if configs['ceph']: + for node in plan.nodes: + if not node.is_reachable: + continue + staging_dir = "/root/.network-migration-staged" + if dry_run: + print(f" [{node.name}] Würde ceph.conf stagen") + continue + ok, msg = self.ssh.write_node_file( + node.ssh_host, + f"{staging_dir}/ceph.conf", + configs['ceph'], + node.is_local, + ) + if ok: + print(f" [{node.name}] ceph.conf staged") + else: + print(f" [{node.name}] FEHLER ceph.conf: {msg}") + return False + + return True + + def _stop_corosync(self, nodes: list, dry_run: bool) -> 
bool: + """Stop corosync on all nodes.""" + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde corosync stoppen") + continue + rc, _, err = self.ssh.run_on_node( + node.ssh_host, "systemctl stop corosync", node.is_local + ) + if rc == 0: + print(f" [{node.name}] corosync gestoppt") + else: + print(f" [{node.name}] WARNUNG beim Stoppen: {err}") + return True + + def _stop_pve_cluster(self, nodes: list, dry_run: bool) -> bool: + """Stop pve-cluster service to unmount /etc/pve.""" + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde pve-cluster stoppen") + continue + rc, _, err = self.ssh.run_on_node( + node.ssh_host, "systemctl stop pve-cluster", node.is_local + ) + if rc == 0: + print(f" [{node.name}] pve-cluster gestoppt") + else: + print(f" [{node.name}] WARNUNG: {err}") + return True + + def _update_corosync(self, nodes: list, configs: dict, + dry_run: bool) -> bool: + """Write new corosync.conf directly to /etc/corosync/.""" + if not configs['corosync']: + print(" Keine Corosync-Änderungen") + return True + + for node in nodes: + if dry_run: + print(f" [{node.name}] Würde /etc/corosync/corosync.conf schreiben") + continue + + staging = "/root/.network-migration-staged/corosync.conf" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/corosync/corosync.conf", + node.is_local, + ) + if rc == 0: + print(f" [{node.name}] corosync.conf aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + return True + + def _update_hosts(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Update /etc/hosts on all nodes.""" + for node in plan.nodes: + if not node.is_reachable or node.name not in configs['nodes']: + continue + + if dry_run: + print(f" [{node.name}] Würde /etc/hosts aktualisieren") + continue + + staging = "/root/.network-migration-staged/hosts" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/hosts", + node.is_local, + ) + if rc == 0: + 
print(f" [{node.name}] /etc/hosts aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + return True + + def _update_network(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Update /etc/network/interfaces and restart networking.""" + for node in plan.nodes: + if not node.is_reachable or node.name not in configs['nodes']: + continue + + if dry_run: + print(f" [{node.name}] Würde /etc/network/interfaces aktualisieren") + print(f" [{node.name}] Würde 'ifreload -a' ausführen") + continue + + staging = "/root/.network-migration-staged/interfaces" + rc, _, err = self.ssh.run_on_node( + node.ssh_host, + f"cp {staging} /etc/network/interfaces", + node.is_local, + ) + if rc == 0: + print(f" [{node.name}] /etc/network/interfaces aktualisiert") + else: + print(f" [{node.name}] FEHLER: {err}") + return False + + # Reload network - use ifreload if available, otherwise ifdown/ifup + rc, _, _ = self.ssh.run_on_node( + node.ssh_host, "which ifreload", node.is_local + ) + if rc == 0: + reload_cmd = "ifreload -a" + else: + reload_cmd = f"ifdown {plan.bridge_name} && ifup {plan.bridge_name}" + + print(f" [{node.name}] Netzwerk wird neu geladen ({reload_cmd})...") + rc, _, err = self.ssh.run_on_node( + node.ssh_host, reload_cmd, node.is_local, timeout=60 + ) + if rc == 0: + print(f" [{node.name}] Netzwerk neu geladen") + else: + print(f" [{node.name}] WARNUNG beim Netzwerk-Reload: {err}") + # Don't fail here - the node might just be unreachable on old IP now + + return True + + def _start_services(self, plan: MigrationPlan, configs: dict, + dry_run: bool) -> bool: + """Start pve-cluster and corosync, then handle Ceph.""" + # Now we need to reach nodes on their NEW IPs + for node in plan.nodes: + if not node.is_reachable: + continue + + new_host = node.new_ip if not node.is_local else node.ssh_host + is_local = node.is_local + + # Start pve-cluster + if dry_run: + print(f" [{node.name}] Würde pve-cluster starten") + print(f" 
[{node.name}] Würde corosync starten") + continue + + print(f" [{node.name}] Starte pve-cluster...") + rc, _, err = self.ssh.run_on_node( + new_host, "systemctl start pve-cluster", is_local, timeout=30 + ) + if rc == 0: + print(f" [{node.name}] pve-cluster gestartet") + else: + print(f" [{node.name}] WARNUNG pve-cluster: {err}") + + print(f" [{node.name}] Starte corosync...") + rc, _, err = self.ssh.run_on_node( + new_host, "systemctl start corosync", is_local, timeout=30 + ) + if rc == 0: + print(f" [{node.name}] corosync gestartet") + else: + print(f" [{node.name}] WARNUNG corosync: {err}") + + if dry_run: + print("\n Würde auf Quorum warten...") + return True + + # Wait for quorum + print("\n Warte auf Quorum...") + if not self._wait_for_quorum(timeout=60): + print(" [!] Quorum nicht erreicht! Versuche 'pvecm expected 1'...") + rc, _, _ = self.ssh.execute_local("pvecm expected 1") + if rc == 0: + print(" Quorum erzwungen mit 'pvecm expected 1'") + time.sleep(5) + else: + print(" [!] Konnte Quorum nicht erzwingen!") + + # Update Ceph config via cluster FS if possible + if configs.get('ceph'): + self._update_ceph(plan, configs) + + # Cleanup staging directories + print("\n Staging-Verzeichnisse aufräumen...") + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + self.ssh.run_on_node( + new_host, + "rm -rf /root/.network-migration-staged", + node.is_local, + ) + + return True + + def _wait_for_quorum(self, timeout: int = 60) -> bool: + """Wait for cluster quorum to be established.""" + start = time.time() + while time.time() - start < timeout: + rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null") + if rc == 0 and "Quorate: Yes" in stdout: + print(" Quorum erreicht!") + return True + print(" ... 
warte auf Quorum ...") + time.sleep(5) + return False + + def _update_ceph(self, plan: MigrationPlan, configs: dict): + """Update Ceph configuration after quorum is available.""" + print("\n [Ceph] Konfiguration aktualisieren...") + + # Try to write via /etc/pve/ceph.conf first + rc, _, _ = self.ssh.execute_local( + "touch /etc/pve/.ceph_test && rm -f /etc/pve/.ceph_test" + ) + if rc == 0: + # /etc/pve is writable - use cluster filesystem + ok, msg = self.ssh.write_local_file("/etc/pve/ceph.conf", configs['ceph']) + if ok: + print(" [Ceph] /etc/pve/ceph.conf aktualisiert (via Cluster-FS)") + else: + print(f" [Ceph] FEHLER /etc/pve/ceph.conf: {msg}") + self._update_ceph_direct(plan, configs) + else: + # /etc/pve not writable - write directly on each node + print(" [Ceph] /etc/pve nicht beschreibbar, schreibe direkt...") + self._update_ceph_direct(plan, configs) + + # Restart Ceph services + print(" [Ceph] Services neu starten...") + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + + # Restart MON + self.ssh.run_on_node( + new_host, + f"systemctl restart ceph-mon@{node.name} 2>/dev/null", + node.is_local, timeout=30, + ) + # Restart MGR + self.ssh.run_on_node( + new_host, + f"systemctl restart ceph-mgr@{node.name} 2>/dev/null", + node.is_local, timeout=30, + ) + # Restart all OSDs on this node + self.ssh.run_on_node( + new_host, + "systemctl restart ceph-osd.target 2>/dev/null", + node.is_local, timeout=60, + ) + print(f" [{node.name}] Ceph-Services neu gestartet") + + def _update_ceph_direct(self, plan: MigrationPlan, configs: dict): + """Write ceph.conf directly on each node (fallback when no quorum).""" + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + + ok, msg = self.ssh.write_node_file( + new_host, "/etc/ceph/ceph.conf", + configs['ceph'], node.is_local, + ) + if ok: + print(f" [{node.name}] 
/etc/ceph/ceph.conf direkt geschrieben") + else: + print(f" [{node.name}] FEHLER /etc/ceph/ceph.conf: {msg}") + + def _update_ceph_mon_map(self, plan: MigrationPlan): + """Update Ceph MON map with new addresses. + + This is needed when MON IPs change. + """ + ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + new_ip = node.new_ip + + # Extract monmap, modify, and reinject + cmds = [ + f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap", + # Remove old entries and add new ones + ] + # This is complex - for now we rely on the ceph.conf update + # and let Ceph handle the MON map update on restart + print(f" [{node.name}] MON-Map wird beim Neustart aktualisiert") diff --git a/models.py b/models.py new file mode 100644 index 0000000..6de6d3e --- /dev/null +++ b/models.py @@ -0,0 +1,76 @@ +"""Data models for the Proxmox Cluster Network Changer.""" + +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class NetworkInterface: + """Represents a network interface configuration.""" + name: str # e.g. vmbr0 + address: str # e.g. 192.168.0.101 + netmask: str # e.g. 255.255.255.0 + cidr: int # e.g. 24 + gateway: Optional[str] = None + bridge_ports: Optional[str] = None + raw_config: str = "" + + +@dataclass +class NodeInfo: + """Represents a single Proxmox node.""" + name: str # e.g. 
pve1 + current_ip: str # current IP address + new_ip: Optional[str] = None # planned new IP + ssh_host: Optional[str] = None # how to reach it (IP or hostname) + is_local: bool = False # is this the node we're running on + is_reachable: bool = False + interfaces: list[NetworkInterface] = field(default_factory=list) + hosts_content: str = "" + network_interfaces_content: str = "" + + +@dataclass +class CorosyncNode: + """A node entry in corosync.conf.""" + nodeid: int + name: str + ring0_addr: str + ring1_addr: Optional[str] = None + + +@dataclass +class CorosyncConfig: + """Parsed corosync configuration.""" + nodes: list[CorosyncNode] = field(default_factory=list) + config_version: int = 1 + cluster_name: str = "" + transport: str = "knet" + raw_content: str = "" + + +@dataclass +class CephConfig: + """Parsed Ceph configuration.""" + fsid: str = "" + public_network: str = "" # e.g. 192.168.0.0/24 + cluster_network: str = "" # e.g. 192.168.0.0/24 + mon_hosts: list[str] = field(default_factory=list) + mon_sections: dict[str, dict[str, str]] = field(default_factory=dict) # [mon.pve1] -> {key: val} + raw_content: str = "" + + +@dataclass +class MigrationPlan: + """Complete migration plan with old -> new mappings.""" + nodes: list[NodeInfo] = field(default_factory=list) + old_network: str = "" # e.g. 192.168.0.0/24 + new_network: str = "" # e.g. 
172.0.2.0/16 + new_gateway: Optional[str] = None + ceph_new_public_network: str = "" + ceph_new_cluster_network: str = "" + corosync_config: Optional[CorosyncConfig] = None + ceph_config: Optional[CephConfig] = None + dry_run: bool = False + quorum_available: bool = True + bridge_name: str = "vmbr0" # which bridge to modify diff --git a/planner.py b/planner.py new file mode 100644 index 0000000..e87ef10 --- /dev/null +++ b/planner.py @@ -0,0 +1,236 @@ +"""Phase 2: Plan the migration - IP mapping and config generation.""" + +import ipaddress +from models import NodeInfo, CorosyncConfig, CephConfig, MigrationPlan +from config_parser import ( + generate_corosync_conf, generate_ceph_conf, + generate_network_interfaces, generate_hosts, +) + + +class Planner: + """Plans the network migration with user input.""" + + def plan(self, nodes: list[NodeInfo], corosync: CorosyncConfig, + ceph: CephConfig | None, has_quorum: bool) -> MigrationPlan | None: + """Interactive planning with the user.""" + plan = MigrationPlan( + nodes=nodes, + corosync_config=corosync, + ceph_config=ceph, + quorum_available=has_quorum, + ) + + print("\n=== Phase 2: Migration planen ===\n") + + # Get new network + plan.new_network = self._ask_new_network() + if not plan.new_network: + return None + + new_net = ipaddress.ip_network(plan.new_network, strict=False) + plan.new_gateway = self._ask_gateway(new_net) + + # Detect old network from first node + if nodes: + old_ip = ipaddress.ip_address(nodes[0].current_ip) + for iface in nodes[0].interfaces: + if iface.address == str(old_ip): + plan.old_network = f"{ipaddress.ip_network(f'{iface.address}/{iface.cidr}', strict=False)}" + plan.bridge_name = iface.name + break + + # Generate IP mapping suggestions + print("\n[IP-Mapping]") + print("Für jeden Node wird eine neue IP benötigt.\n") + + for node in nodes: + suggested_ip = self._suggest_new_ip(node.current_ip, plan.new_network) + print(f" {node.name}: {node.current_ip} -> ", end="") + + user_input = 
input(f"[{suggested_ip}]: ").strip() + if user_input: + node.new_ip = user_input + else: + node.new_ip = suggested_ip + + print(f" => {node.new_ip}") + + # Ceph network planning + if ceph: + print("\n[Ceph Netzwerke]") + print(f" Aktuelles Public Network: {ceph.public_network}") + print(f" Aktuelles Cluster Network: {ceph.cluster_network}") + + default_ceph_net = plan.new_network + user_input = input( + f"\n Neues Ceph Public Network [{default_ceph_net}]: " + ).strip() + plan.ceph_new_public_network = user_input or default_ceph_net + + user_input = input( + f" Neues Ceph Cluster Network [{plan.ceph_new_public_network}]: " + ).strip() + plan.ceph_new_cluster_network = user_input or plan.ceph_new_public_network + + # Which bridge to modify + print(f"\n[Bridge]") + user_input = input( + f" Welche Bridge soll geändert werden? [{plan.bridge_name}]: " + ).strip() + if user_input: + plan.bridge_name = user_input + + # Show preview + self._show_preview(plan) + + # Confirm + confirm = input("\nMigration durchführen? [j/N]: ").strip().lower() + if confirm not in ('j', 'ja', 'y', 'yes'): + print("Abgebrochen.") + return None + + return plan + + def _ask_new_network(self) -> str | None: + """Ask for the new network.""" + while True: + network = input("Neues Netzwerk (z.B. 
172.0.2.0/16): ").strip() + if not network: + print("Abgebrochen.") + return None + try: + ipaddress.ip_network(network, strict=False) + return network + except ValueError as e: + print(f" Ungültiges Netzwerk: {e}") + + def _ask_gateway(self, network: ipaddress.IPv4Network) -> str: + """Ask for the gateway in the new network.""" + # Suggest first usable IP as gateway + suggested = str(list(network.hosts())[0]) + user_input = input(f"Neues Gateway [{suggested}]: ").strip() + return user_input or suggested + + def _suggest_new_ip(self, old_ip: str, new_network: str) -> str: + """Suggest a new IP by keeping the host part from the old IP.""" + old = ipaddress.ip_address(old_ip) + new_net = ipaddress.ip_network(new_network, strict=False) + + # Keep the last octet(s) from the old IP + old_host = int(old) & 0xFF # last octet + if new_net.prefixlen <= 16: + # For /16 or bigger, keep last two octets + old_host = int(old) & 0xFFFF + + new_ip = ipaddress.ip_address(int(new_net.network_address) | old_host) + return str(new_ip) + + def _show_preview(self, plan: MigrationPlan): + """Show a preview of all planned changes.""" + print("\n" + "=" * 60) + print(" MIGRATION PREVIEW") + print("=" * 60) + + ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + + print(f"\n Netzwerk: {plan.old_network} -> {plan.new_network}") + print(f" Gateway: {plan.new_gateway}") + print(f" Bridge: {plan.bridge_name}") + print(f" Quorum verfügbar: {'Ja' if plan.quorum_available else 'NEIN'}") + + print("\n [Node IP-Mapping]") + for node in plan.nodes: + status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR" + print(f" {node.name}: {node.current_ip} -> {node.new_ip} ({status})") + + if plan.ceph_config: + print("\n [Ceph Netzwerke]") + print(f" Public: {plan.ceph_config.public_network} -> {plan.ceph_new_public_network}") + print(f" Cluster: {plan.ceph_config.cluster_network} -> {plan.ceph_new_cluster_network}") + if plan.ceph_config.mon_hosts: + print(f" MON Hosts: {', 
'.join(plan.ceph_config.mon_hosts)}") + new_mons = [ip_mapping.get(h, h) for h in plan.ceph_config.mon_hosts] + print(f" -> {', '.join(new_mons)}") + + print("\n [Dateien die geändert werden]") + print(" - /etc/network/interfaces (auf jedem Node)") + print(" - /etc/hosts (auf jedem Node)") + print(" - /etc/corosync/corosync.conf (auf jedem Node)") + if plan.ceph_config: + if plan.quorum_available: + print(" - /etc/pve/ceph.conf (über Cluster-FS)") + else: + print(" - /etc/ceph/ceph.conf (direkt, da kein Quorum)") + + if not plan.quorum_available: + print("\n [!] WARNUNG: Kein Quorum verfügbar!") + print(" Es wird 'pvecm expected 1' verwendet um Quorum zu erzwingen.") + print(" Ceph-Config wird direkt auf jedem Node geschrieben.") + + print("\n" + "=" * 60) + + def generate_new_configs(self, plan: MigrationPlan) -> dict: + """Generate all new configuration file contents. + + Returns dict with: + 'corosync': new corosync.conf content + 'ceph': new ceph.conf content (or None) + 'nodes': {node_name: {'interfaces': content, 'hosts': content}} + """ + ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + + configs = { + 'corosync': None, + 'ceph': None, + 'nodes': {}, + } + + # Generate new corosync.conf + if plan.corosync_config: + configs['corosync'] = generate_corosync_conf( + plan.corosync_config, ip_mapping + ) + + # Generate new ceph.conf + if plan.ceph_config: + configs['ceph'] = generate_ceph_conf( + plan.ceph_config, ip_mapping, + plan.ceph_new_public_network, + plan.ceph_new_cluster_network, + ) + + # Generate per-node configs + new_cidr = ipaddress.ip_network(plan.new_network, strict=False).prefixlen + + # Detect old gateway from first reachable node + old_gateway = None + for node in plan.nodes: + for iface in node.interfaces: + if iface.name == plan.bridge_name and iface.gateway: + old_gateway = iface.gateway + break + if old_gateway: + break + + for node in plan.nodes: + if not node.new_ip or not node.network_interfaces_content: + continue 
+ + node_configs = {} + + # Network interfaces + node_configs['interfaces'] = generate_network_interfaces( + node.network_interfaces_content, + node.current_ip, node.new_ip, + new_cidr, plan.new_gateway, old_gateway, + ) + + # /etc/hosts + node_configs['hosts'] = generate_hosts( + node.hosts_content, ip_mapping + ) + + configs['nodes'][node.name] = node_configs + + return configs diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dbdcc7c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +# Proxmox Cluster Network Changer +# Keine externen Dependencies - nutzt nur Python stdlib + system ssh diff --git a/rescue.py b/rescue.py new file mode 100644 index 0000000..e87d2b3 --- /dev/null +++ b/rescue.py @@ -0,0 +1,228 @@ +"""Emergency/Rescue Network - Temporäres Netzwerk zur SSH-Kommunikation. + +Wenn Nodes in verschiedenen Subnetzen sind und sich nicht mehr erreichen +können, wird ein temporäres Emergency-Netzwerk aufgebaut: +- Jeder Node bekommt eine zusätzliche IP auf der Bridge (z.B. vmbr0) +- Über dieses Netz kann das Tool dann per SSH arbeiten +- Nach der Migration werden die Emergency-IPs wieder entfernt +""" + +import ipaddress +import time +from models import NodeInfo, CorosyncConfig +from config_parser import parse_corosync_conf +from ssh_manager import SSHManager + + +class RescueNetwork: + """Manages an emergency network for broken clusters.""" + + def __init__(self, ssh: SSHManager): + self.ssh = ssh + self.rescue_subnet: str = "" + self.rescue_ips: dict[str, str] = {} # node_name -> rescue_ip + self.bridge: str = "vmbr0" + self.active: bool = False + + def setup_interactive(self, corosync: CorosyncConfig) -> list[NodeInfo] | None: + """Interactively set up the rescue network. + + Returns updated node list with rescue IPs as ssh_host, or None on abort. 
+ """ + print("\n" + "=" * 60) + print(" RESCUE NETZWERK") + print("=" * 60) + print() + print(" Dieses Feature richtet ein temporäres Netzwerk ein,") + print(" damit alle Nodes sich wieder per SSH erreichen können.") + print() + print(" Ablauf:") + print(" 1. Du gibst ein freies Subnetz an (z.B. 10.99.99.0/24)") + print(" 2. Das Tool zeigt dir für jeden Node den Befehl an") + print(" 3. Du führst die Befehle manuell auf jedem Node aus") + print(" (z.B. über IPMI/iLO/iDRAC/KVM-Konsole)") + print(" 4. Danach kann das Tool alle Nodes per SSH erreichen") + print() + + # Ask for bridge + user_input = input(f" Bridge für Emergency-IPs [{self.bridge}]: ").strip() + if user_input: + self.bridge = user_input + + # Ask for rescue subnet + while True: + subnet_input = input(" Emergency Subnetz (z.B. 10.99.99.0/24): ").strip() + if not subnet_input: + print(" Abgebrochen.") + return None + try: + subnet = ipaddress.ip_network(subnet_input, strict=False) + self.rescue_subnet = str(subnet) + break + except ValueError as e: + print(f" Ungültiges Subnetz: {e}") + + # Generate IPs for all nodes + hosts = list(subnet.hosts()) + print() + print(" " + "-" * 56) + print(f" Emergency Subnetz: {self.rescue_subnet}") + print(f" Bridge: {self.bridge}") + print(" " + "-" * 56) + print() + + nodes = [] + for i, cs_node in enumerate(corosync.nodes): + if i >= len(hosts): + print(f" [!] 
FEHLER: Nicht genug IPs im Subnetz für alle Nodes!") + return None + + rescue_ip = str(hosts[i]) + self.rescue_ips[cs_node.name] = rescue_ip + cidr = subnet.prefixlen + + node = NodeInfo( + name=cs_node.name, + current_ip=cs_node.ring0_addr, + ssh_host=rescue_ip, # Use rescue IP for SSH + ) + nodes.append(node) + + # Show command for this node + cmd = f"ip addr add {rescue_ip}/{cidr} dev {self.bridge}" + print(f" {cs_node.name} ({cs_node.ring0_addr}):") + print(f" Rescue-IP: {rescue_ip}/{cidr}") + print(f" Befehl: {cmd}") + print() + + # Apply locally + print(" " + "-" * 56) + print() + + # Find local node + import socket + local_hostname = socket.gethostname() + local_node = None + for node in nodes: + if node.name == local_hostname: + local_node = node + node.is_local = True + break + + if local_node and local_node.name in self.rescue_ips: + rescue_ip = self.rescue_ips[local_node.name] + cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen + print(f" Lokaler Node erkannt: {local_node.name}") + answer = input( + f" Emergency-IP {rescue_ip}/{cidr} auf {self.bridge} " + f"automatisch setzen? 
[J/n]: " + ).strip().lower() + + if answer not in ('n', 'nein', 'no'): + rc, _, err = self.ssh.execute_local( + f"ip addr add {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null; echo ok" + ) + if rc == 0: + print(f" -> {rescue_ip}/{cidr} auf {self.bridge} gesetzt") + local_node.is_reachable = True + else: + print(f" -> WARNUNG: {err}") + local_node.is_reachable = True # It's local, still reachable + else: + local_node.is_reachable = True + + # Wait for user to configure other nodes + print() + print(" " + "=" * 56) + print(" Bitte führe jetzt die oben genannten Befehle auf den") + print(" anderen Nodes aus (IPMI/iLO/iDRAC/KVM-Konsole).") + print(" " + "=" * 56) + print() + input(" Drücke ENTER wenn alle Nodes konfiguriert sind...") + + # Test connectivity + print() + print(" [Verbindungstest]") + all_ok = True + for node in nodes: + if node.is_local: + print(f" {node.name}: OK (lokal)") + continue + + rescue_ip = self.rescue_ips[node.name] + reachable = self.ssh.is_reachable(rescue_ip) + if reachable: + print(f" {node.name} ({rescue_ip}): OK") + node.is_reachable = True + else: + print(f" {node.name} ({rescue_ip}): NICHT ERREICHBAR") + all_ok = False + + if not all_ok: + print() + print(" [!] Nicht alle Nodes erreichbar!") + answer = input(" Trotzdem fortfahren? [j/N]: ").strip().lower() + if answer not in ('j', 'ja', 'y', 'yes'): + self.cleanup(nodes) + return None + + self.active = True + print() + print(" Rescue-Netzwerk aktiv. 
def cleanup(self, nodes: list[NodeInfo]):
    """Remove the emergency IPs again and report the outcome per node.

    Does nothing when the rescue network was never set up. The local node
    is cleaned directly. A remote node is contacted on its new IP first
    (the state after a successful migration) and, if no SSH connection can
    be made there, on its rescue IP (the state after an aborted run or a
    dry run). Nodes that are not reachable are skipped and reported as such
    instead of being counted as cleaned.

    Args:
        nodes: Nodes as returned by ``setup_interactive``; only nodes that
            have an entry in ``self.rescue_ips`` are touched.
    """
    if not self.active and not self.rescue_ips:
        return

    print("\n [Rescue] Emergency-IPs entfernen...")
    cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen

    for node in nodes:
        rescue_ip = self.rescue_ips.get(node.name)
        if rescue_ip is None:
            continue

        cmd = f"ip addr del {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null"

        if node.is_local:
            rc, _, _ = self.ssh.execute_local(cmd)
        elif node.is_reachable:
            rc = -1
            for host in (node.new_ip, rescue_ip):
                if not host:
                    continue
                rc, _, _ = self.ssh.execute(host, cmd)
                # 255 is ssh's own error status and -1 marks a timeout or
                # spawn failure in SSHManager.execute: only then is it worth
                # trying the next address. Any other status came from the
                # remote command itself.
                if rc not in (-1, 255):
                    break
            # NOTE: deleting the address the SSH session itself runs over
            # can cut the connection before the exit status arrives, so a
            # "FEHLER" on the rescue-IP path should be double-checked.
        else:
            print(f" {node.name}: {rescue_ip}/{cidr} übersprungen (nicht erreichbar)")
            continue

        status = "entfernt" if rc == 0 else "FEHLER"
        print(f" {node.name}: {rescue_ip}/{cidr} {status}")

    self.active = False
    print(" [Rescue] Emergency-IPs entfernt.")
class SSHManager:
    """Run commands and transfer files on Proxmox nodes via the system ssh.

    Every operation exists in a remote variant (executed through ``ssh``),
    a local variant, and a ``*_node*`` dispatcher that picks one of the two
    based on an ``is_local`` flag. Failures are reported through the return
    values, not through exceptions.
    """

    def __init__(self, ssh_user: str = "root", ssh_key: Optional[str] = None,
                 ssh_port: int = 22):
        """Store the connection settings used for every ssh invocation.

        Args:
            ssh_user: Remote login user.
            ssh_key: Path to a private key passed via ``-i``, or None to
                let ssh pick its default identity.
            ssh_port: Remote SSH port.
        """
        self.ssh_user = ssh_user
        self.ssh_key = ssh_key
        self.ssh_port = ssh_port

    def _build_ssh_cmd(self, host: str, command: str) -> list[str]:
        """Build the argv list that runs *command* on *host* through ssh."""
        cmd = [
            "ssh",
            "-o", "StrictHostKeyChecking=no",
            "-o", "ConnectTimeout=10",
            # BatchMode: never prompt for a password, fail instead
            "-o", "BatchMode=yes",
            "-p", str(self.ssh_port),
        ]
        if self.ssh_key:
            cmd.extend(["-i", self.ssh_key])
        cmd.append(f"{self.ssh_user}@{host}")
        cmd.append(command)
        return cmd

    def execute(self, host: str, command: str, timeout: int = 30) -> tuple[int, str, str]:
        """Execute a command on a remote host via SSH.

        Returns:
            ``(return_code, stdout, stderr)``. The return code is -1 when
            the command timed out or ssh could not be started; stderr then
            holds the reason.
        """
        cmd = self._build_ssh_cmd(host, command)
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            return result.returncode, result.stdout, result.stderr
        except subprocess.TimeoutExpired:
            return -1, "", f"SSH command timed out after {timeout}s"
        except Exception as e:
            return -1, "", str(e)

    def read_file(self, host: str, path: str) -> tuple[bool, str]:
        """Read a file from a remote host.

        Returns:
            ``(True, content)`` on success, ``(False, stderr)`` otherwise.
        """
        rc, stdout, stderr = self.execute(host, f"cat {path}")
        if rc == 0:
            return True, stdout
        return False, stderr

    def write_file(self, host: str, path: str, content: str) -> tuple[bool, str]:
        """Write content to a file on a remote host.

        The content is streamed to a remote ``cat`` over stdin, so it
        arrives byte-for-byte: no trailing newline is added, it cannot
        collide with a heredoc delimiter, and it is not limited by the
        maximum command-line length.

        Returns:
            ``(True, "OK")`` on success, ``(False, message)`` otherwise.
        """
        cmd = self._build_ssh_cmd(host, f"cat > {path}")
        try:
            result = subprocess.run(
                cmd,
                input=content,
                capture_output=True,
                text=True,
                timeout=30,
            )
        except Exception as e:
            # Includes TimeoutExpired; callers expect a result tuple, never
            # an exception
            return False, str(e)
        if result.returncode == 0:
            return True, "OK"
        return False, result.stderr

    def is_reachable(self, host: str) -> bool:
        """Check if a host is reachable via SSH."""
        rc, _, _ = self.execute(host, "echo ok", timeout=10)
        return rc == 0

    def execute_local(self, command: str, timeout: int = 30) -> tuple[int, str, str]:
        """Execute a command locally through the shell.

        The shell is required because callers pass pipelines and
        redirections; *command* must therefore never contain untrusted
        input.

        Returns:
            ``(return_code, stdout, stderr)``, with -1 as return code on
            timeout or when the command could not be started.
        """
        try:
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            return result.returncode, result.stdout, result.stderr
        except subprocess.TimeoutExpired:
            return -1, "", f"Command timed out after {timeout}s"
        except Exception as e:
            return -1, "", str(e)

    def read_local_file(self, path: str) -> tuple[bool, str]:
        """Read a local file; returns ``(True, content)`` or ``(False, error)``."""
        try:
            with open(path, 'r') as f:
                return True, f.read()
        except Exception as e:
            return False, str(e)

    def write_local_file(self, path: str, content: str) -> tuple[bool, str]:
        """Write a local file; returns ``(True, "OK")`` or ``(False, error)``."""
        try:
            with open(path, 'w') as f:
                f.write(content)
            return True, "OK"
        except Exception as e:
            return False, str(e)

    def run_on_node(self, host: str, command: str, is_local: bool = False,
                    timeout: int = 30) -> tuple[int, str, str]:
        """Run a command on a node (local or remote)."""
        if is_local:
            return self.execute_local(command, timeout)
        return self.execute(host, command, timeout)

    def read_node_file(self, host: str, path: str, is_local: bool = False) -> tuple[bool, str]:
        """Read a file from a node (local or remote)."""
        if is_local:
            return self.read_local_file(path)
        return self.read_file(host, path)

    def write_node_file(self, host: str, path: str, content: str,
                        is_local: bool = False) -> tuple[bool, str]:
        """Write a file to a node (local or remote)."""
        if is_local:
            return self.write_local_file(path, content)
        return self.write_file(host, path, content)
MigrationPlan) -> bool: + """Run all verification checks.""" + print("\n=== Phase 5: Verifikation ===\n") + + all_ok = True + + # Check node reachability on new IPs + print("[Node-Erreichbarkeit (neue IPs)]") + for node in plan.nodes: + if not node.new_ip: + continue + + if node.is_local: + # Check local IP + rc, stdout, _ = self.ssh.execute_local( + f"ip addr show | grep -q '{node.new_ip}'" + ) + reachable = rc == 0 + else: + reachable = self.ssh.is_reachable(node.new_ip) + + status = "OK" if reachable else "FEHLER" + print(f" {node.name} ({node.new_ip}): {status}") + if not reachable: + all_ok = False + + # Check cluster status + print("\n[Cluster Status]") + rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null") + if rc == 0: + # Extract relevant info + for line in stdout.split('\n'): + line = line.strip() + if any(k in line for k in ['Quorate:', 'Nodes:', 'Node name', + 'Total votes', 'Expected votes']): + print(f" {line}") + if "Quorate: Yes" not in stdout: + print(" [!] WARNUNG: Cluster hat KEIN Quorum!") + all_ok = False + else: + print(" [!] pvecm status fehlgeschlagen") + all_ok = False + + # Check corosync members + print("\n[Corosync Members]") + rc, stdout, _ = self.ssh.execute_local("corosync-cmapctl 2>/dev/null | grep 'ip(' || true") + if rc == 0 and stdout.strip(): + for line in stdout.strip().split('\n'): + print(f" {line.strip()}") + else: + print(" Keine Corosync-Member-Info verfügbar") + + # Check Ceph if it was configured + if plan.ceph_config: + print("\n[Ceph Status]") + rc, stdout, _ = self.ssh.execute_local("ceph -s 2>/dev/null") + if rc == 0: + for line in stdout.split('\n'): + line = line.strip() + if line: + print(f" {line}") + else: + print(" [!] ceph -s fehlgeschlagen") + all_ok = False + + print("\n[Ceph MON Status]") + rc, stdout, _ = self.ssh.execute_local("ceph mon stat 2>/dev/null") + if rc == 0: + print(f" {stdout.strip()}") + else: + print(" [!] 
ceph mon stat fehlgeschlagen") + + print("\n[Ceph OSD Status]") + rc, stdout, _ = self.ssh.execute_local("ceph osd tree 2>/dev/null") + if rc == 0: + for line in stdout.split('\n')[:20]: # First 20 lines + if line.strip(): + print(f" {line}") + + # Summary + print("\n" + "=" * 60) + if all_ok: + print(" MIGRATION ERFOLGREICH!") + print(" Alle Checks bestanden.") + else: + print(" MIGRATION MIT WARNUNGEN ABGESCHLOSSEN") + print(" Einige Checks sind fehlgeschlagen. Bitte manuell prüfen!") + print("=" * 60) + + # Suggest next steps + print("\n[Empfohlene nächste Schritte]") + print(" 1. VMs/CTs auf allen Nodes prüfen: qm list / pct list") + print(" 2. Live-Migration testen: qm migrate ") + print(" 3. Ceph Recovery abwarten: ceph -w") + if not all_ok: + print(" 4. Bei Problemen Backup wiederherstellen:") + print(" ls /root/network-migration-backup-*/") + + return all_ok