"""Emergency/Rescue Network - Temporäres Netzwerk zur SSH-Kommunikation. Wenn Nodes in verschiedenen Subnetzen sind und sich nicht mehr erreichen können, wird ein temporäres Emergency-Netzwerk aufgebaut: - Jeder Node bekommt eine zusätzliche IP auf der Bridge (z.B. vmbr0) - Über dieses Netz kann das Tool dann per SSH arbeiten - Nach der Migration werden die Emergency-IPs wieder entfernt """ import ipaddress import time from models import NodeInfo, CorosyncConfig from config_parser import parse_corosync_conf from ssh_manager import SSHManager class RescueNetwork: """Manages an emergency network for broken clusters.""" def __init__(self, ssh: SSHManager): self.ssh = ssh self.rescue_subnet: str = "" self.rescue_ips: dict[str, str] = {} # node_name -> rescue_ip self.bridge: str = "vmbr0" self.active: bool = False def setup_interactive(self, corosync: CorosyncConfig) -> list[NodeInfo] | None: """Interactively set up the rescue network. Returns updated node list with rescue IPs as ssh_host, or None on abort. """ print("\n" + "=" * 60) print(" RESCUE NETZWERK") print("=" * 60) print() print(" Dieses Feature richtet ein temporäres Netzwerk ein,") print(" damit alle Nodes sich wieder per SSH erreichen können.") print() print(" Ablauf:") print(" 1. Du gibst ein freies Subnetz an (z.B. 10.99.99.0/24)") print(" 2. Das Tool zeigt dir für jeden Node den Befehl an") print(" 3. Du führst die Befehle manuell auf jedem Node aus") print(" (z.B. über IPMI/iLO/iDRAC/KVM-Konsole)") print(" 4. Danach kann das Tool alle Nodes per SSH erreichen") print() # Ask for bridge user_input = input(f" Bridge für Emergency-IPs [{self.bridge}]: ").strip() if user_input: self.bridge = user_input # Ask for rescue subnet default_subnet = "10.99.99.0/24" while True: subnet_input = input( f" Emergency Subnetz [{default_subnet}]: " ).strip() if not subnet_input: subnet_input = default_subnet try: subnet = ipaddress.ip_network(subnet_input, strict=False) self.rescue_subnet = str(subnet) break except ValueError as e: print(f" Ungültiges Subnetz: {e}") # Generate IPs for all nodes hosts = list(subnet.hosts()) print() print(" " + "-" * 56) print(f" Emergency Subnetz: {self.rescue_subnet}") print(f" Bridge: {self.bridge}") print(" " + "-" * 56) print() nodes = [] for i, cs_node in enumerate(corosync.nodes): if i >= len(hosts): print(f" [!] FEHLER: Nicht genug IPs im Subnetz für alle Nodes!") return None rescue_ip = str(hosts[i]) self.rescue_ips[cs_node.name] = rescue_ip cidr = subnet.prefixlen node = NodeInfo( name=cs_node.name, current_ip=cs_node.ring0_addr, ssh_host=rescue_ip, # Use rescue IP for SSH ) nodes.append(node) # Show command for this node cmd = f"ip addr add {rescue_ip}/{cidr} dev {self.bridge}" print(f" {cs_node.name} ({cs_node.ring0_addr}):") print(f" Rescue-IP: {rescue_ip}/{cidr}") print(f" Befehl: {cmd}") print() # Apply locally print(" " + "-" * 56) print() # Find local node import socket local_hostname = socket.gethostname() local_node = None for node in nodes: if node.name == local_hostname: local_node = node node.is_local = True break if local_node and local_node.name in self.rescue_ips: rescue_ip = self.rescue_ips[local_node.name] cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen print(f" Lokaler Node erkannt: {local_node.name}") answer = input( f" Emergency-IP {rescue_ip}/{cidr} auf {self.bridge} " f"automatisch setzen? [J/n]: " ).strip().lower() if answer not in ('n', 'nein', 'no'): rc, _, err = self.ssh.execute_local( f"ip addr add {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null; echo ok" ) if rc == 0: print(f" -> {rescue_ip}/{cidr} auf {self.bridge} gesetzt") local_node.is_reachable = True else: print(f" -> WARNUNG: {err}") local_node.is_reachable = True # It's local, still reachable else: local_node.is_reachable = True # Wait for user to configure other nodes print() print(" " + "=" * 56) print(" Bitte führe jetzt die oben genannten Befehle auf den") print(" anderen Nodes aus (IPMI/iLO/iDRAC/KVM-Konsole).") print(" " + "=" * 56) print() input(" Drücke ENTER wenn alle Nodes konfiguriert sind...") # Test connectivity print() print(" [Verbindungstest]") all_ok = True for node in nodes: if node.is_local: print(f" {node.name}: OK (lokal)") continue rescue_ip = self.rescue_ips[node.name] reachable = self.ssh.is_reachable(rescue_ip) if reachable: print(f" {node.name} ({rescue_ip}): OK") node.is_reachable = True else: print(f" {node.name} ({rescue_ip}): NICHT ERREICHBAR") all_ok = False if not all_ok: print() print(" [!] Nicht alle Nodes erreichbar!") answer = input(" Trotzdem fortfahren? [j/N]: ").strip().lower() if answer not in ('j', 'ja', 'y', 'yes'): self.cleanup(nodes) return None self.active = True print() print(" Rescue-Netzwerk aktiv. Migration kann starten.") return nodes def cleanup(self, nodes: list[NodeInfo]): """Remove emergency IPs from all nodes.""" if not self.active and not self.rescue_ips: return print("\n [Rescue] Emergency-IPs entfernen...") cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen for node in nodes: if node.name not in self.rescue_ips: continue rescue_ip = self.rescue_ips[node.name] cmd = f"ip addr del {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null" if node.is_local: rc, _, _ = self.ssh.execute_local(cmd) elif node.is_reachable: # Try to reach via new IP first (after migration), then rescue IP if node.new_ip: rc, _, _ = self.ssh.execute(node.new_ip, cmd) else: rc, _, _ = self.ssh.execute(rescue_ip, cmd) status = "entfernt" if True else "FEHLER" print(f" {node.name}: {rescue_ip}/{cidr} {status}") self.active = False print(" [Rescue] Emergency-IPs entfernt.") def get_rescue_commands(self, corosync: CorosyncConfig, subnet: str, bridge: str = "vmbr0") -> list[dict]: """Generate rescue commands without interactive prompts. Returns list of {name, ip, cidr, command, current_ip} """ network = ipaddress.ip_network(subnet, strict=False) hosts = list(network.hosts()) commands = [] for i, cs_node in enumerate(corosync.nodes): if i >= len(hosts): break rescue_ip = str(hosts[i]) cidr = network.prefixlen commands.append({ 'name': cs_node.name, 'current_ip': cs_node.ring0_addr, 'ip': rescue_ip, 'cidr': cidr, 'command': f"ip addr add {rescue_ip}/{cidr} dev {bridge}", 'remove_command': f"ip addr del {rescue_ip}/{cidr} dev {bridge}", }) return commands