From c0e6f964987b18241355c6d0e6394761513523b3 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Wed, 4 Mar 2026 23:30:34 +0100 Subject: [PATCH] changed order reload network on script node at last --- migrator.py | 87 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/migrator.py b/migrator.py index 6977894..2dafb43 100644 --- a/migrator.py +++ b/migrator.py @@ -372,14 +372,25 @@ class Migrator: def _update_network(self, plan: MigrationPlan, configs: dict, dry_run: bool) -> bool: - """Update /etc/network/interfaces and restart networking.""" - for node in plan.nodes: - if not node.is_reachable or node.name not in configs['nodes']: - continue + """Update /etc/network/interfaces and restart networking. + Strategy: + 1. Copy staged interfaces to /etc/network/interfaces on ALL nodes first + 2. Reload remote nodes with nohup + delay (fire-and-forget, SSH will die) + 3. Reload local node LAST + This avoids cutting off our own SSH connectivity before reaching remote nodes. + """ + active_nodes = [ + n for n in plan.nodes + if n.is_reachable and n.name in configs['nodes'] + ] + remote_nodes = [n for n in active_nodes if not n.is_local] + local_node = next((n for n in active_nodes if n.is_local), None) + + # Phase 1: Copy staged interfaces on ALL nodes (no reload yet) + for node in active_nodes: if dry_run: print(f" [{node.name}] Würde /etc/network/interfaces aktualisieren") - print(f" [{node.name}] Würde 'ifreload -a' ausführen") continue staging = "/root/.network-migration-staged/interfaces" @@ -394,25 +405,63 @@ class Migrator: print(f" [{node.name}] FEHLER: {err}") return False - # Reload network - ifreload -a reloads ALL interfaces - rc, _, _ = self.ssh.run_on_node( - node.ssh_host, "which ifreload", node.is_local - ) - if rc == 0: - reload_cmd = "ifreload -a" - else: - # Fallback: restart networking service - reload_cmd = "systemctl restart networking" + if dry_run: + for node in active_nodes: + print(f" [{node.name}] Würde 'ifreload -a' ausführen") + return True - print(f" [{node.name}] Netzwerk wird neu geladen ({reload_cmd})...") + # Determine reload command + rc, _, _ = self.ssh.execute_local("which ifreload") + reload_cmd = "ifreload -a" if rc == 0 else "systemctl restart networking" + + # Phase 2: Reload REMOTE nodes first (fire-and-forget with nohup) + # The SSH connection will die when the remote network changes, + # so we use nohup + delay to let the SSH command return first. + for node in remote_nodes: + # nohup with 2s delay: SSH returns immediately, then network reloads + bg_cmd = ( + f"nohup bash -c 'sleep 2 && {reload_cmd}' " + f">/tmp/ifreload.log 2>&1 &" + ) + print(f" [{node.name}] Netzwerk-Reload geplant (fire-and-forget)...") + self.ssh.run_on_node(node.ssh_host, bg_cmd, False, timeout=10) + print(f" [{node.name}] {reload_cmd} wird in 2s ausgeführt") + + # Phase 3: Reload LOCAL node last + if local_node: + print(f" [{local_node.name}] Netzwerk wird neu geladen ({reload_cmd})...") rc, _, err = self.ssh.run_on_node( - node.ssh_host, reload_cmd, node.is_local, timeout=60 + local_node.ssh_host, reload_cmd, True, timeout=60 ) if rc == 0: - print(f" [{node.name}] Netzwerk neu geladen") + print(f" [{local_node.name}] Netzwerk neu geladen") else: - print(f" [{node.name}] WARNUNG beim Netzwerk-Reload: {err}") - # Don't fail here - the node might just be unreachable on old IP now + print(f" [{local_node.name}] WARNUNG beim Netzwerk-Reload: {err}") + + # Wait for remote nodes to finish their reload + if remote_nodes: + wait_secs = 8 + print(f"\n Warte {wait_secs}s bis alle Remote-Nodes ihr Netzwerk neu geladen haben...") + time.sleep(wait_secs) + + # Verify: try to reach remote nodes on NEW IPs (with retries) + print(" [Verifikation] Prüfe Erreichbarkeit auf neuen IPs...") + for node in remote_nodes: + if not node.new_ip: + continue + reachable = False + for attempt in range(3): + reachable = self.ssh.is_reachable(node.new_ip) + if reachable: + break + if attempt < 2: + print(f" [{node.name}] {node.new_ip} noch nicht erreichbar, warte 5s...") + time.sleep(5) + if reachable: + print(f" [{node.name}] {node.new_ip} erreichbar") + else: + print(f" [{node.name}] {node.new_ip} NICHT erreichbar nach 3 Versuchen!") + print(f" [{node.name}] WARNUNG: Service-Start auf diesem Node könnte fehlschlagen") return True