Change network reload order: reload the script (local) node last

2026-03-04 23:30:34 +01:00 · 2026-03-04 23:30:34 +01:00 · c0e6f96498
parent 1083fbb375
commit c0e6f96498
1 changed files with 68 additions and 19 deletions
--- a/migrator.py
+++ b/migrator.py
@@ -372,14 +372,25 @@ class Migrator:

    def _update_network(self, plan: MigrationPlan, configs: dict,
                        dry_run: bool) -> bool:
-        """Update /etc/network/interfaces and restart networking."""
-        for node in plan.nodes:
-            if not node.is_reachable or node.name not in configs['nodes']:
-                continue
+        """Update /etc/network/interfaces and restart networking.

+        Strategy:
+        1. Copy staged interfaces to /etc/network/interfaces on ALL nodes first
+        2. Reload remote nodes with nohup + delay (fire-and-forget, SSH will die)
+        3. Reload local node LAST
+        This avoids cutting off our own SSH connectivity before reaching remote nodes.
+        """
+        active_nodes = [
+            n for n in plan.nodes
+            if n.is_reachable and n.name in configs['nodes']
+        ]
+        remote_nodes = [n for n in active_nodes if not n.is_local]
+        local_node = next((n for n in active_nodes if n.is_local), None)
+
+        # Phase 1: Copy staged interfaces on ALL nodes (no reload yet)
+        for node in active_nodes:
            if dry_run:
                print(f"  [{node.name}] Würde /etc/network/interfaces aktualisieren")
-                print(f"  [{node.name}] Würde 'ifreload -a' ausführen")
                continue

            staging = "/root/.network-migration-staged/interfaces"
@@ -394,25 +405,63 @@ class Migrator:
                print(f"  [{node.name}] FEHLER: {err}")
                return False

-            # Reload network - ifreload -a reloads ALL interfaces
-            rc, _, _ = self.ssh.run_on_node(
-                node.ssh_host, "which ifreload", node.is_local
-            )
-            if rc == 0:
-                reload_cmd = "ifreload -a"
-            else:
-                # Fallback: restart networking service
-                reload_cmd = "systemctl restart networking"
+        if dry_run:
+            for node in active_nodes:
+                print(f"  [{node.name}] Würde 'ifreload -a' ausführen")
+            return True

-            print(f"  [{node.name}] Netzwerk wird neu geladen ({reload_cmd})...")
+        # Determine reload command
+        rc, _, _ = self.ssh.execute_local("which ifreload")
+        reload_cmd = "ifreload -a" if rc == 0 else "systemctl restart networking"
+
+        # Phase 2: Reload REMOTE nodes first (fire-and-forget with nohup)
+        # The SSH connection will die when the remote network changes,
+        # so we use nohup + delay to let the SSH command return first.
+        for node in remote_nodes:
+            # nohup with 2s delay: SSH returns immediately, then network reloads
+            bg_cmd = (
+                f"nohup bash -c 'sleep 2 && {reload_cmd}' "
+                f">/tmp/ifreload.log 2>&1 &"
+            )
+            print(f"  [{node.name}] Netzwerk-Reload geplant (fire-and-forget)...")
+            self.ssh.run_on_node(node.ssh_host, bg_cmd, False, timeout=10)
+            print(f"  [{node.name}] {reload_cmd} wird in 2s ausgeführt")
+
+        # Phase 3: Reload LOCAL node last
+        if local_node:
+            print(f"  [{local_node.name}] Netzwerk wird neu geladen ({reload_cmd})...")
            rc, _, err = self.ssh.run_on_node(
-                node.ssh_host, reload_cmd, node.is_local, timeout=60
+                local_node.ssh_host, reload_cmd, True, timeout=60
            )
            if rc == 0:
-                print(f"  [{node.name}] Netzwerk neu geladen")
+                print(f"  [{local_node.name}] Netzwerk neu geladen")
            else:
-                print(f"  [{node.name}] WARNUNG beim Netzwerk-Reload: {err}")
-                # Don't fail here - the node might just be unreachable on old IP now
+                print(f"  [{local_node.name}] WARNUNG beim Netzwerk-Reload: {err}")
+
+        # Wait for remote nodes to finish their reload
+        if remote_nodes:
+            wait_secs = 8
+            print(f"\n  Warte {wait_secs}s bis alle Remote-Nodes ihr Netzwerk neu geladen haben...")
+            time.sleep(wait_secs)
+
+        # Verify: try to reach remote nodes on NEW IPs (with retries)
+        print("  [Verifikation] Prüfe Erreichbarkeit auf neuen IPs...")
+        for node in remote_nodes:
+            if not node.new_ip:
+                continue
+            reachable = False
+            for attempt in range(3):
+                reachable = self.ssh.is_reachable(node.new_ip)
+                if reachable:
+                    break
+                if attempt < 2:
+                    print(f"  [{node.name}] {node.new_ip} noch nicht erreichbar, warte 5s...")
+                    time.sleep(5)
+            if reachable:
+                print(f"  [{node.name}] {node.new_ip} erreichbar")
+            else:
+                print(f"  [{node.name}] {node.new_ip} NICHT erreichbar nach 3 Versuchen!")
+                print(f"  [{node.name}] WARNUNG: Service-Start auf diesem Node könnte fehlschlagen")

        return True