From aebad45615692cf94fd01573da08e8a4bb0960f3 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Wed, 4 Mar 2026 23:56:39 +0100 Subject: [PATCH] added mon map correction and injection --- README.md | 41 ++++++++++++++++++ migrator.py | 119 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 146 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6d30ac9..d54a721 100644 --- a/README.md +++ b/README.md @@ -285,6 +285,47 @@ systemctl restart corosync 5. `journalctl -u corosync` — Corosync-Logs prüfen 6. `journalctl -u pve-cluster` — pmxcfs-Logs prüfen +### Workaround: Ceph MON-Map manuell aktualisieren + +Falls nach der Migration `ceph-mon` und `ceph-mgr` nicht starten (z.B. weil eine ältere Version des Tools die MON-Map nicht aktualisiert hat), muss die Ceph MON-Map manuell korrigiert werden. Die MON-Map ist eine interne Datenbank, in der die MON-Adressen gespeichert sind — ein reines Update der `ceph.conf` reicht nicht. + +**Auf jedem Node ausführen:** + +```bash +# 1. MON stoppen +systemctl stop ceph-mon@$(hostname) + +# 2. Aktuelle MON-Map extrahieren und prüfen +ceph-mon -i $(hostname) --extract-monmap /tmp/monmap +monmaptool --print /tmp/monmap + +# 3. Alte Einträge entfernen (für jeden MON-Node) +monmaptool --rm pve1 /tmp/monmap +monmaptool --rm pve2 /tmp/monmap +monmaptool --rm pve3 /tmp/monmap + +# 4. Neue Einträge mit neuen IPs hinzufügen +monmaptool --addv pve1 [v2:172.0.2.101:3300/0,v1:172.0.2.101:6789/0] /tmp/monmap +monmaptool --addv pve2 [v2:172.0.2.102:3300/0,v1:172.0.2.102:6789/0] /tmp/monmap +monmaptool --addv pve3 [v2:172.0.2.103:3300/0,v1:172.0.2.103:6789/0] /tmp/monmap + +# 5. Ergebnis prüfen +monmaptool --print /tmp/monmap + +# 6. Aktualisierte MON-Map zurückschreiben +ceph-mon -i $(hostname) --inject-monmap /tmp/monmap + +# 7. Services starten +systemctl start ceph-mon@$(hostname) +systemctl restart ceph-mgr@$(hostname) +systemctl restart ceph-osd.target + +# 8. 
Aufräumen +rm -f /tmp/monmap +``` + +> **Hinweis:** Node-Namen und IPs an das eigene Setup anpassen. Schritte 3+4 müssen alle MON-Nodes des Clusters enthalten, nicht nur den lokalen. Aktuelle Versionen des Tools aktualisieren die MON-Map automatisch. + ## Hinweise - Das Tool muss als **root** ausgeführt werden diff --git a/migrator.py b/migrator.py index afcb016..31d3704 100644 --- a/migrator.py +++ b/migrator.py @@ -434,19 +434,27 @@ class Migrator: print(" [Ceph] /etc/pve nicht beschreibbar, schreibe direkt...") self._update_ceph_direct(plan, configs) + # Update Ceph MON map with new IPs (MUST happen before restart) + self._update_ceph_mon_map(plan) + # Restart Ceph services - print(" [Ceph] Services neu starten...") + print("\n [Ceph] Services neu starten...") for node in plan.nodes: if not node.is_reachable: continue new_host = node.new_ip if not node.is_local else node.ssh_host - # Restart MON - self.ssh.run_on_node( + # Start MON (already stopped by monmap update) + rc, _, err = self.ssh.run_on_node( new_host, - f"systemctl restart ceph-mon@{node.name} 2>/dev/null", + f"systemctl start ceph-mon@{node.name} 2>/dev/null", node.is_local, timeout=30, ) + if rc == 0: + print(f" [{node.name}] ceph-mon gestartet") + else: + print(f" [{node.name}] WARNUNG ceph-mon: {err}") + # Restart MGR self.ssh.run_on_node( new_host, @@ -459,7 +467,7 @@ class Migrator: "systemctl restart ceph-osd.target 2>/dev/null", node.is_local, timeout=60, ) - print(f" [{node.name}] Ceph-Services neu gestartet") + print(f" [{node.name}] Ceph-Services gestartet") def _update_ceph_direct(self, plan: MigrationPlan, configs: dict): """Write ceph.conf directly on each node (fallback when no quorum).""" @@ -480,21 +488,104 @@ class Migrator: def _update_ceph_mon_map(self, plan: MigrationPlan): """Update Ceph MON map with new addresses. - This is needed when MON IPs change. + When MON IPs change, the internal monmap (stored in MON's RocksDB) + must be explicitly updated. 
Just updating ceph.conf is NOT enough. + + Steps per node: + 1. Stop ceph-mon + 2. Extract monmap from MON database + 3. Remove all old MON entries + 4. Re-add MON entries with new IPs (msgr2 + msgr1) + 5. Reinject updated monmap """ ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip} + if not ip_mapping: + print(" [Ceph] Keine IP-Änderungen für MON-Map") + return + + # Build the list of MON nodes with their new IPs + mon_nodes = [] + for node in plan.nodes: + if not node.is_reachable: + continue + new_ip = node.new_ip or node.current_ip + mon_nodes.append((node.name, new_ip)) + + print("\n [Ceph] MON-Map aktualisieren...") + + # Stop ceph-mon on all nodes first for node in plan.nodes: if not node.is_reachable: continue new_host = node.new_ip if not node.is_local else node.ssh_host - new_ip = node.new_ip + self.ssh.run_on_node( + new_host, + f"systemctl stop ceph-mon@{node.name} 2>/dev/null", + node.is_local, timeout=30, + ) + print(f" [{node.name}] ceph-mon gestoppt") - # Extract monmap, modify, and reinject - cmds = [ + # Update monmap on each node + for node in plan.nodes: + if not node.is_reachable: + continue + new_host = node.new_ip if not node.is_local else node.ssh_host + + # Extract current monmap + rc, _, err = self.ssh.run_on_node( + new_host, f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap", - # Remove old entries and add new ones - ] - # This is complex - for now we rely on the ceph.conf update - # and let Ceph handle the MON map update on restart - print(f" [{node.name}] MON-Map wird beim Neustart aktualisiert") + node.is_local, timeout=30, + ) + if rc != 0: + print(f" [{node.name}] WARNUNG: monmap extrahieren fehlgeschlagen: {err}") + print(f" [{node.name}] Überspringe MON-Map Update") + continue + + # Print current monmap for debugging + self.ssh.run_on_node( + new_host, + "monmaptool --print /tmp/monmap", + node.is_local, timeout=10, + ) + + # Remove all existing MON entries + for mon_name, _ in mon_nodes: + 
self.ssh.run_on_node( + new_host, + f"monmaptool --rm {mon_name} /tmp/monmap 2>/dev/null", + node.is_local, timeout=10, + ) + + # Re-add all MON entries with new IPs (msgr2 on 3300 + msgr1 on 6789) + for mon_name, new_ip in mon_nodes: + rc, _, err = self.ssh.run_on_node( + new_host, + f"monmaptool --addv {mon_name} " + f"[v2:{new_ip}:3300/0,v1:{new_ip}:6789/0] /tmp/monmap", + node.is_local, timeout=10, + ) + if rc != 0: + # Fallback: try legacy --add (older Ceph versions) + self.ssh.run_on_node( + new_host, + f"monmaptool --add {mon_name} {new_ip}:6789 /tmp/monmap", + node.is_local, timeout=10, + ) + + # Reinject updated monmap + rc, _, err = self.ssh.run_on_node( + new_host, + f"ceph-mon -i {node.name} --inject-monmap /tmp/monmap", + node.is_local, timeout=30, + ) + if rc == 0: + print(f" [{node.name}] MON-Map aktualisiert") + else: + print(f" [{node.name}] FEHLER MON-Map reinject: {err}") + + # Cleanup + self.ssh.run_on_node( + new_host, "rm -f /tmp/monmap", node.is_local, + )