added mon map correction and injection

This commit is contained in:
duffyduck 2026-03-04 23:56:39 +01:00
parent f072320ab9
commit aebad45615
2 changed files with 146 additions and 14 deletions

View File

@ -285,6 +285,47 @@ systemctl restart corosync
5. `journalctl -u corosync` — Corosync-Logs prüfen
6. `journalctl -u pve-cluster` — pmxcfs-Logs prüfen
### Workaround: Ceph MON-Map manuell aktualisieren
Falls nach der Migration `ceph-mon` und `ceph-mgr` nicht starten (z.B. weil eine ältere Version des Tools die MON-Map nicht aktualisiert hat), muss die Ceph MON-Map manuell korrigiert werden. Die MON-Map ist eine interne Datenbank, in der die MON-Adressen gespeichert sind — ein reines Update der `ceph.conf` reicht nicht.
**Auf jedem Node ausführen:**
```bash
# 1. MON stoppen
systemctl stop ceph-mon@$(hostname)
# 2. Aktuelle MON-Map extrahieren und prüfen
ceph-mon -i $(hostname) --extract-monmap /tmp/monmap
monmaptool --print /tmp/monmap
# 3. Alte Einträge entfernen (für jeden MON-Node)
monmaptool --rm pve1 /tmp/monmap
monmaptool --rm pve2 /tmp/monmap
monmaptool --rm pve3 /tmp/monmap
# 4. Neue Einträge mit neuen IPs hinzufügen
monmaptool --addv pve1 [v2:172.0.2.101:3300/0,v1:172.0.2.101:6789/0] /tmp/monmap
monmaptool --addv pve2 [v2:172.0.2.102:3300/0,v1:172.0.2.102:6789/0] /tmp/monmap
monmaptool --addv pve3 [v2:172.0.2.103:3300/0,v1:172.0.2.103:6789/0] /tmp/monmap
# 5. Ergebnis prüfen
monmaptool --print /tmp/monmap
# 6. Aktualisierte MON-Map zurückschreiben
ceph-mon -i $(hostname) --inject-monmap /tmp/monmap
# 7. Services starten
systemctl start ceph-mon@$(hostname)
systemctl restart ceph-mgr@$(hostname)
systemctl restart ceph-osd.target
# 8. Aufräumen
rm -f /tmp/monmap
```
> **Hinweis:** Node-Namen und IPs an das eigene Setup anpassen. Schritte 3+4 müssen alle MON-Nodes des Clusters enthalten, nicht nur den lokalen. Aktuelle Versionen des Tools aktualisieren die MON-Map automatisch.
## Hinweise
- Das Tool muss als **root** ausgeführt werden

View File

@ -434,19 +434,27 @@ class Migrator:
print(" [Ceph] /etc/pve nicht beschreibbar, schreibe direkt...")
self._update_ceph_direct(plan, configs)
# Update Ceph MON map with new IPs (MUST happen before restart)
self._update_ceph_mon_map(plan)
# Restart Ceph services
print(" [Ceph] Services neu starten...")
print("\n [Ceph] Services neu starten...")
for node in plan.nodes:
if not node.is_reachable:
continue
new_host = node.new_ip if not node.is_local else node.ssh_host
# Restart MON
self.ssh.run_on_node(
# Start MON (already stopped by monmap update)
rc, _, err = self.ssh.run_on_node(
new_host,
f"systemctl restart ceph-mon@{node.name} 2>/dev/null",
f"systemctl start ceph-mon@{node.name} 2>/dev/null",
node.is_local, timeout=30,
)
if rc == 0:
print(f" [{node.name}] ceph-mon gestartet")
else:
print(f" [{node.name}] WARNUNG ceph-mon: {err}")
# Restart MGR
self.ssh.run_on_node(
new_host,
@ -459,7 +467,7 @@ class Migrator:
"systemctl restart ceph-osd.target 2>/dev/null",
node.is_local, timeout=60,
)
print(f" [{node.name}] Ceph-Services neu gestartet")
print(f" [{node.name}] Ceph-Services gestartet")
def _update_ceph_direct(self, plan: MigrationPlan, configs: dict):
"""Write ceph.conf directly on each node (fallback when no quorum)."""
@ -480,21 +488,104 @@ class Migrator:
def _update_ceph_mon_map(self, plan: MigrationPlan):
    """Rewrite the Ceph monitor map on every reachable node.

    When MON IPs change, the internal monmap (stored in MON's RocksDB)
    must be explicitly updated. Just updating ceph.conf is NOT enough.

    Steps:
    1. Stop ceph-mon on all reachable nodes.
    2. Per node: extract the monmap from the MON database.
    3. Remove the entries of all reachable MON nodes.
    4. Re-add those entries with their new IPs (msgr2 + msgr1), falling
       back to the legacy ``--add`` form if ``--addv`` is rejected.
    5. Inject the updated monmap, but only if every entry could be
       re-added; otherwise the node's MON store is left untouched.

    The monitors are left stopped; the caller is expected to start them.

    Args:
        plan: Migration plan. Only ``plan.nodes`` is read; each node must
            provide ``name``, ``current_ip``, ``new_ip``, ``ssh_host``,
            ``is_local`` and ``is_reachable``.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    # Nothing to do when no node gets a new address.
    if not any(n.new_ip for n in plan.nodes):
        print(" [Ceph] Keine IP-Änderungen für MON-Map")
        return

    reachable = [n for n in plan.nodes if n.is_reachable]

    # (name, address) of every MON that goes into the rewritten map.
    # Nodes without a new IP keep their current one.
    mon_nodes = [(n.name, n.new_ip or n.current_ip) for n in reachable]

    print("\n [Ceph] MON-Map aktualisieren...")

    # Stop ceph-mon on all nodes first: the monmap can only be extracted
    # from / injected into a monitor that is not running.
    for node in reachable:
        # NOTE(review): a remote node without new_ip yields a None host
        # here — confirm that the plan never contains such a node.
        host = node.new_ip if not node.is_local else node.ssh_host
        self.ssh.run_on_node(
            host,
            f"systemctl stop ceph-mon@{node.name} 2>/dev/null",
            node.is_local, timeout=30,
        )
        print(f" [{node.name}] ceph-mon gestoppt")

    # Update the monmap on each node.
    for node in reachable:
        host = node.new_ip if not node.is_local else node.ssh_host

        # Extract current monmap
        rc, _, err = self.ssh.run_on_node(
            host,
            f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap",
            node.is_local, timeout=30,
        )
        if rc != 0:
            print(f" [{node.name}] WARNUNG: monmap extrahieren fehlgeschlagen: {err}")
            print(f" [{node.name}] Überspringe MON-Map Update")
            continue

        # Print current monmap for debugging
        self.ssh.run_on_node(
            host,
            "monmaptool --print /tmp/monmap",
            node.is_local, timeout=10,
        )

        # Remove all existing MON entries. Failures are ignored on
        # purpose: an entry that is not in the map cannot be removed.
        for mon_name, _ in mon_nodes:
            self.ssh.run_on_node(
                host,
                f"monmaptool --rm {mon_name} /tmp/monmap 2>/dev/null",
                node.is_local, timeout=10,
            )

        # Re-add all MON entries with new IPs (msgr2 on 3300 + msgr1 on 6789)
        failed = []
        for mon_name, new_ip in mon_nodes:
            rc, _, err = self.ssh.run_on_node(
                host,
                f"monmaptool --addv {mon_name} "
                f"[v2:{new_ip}:3300/0,v1:{new_ip}:6789/0] /tmp/monmap",
                node.is_local, timeout=10,
            )
            if rc != 0:
                # Fallback: try legacy --add (older Ceph versions)
                rc, _, err = self.ssh.run_on_node(
                    host,
                    f"monmaptool --add {mon_name} {new_ip}:6789 /tmp/monmap",
                    node.is_local, timeout=10,
                )
            if rc != 0:
                failed.append(mon_name)
                print(f" [{node.name}] FEHLER MON-Eintrag {mon_name}: {err}")

        if failed:
            # Entries were removed but could not be re-added. Injecting
            # this map would write an incomplete monmap into the MON
            # store, so leave the store as it is.
            print(
                f" [{node.name}] Überspringe MON-Map reinject "
                f"(fehlende Einträge: {', '.join(failed)})"
            )
        else:
            # Reinject updated monmap
            rc, _, err = self.ssh.run_on_node(
                host,
                f"ceph-mon -i {node.name} --inject-monmap /tmp/monmap",
                node.is_local, timeout=30,
            )
            if rc == 0:
                print(f" [{node.name}] MON-Map aktualisiert")
            else:
                print(f" [{node.name}] FEHLER MON-Map reinject: {err}")

        # Cleanup
        self.ssh.run_on_node(
            host, "rm -f /tmp/monmap", node.is_local,
        )