Fixed MON map update: the monmap is now altered on the first node only and then copied to the other nodes
This commit is contained in:
parent
aebad45615
commit
7674656bf2
294
migrator.py
294
migrator.py
|
|
@ -438,22 +438,29 @@ class Migrator:
|
|||
self._update_ceph_mon_map(plan)
|
||||
|
||||
# Restart Ceph services
|
||||
# Note: first MON is already running (started during monmap update)
|
||||
print("\n [Ceph] Services neu starten...")
|
||||
first_started = False
|
||||
for node in plan.nodes:
|
||||
if not node.is_reachable:
|
||||
continue
|
||||
new_host = node.new_ip if not node.is_local else node.ssh_host
|
||||
|
||||
# Start MON (already stopped by monmap update)
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"systemctl start ceph-mon@{node.name} 2>/dev/null",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{node.name}] ceph-mon gestartet")
|
||||
if not first_started:
|
||||
# First node's MON was already started during monmap update
|
||||
first_started = True
|
||||
print(f" [{node.name}] ceph-mon läuft bereits (Primary)")
|
||||
else:
|
||||
print(f" [{node.name}] WARNUNG ceph-mon: {err}")
|
||||
# Start MON on remaining nodes
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"systemctl start ceph-mon@{node.name} 2>/dev/null",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{node.name}] ceph-mon gestartet")
|
||||
else:
|
||||
print(f" [{node.name}] WARNUNG ceph-mon: {err}")
|
||||
|
||||
# Restart MGR
|
||||
self.ssh.run_on_node(
|
||||
|
|
@ -467,6 +474,12 @@ class Migrator:
|
|||
"systemctl restart ceph-osd.target 2>/dev/null",
|
||||
node.is_local, timeout=60,
|
||||
)
|
||||
# Restart MDS if present (CephFS metadata server)
|
||||
self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"systemctl restart ceph-mds@{node.name} 2>/dev/null",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
print(f" [{node.name}] Ceph-Services gestartet")
|
||||
|
||||
def _update_ceph_direct(self, plan: MigrationPlan, configs: dict):
|
||||
|
|
@ -491,12 +504,9 @@ class Migrator:
|
|||
When MON IPs change, the internal monmap (stored in MON's RocksDB)
|
||||
must be explicitly updated. Just updating ceph.conf is NOT enough.
|
||||
|
||||
Steps per node:
|
||||
1. Stop ceph-mon
|
||||
2. Extract monmap from MON database
|
||||
3. Remove all old MON entries
|
||||
4. Re-add MON entries with new IPs (msgr2 + msgr1)
|
||||
5. Reinject updated monmap
|
||||
Strategy: Update monmap on the FIRST node, start its MON, then get
|
||||
the authoritative monmap from the running MON and inject it into all
|
||||
remaining nodes. This avoids epoch mismatches between nodes.
|
||||
"""
|
||||
ip_mapping = {n.current_ip: n.new_ip for n in plan.nodes if n.new_ip}
|
||||
|
||||
|
|
@ -506,18 +516,22 @@ class Migrator:
|
|||
|
||||
# Build the list of MON nodes with their new IPs
|
||||
mon_nodes = []
|
||||
reachable_nodes = []
|
||||
for node in plan.nodes:
|
||||
if not node.is_reachable:
|
||||
continue
|
||||
new_ip = node.new_ip or node.current_ip
|
||||
mon_nodes.append((node.name, new_ip))
|
||||
reachable_nodes.append(node)
|
||||
|
||||
if not reachable_nodes:
|
||||
print(" [Ceph] Keine erreichbaren Nodes für MON-Map Update")
|
||||
return
|
||||
|
||||
print("\n [Ceph] MON-Map aktualisieren...")
|
||||
|
||||
# Stop ceph-mon on all nodes first
|
||||
for node in plan.nodes:
|
||||
if not node.is_reachable:
|
||||
continue
|
||||
for node in reachable_nodes:
|
||||
new_host = node.new_ip if not node.is_local else node.ssh_host
|
||||
self.ssh.run_on_node(
|
||||
new_host,
|
||||
|
|
@ -526,66 +540,210 @@ class Migrator:
|
|||
)
|
||||
print(f" [{node.name}] ceph-mon gestoppt")
|
||||
|
||||
# Update monmap on each node
|
||||
for node in plan.nodes:
|
||||
if not node.is_reachable:
|
||||
continue
|
||||
new_host = node.new_ip if not node.is_local else node.ssh_host
|
||||
# --- Phase 1: Update monmap on the FIRST node ---
|
||||
first_node = reachable_nodes[0]
|
||||
first_host = first_node.new_ip if not first_node.is_local else first_node.ssh_host
|
||||
remaining_nodes = reachable_nodes[1:]
|
||||
|
||||
# Extract current monmap
|
||||
print(f"\n [{first_node.name}] Erstelle neue MON-Map (Primary)...")
|
||||
|
||||
# Extract current monmap from first node
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
first_host,
|
||||
f"ceph-mon -i {first_node.name} --extract-monmap /tmp/monmap",
|
||||
first_node.is_local, timeout=30,
|
||||
)
|
||||
if rc != 0:
|
||||
print(f" [{first_node.name}] FEHLER: monmap extrahieren fehlgeschlagen: {err}")
|
||||
return
|
||||
|
||||
# Show current monmap
|
||||
self.ssh.run_on_node(
|
||||
first_host, "monmaptool --print /tmp/monmap",
|
||||
first_node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Remove all existing MON entries
|
||||
for mon_name, _ in mon_nodes:
|
||||
self.ssh.run_on_node(
|
||||
first_host,
|
||||
f"monmaptool --rm {mon_name} /tmp/monmap 2>/dev/null",
|
||||
first_node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Re-add all MON entries with new IPs (msgr2 on 3300 + msgr1 on 6789)
|
||||
for mon_name, new_ip in mon_nodes:
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap",
|
||||
node.is_local, timeout=30,
|
||||
first_host,
|
||||
f"monmaptool --addv {mon_name} "
|
||||
f"[v2:{new_ip}:3300/0,v1:{new_ip}:6789/0] /tmp/monmap",
|
||||
first_node.is_local, timeout=10,
|
||||
)
|
||||
if rc != 0:
|
||||
print(f" [{node.name}] WARNUNG: monmap extrahieren fehlgeschlagen: {err}")
|
||||
print(f" [{node.name}] Überspringe MON-Map Update")
|
||||
continue
|
||||
|
||||
# Print current monmap for debugging
|
||||
self.ssh.run_on_node(
|
||||
new_host,
|
||||
"monmaptool --print /tmp/monmap",
|
||||
node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Remove all existing MON entries
|
||||
for mon_name, _ in mon_nodes:
|
||||
# Fallback: try legacy --add (older Ceph versions)
|
||||
self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"monmaptool --rm {mon_name} /tmp/monmap 2>/dev/null",
|
||||
node.is_local, timeout=10,
|
||||
first_host,
|
||||
f"monmaptool --add {mon_name} {new_ip}:6789 /tmp/monmap",
|
||||
first_node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Re-add all MON entries with new IPs (msgr2 on 3300 + msgr1 on 6789)
|
||||
for mon_name, new_ip in mon_nodes:
|
||||
# Show updated monmap
|
||||
print(f" [{first_node.name}] Neue MON-Map:")
|
||||
self.ssh.run_on_node(
|
||||
first_host, "monmaptool --print /tmp/monmap",
|
||||
first_node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Inject into first node
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
first_host,
|
||||
f"ceph-mon -i {first_node.name} --inject-monmap /tmp/monmap",
|
||||
first_node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{first_node.name}] MON-Map injiziert")
|
||||
else:
|
||||
print(f" [{first_node.name}] FEHLER MON-Map reinject: {err}")
|
||||
self.ssh.run_on_node(first_host, "rm -f /tmp/monmap", first_node.is_local)
|
||||
return
|
||||
|
||||
# Start first MON so we can get the authoritative map
|
||||
print(f" [{first_node.name}] Starte ceph-mon (Primary)...")
|
||||
self.ssh.run_on_node(
|
||||
first_host,
|
||||
f"systemctl start ceph-mon@{first_node.name}",
|
||||
first_node.is_local, timeout=30,
|
||||
)
|
||||
# Give it a moment to initialize
|
||||
time.sleep(3)
|
||||
|
||||
# --- Phase 2: Get authoritative monmap from running MON ---
|
||||
if remaining_nodes:
|
||||
print(f"\n Hole autoritative MON-Map vom laufenden MON...")
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
first_host,
|
||||
"ceph mon getmap -o /tmp/monmap_auth",
|
||||
first_node.is_local, timeout=30,
|
||||
)
|
||||
|
||||
if rc == 0:
|
||||
# Use authoritative map from running MON
|
||||
monmap_path = "/tmp/monmap_auth"
|
||||
print(f" Autoritative MON-Map erhalten")
|
||||
else:
|
||||
# Fallback: use the manually built map
|
||||
print(f" WARNUNG: Konnte autoritative Map nicht holen ({err})")
|
||||
print(f" Verwende manuell erstellte Map als Fallback")
|
||||
monmap_path = "/tmp/monmap"
|
||||
|
||||
# --- Phase 3: Inject authoritative map into remaining nodes ---
|
||||
for node in remaining_nodes:
|
||||
new_host = node.new_ip if not node.is_local else node.ssh_host
|
||||
|
||||
# Copy monmap from first node to this node via SSH
|
||||
if first_node.is_local:
|
||||
# First node is local: SCP the map to remote node
|
||||
rc, _, err = self.ssh.execute_local(
|
||||
f"sshpass -p '{self.ssh.ssh_password}' "
|
||||
f"scp -o StrictHostKeyChecking=no "
|
||||
f"-o PubkeyAuthentication=no "
|
||||
f"-P {self.ssh.ssh_port} "
|
||||
f"{monmap_path} "
|
||||
f"{self.ssh.ssh_user}@{new_host}:/tmp/monmap",
|
||||
timeout=30,
|
||||
)
|
||||
elif node.is_local:
|
||||
# This node is local: SCP from remote first node
|
||||
rc, _, err = self.ssh.execute_local(
|
||||
f"sshpass -p '{self.ssh.ssh_password}' "
|
||||
f"scp -o StrictHostKeyChecking=no "
|
||||
f"-o PubkeyAuthentication=no "
|
||||
f"-P {self.ssh.ssh_port} "
|
||||
f"{self.ssh.ssh_user}@{first_host}:{monmap_path} "
|
||||
f"/tmp/monmap",
|
||||
timeout=30,
|
||||
)
|
||||
else:
|
||||
# Both remote: read from first, write to second
|
||||
rc_read, stdout, _ = self.ssh.execute(
|
||||
first_host, f"base64 {monmap_path}", timeout=30,
|
||||
)
|
||||
if rc_read == 0:
|
||||
rc, _, err = self.ssh.execute(
|
||||
new_host,
|
||||
f"echo '{stdout.strip()}' | base64 -d > /tmp/monmap",
|
||||
timeout=30,
|
||||
)
|
||||
else:
|
||||
rc = -1
|
||||
err = "Konnte monmap nicht vom Primary lesen"
|
||||
|
||||
if rc != 0:
|
||||
print(f" [{node.name}] WARNUNG: monmap kopieren fehlgeschlagen: {err}")
|
||||
print(f" [{node.name}] Erstelle Map manuell als Fallback...")
|
||||
# Fallback: build map manually on this node
|
||||
self._update_monmap_manual(node, new_host, mon_nodes)
|
||||
continue
|
||||
|
||||
# Inject monmap
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"monmaptool --addv {mon_name} "
|
||||
f"[v2:{new_ip}:3300/0,v1:{new_ip}:6789/0] /tmp/monmap",
|
||||
f"ceph-mon -i {node.name} --inject-monmap /tmp/monmap",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{node.name}] Autoritative MON-Map injiziert")
|
||||
else:
|
||||
print(f" [{node.name}] FEHLER MON-Map reinject: {err}")
|
||||
|
||||
# Cleanup
|
||||
self.ssh.run_on_node(new_host, "rm -f /tmp/monmap", node.is_local)
|
||||
|
||||
# Cleanup on first node
|
||||
self.ssh.run_on_node(
|
||||
first_host, "rm -f /tmp/monmap /tmp/monmap_auth",
|
||||
first_node.is_local,
|
||||
)
|
||||
|
||||
def _update_monmap_manual(self, node, host: str, mon_nodes: list):
|
||||
"""Fallback: manually build and inject monmap on a single node."""
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
host,
|
||||
f"ceph-mon -i {node.name} --extract-monmap /tmp/monmap",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc != 0:
|
||||
print(f" [{node.name}] FEHLER: monmap extrahieren fehlgeschlagen")
|
||||
return
|
||||
|
||||
for mon_name, _ in mon_nodes:
|
||||
self.ssh.run_on_node(
|
||||
host,
|
||||
f"monmaptool --rm {mon_name} /tmp/monmap 2>/dev/null",
|
||||
node.is_local, timeout=10,
|
||||
)
|
||||
for mon_name, new_ip in mon_nodes:
|
||||
rc, _, _ = self.ssh.run_on_node(
|
||||
host,
|
||||
f"monmaptool --addv {mon_name} "
|
||||
f"[v2:{new_ip}:3300/0,v1:{new_ip}:6789/0] /tmp/monmap",
|
||||
node.is_local, timeout=10,
|
||||
)
|
||||
if rc != 0:
|
||||
self.ssh.run_on_node(
|
||||
host,
|
||||
f"monmaptool --add {mon_name} {new_ip}:6789 /tmp/monmap",
|
||||
node.is_local, timeout=10,
|
||||
)
|
||||
if rc != 0:
|
||||
# Fallback: try legacy --add (older Ceph versions)
|
||||
self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"monmaptool --add {mon_name} {new_ip}:6789 /tmp/monmap",
|
||||
node.is_local, timeout=10,
|
||||
)
|
||||
|
||||
# Reinject updated monmap
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
new_host,
|
||||
f"ceph-mon -i {node.name} --inject-monmap /tmp/monmap",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{node.name}] MON-Map aktualisiert")
|
||||
else:
|
||||
print(f" [{node.name}] FEHLER MON-Map reinject: {err}")
|
||||
rc, _, err = self.ssh.run_on_node(
|
||||
host,
|
||||
f"ceph-mon -i {node.name} --inject-monmap /tmp/monmap",
|
||||
node.is_local, timeout=30,
|
||||
)
|
||||
if rc == 0:
|
||||
print(f" [{node.name}] MON-Map manuell aktualisiert (Fallback)")
|
||||
else:
|
||||
print(f" [{node.name}] FEHLER MON-Map reinject: {err}")
|
||||
|
||||
# Cleanup
|
||||
self.ssh.run_on_node(
|
||||
new_host, "rm -f /tmp/monmap", node.is_local,
|
||||
)
|
||||
self.ssh.run_on_node(host, "rm -f /tmp/monmap", node.is_local)
|
||||
|
|
|
|||
Loading…
Reference in New Issue