229 lines
8.2 KiB
Python
229 lines
8.2 KiB
Python
"""Emergency/rescue network: a temporary network for SSH communication.

When nodes sit in different subnets and can no longer reach each other,
a temporary emergency network is set up:
- Every node gets an additional IP on the bridge (e.g. vmbr0)
- The tool can then work over SSH through this network
- After the migration the emergency IPs are removed again
"""
|
|
|
|
import ipaddress
|
|
import time
|
|
from models import NodeInfo, CorosyncConfig
|
|
from config_parser import parse_corosync_conf
|
|
from ssh_manager import SSHManager
|
|
|
|
|
|
class RescueNetwork:
    """Manage a temporary emergency network for broken clusters.

    Every node gets an additional address from a dedicated rescue subnet on
    a bridge (``vmbr0`` unless changed). The returned node list uses those
    addresses as ``ssh_host``; :meth:`cleanup` removes them again.

    Attributes:
        ssh: Manager used to run local and remote commands.
        rescue_subnet: Chosen subnet in CIDR notation ("" until set up).
        rescue_ips: Mapping of node name -> assigned rescue IP.
        bridge: Interface the rescue IPs are added to.
        active: True once :meth:`setup_interactive` finished successfully.
    """

    def __init__(self, ssh: SSHManager):
        """Store the SSH manager and start with no rescue network configured."""
        self.ssh = ssh
        self.rescue_subnet: str = ""
        self.rescue_ips: dict[str, str] = {}  # node_name -> rescue_ip
        self.bridge: str = "vmbr0"
        self.active: bool = False

    def setup_interactive(self, corosync: CorosyncConfig) -> list[NodeInfo] | None:
        """Interactively set up the rescue network.

        Asks for the bridge and the rescue subnet, assigns one host address
        per corosync node, prints the command to run on each node, optionally
        applies the address on the local node, waits for the operator and
        finally tests reachability of every remote node.

        Args:
            corosync: Parsed corosync configuration; ``corosync.nodes`` must
                provide objects with ``name`` and ``ring0_addr``.

        Returns:
            The node list with the rescue IP as ``ssh_host``, or ``None`` if
            the operator aborted or the subnet is too small for all nodes.
        """
        self._print_intro()

        # Keep the answers in locals until they are validated, so an aborted
        # run does not overwrite the state of an already active network.
        bridge = input(f" Bridge für Emergency-IPs [{self.bridge}]: ").strip() or self.bridge
        subnet = self._ask_subnet()
        if subnet is None:
            print(" Abgebrochen.")
            return None

        cs_nodes = list(corosync.nodes)
        # zip() pulls only as many addresses as there are nodes. Materialising
        # subnet.hosts() would allocate millions of objects for a /8 and never
        # finish for an IPv6 /64.
        hosts = [str(host) for _, host in zip(cs_nodes, subnet.hosts())]
        cidr = subnet.prefixlen

        print()
        print(" " + "-" * 56)
        print(f" Emergency Subnetz: {subnet}")
        print(f" Bridge: {bridge}")
        print(" " + "-" * 56)
        print()

        # Check capacity before touching any state or printing commands.
        if len(hosts) < len(cs_nodes):
            print(" [!] FEHLER: Nicht genug IPs im Subnetz für alle Nodes!")
            return None

        self.bridge = bridge
        self.rescue_subnet = str(subnet)
        self.rescue_ips.clear()  # drop stale entries from an earlier run

        nodes = []
        for cs_node, rescue_ip in zip(cs_nodes, hosts):
            self.rescue_ips[cs_node.name] = rescue_ip
            nodes.append(NodeInfo(
                name=cs_node.name,
                current_ip=cs_node.ring0_addr,
                ssh_host=rescue_ip,  # use the rescue IP for SSH
            ))

            # Show the command the operator has to run on this node.
            cmd = f"ip addr add {rescue_ip}/{cidr} dev {self.bridge}"
            print(f" {cs_node.name} ({cs_node.ring0_addr}):")
            print(f" Rescue-IP: {rescue_ip}/{cidr}")
            print(f" Befehl: {cmd}")
            print()

        print(" " + "-" * 56)
        print()

        local_node = self._find_local_node(nodes)
        if local_node is not None:
            local_node.is_local = True
            # Commands for the local node never go over the network, so it
            # counts as reachable whether or not the rescue IP gets set.
            local_node.is_reachable = True
            self._offer_local_ip(local_node.name, cidr)

        # Wait for the operator to configure the other nodes.
        print()
        print(" " + "=" * 56)
        print(" Bitte führe jetzt die oben genannten Befehle auf den")
        print(" anderen Nodes aus (IPMI/iLO/iDRAC/KVM-Konsole).")
        print(" " + "=" * 56)
        print()
        input(" Drücke ENTER wenn alle Nodes konfiguriert sind...")

        if not self._test_connectivity(nodes):
            print()
            print(" [!] Nicht alle Nodes erreichbar!")
            answer = input(" Trotzdem fortfahren? [j/N]: ").strip().lower()
            if answer not in ('j', 'ja', 'y', 'yes'):
                self.cleanup(nodes)
                return None

        self.active = True
        print()
        print(" Rescue-Netzwerk aktiv. Migration kann starten.")
        return nodes

    @staticmethod
    def _print_intro() -> None:
        """Print the banner and the step-by-step description of the workflow."""
        print("\n" + "=" * 60)
        print(" RESCUE NETZWERK")
        print("=" * 60)
        print()
        print(" Dieses Feature richtet ein temporäres Netzwerk ein,")
        print(" damit alle Nodes sich wieder per SSH erreichen können.")
        print()
        print(" Ablauf:")
        print(" 1. Du gibst ein freies Subnetz an (z.B. 10.99.99.0/24)")
        print(" 2. Das Tool zeigt dir für jeden Node den Befehl an")
        print(" 3. Du führst die Befehle manuell auf jedem Node aus")
        print(" (z.B. über IPMI/iLO/iDRAC/KVM-Konsole)")
        print(" 4. Danach kann das Tool alle Nodes per SSH erreichen")
        print()

    @staticmethod
    def _ask_subnet() -> ipaddress.IPv4Network | ipaddress.IPv6Network | None:
        """Prompt until a valid subnet is entered; return None on empty input."""
        while True:
            raw = input(" Emergency Subnetz (z.B. 10.99.99.0/24): ").strip()
            if not raw:
                return None
            try:
                return ipaddress.ip_network(raw, strict=False)
            except ValueError as exc:
                print(f" Ungültiges Subnetz: {exc}")

    @staticmethod
    def _find_local_node(nodes):
        """Return the first node whose name matches this host, else None."""
        import socket

        hostname = socket.gethostname()
        # gethostname() may return an FQDN while node names are short names,
        # so accept both spellings.
        local_names = {hostname, hostname.split(".")[0]}
        return next((node for node in nodes if node.name in local_names), None)

    def _offer_local_ip(self, node_name: str, cidr: int) -> None:
        """Ask whether to add the rescue IP on the local node and do so."""
        rescue_ip = self.rescue_ips[node_name]
        print(f" Lokaler Node erkannt: {node_name}")
        answer = input(
            f" Emergency-IP {rescue_ip}/{cidr} auf {self.bridge} "
            f"automatisch setzen? [J/n]: "
        ).strip().lower()
        if answer in ('n', 'nein', 'no'):
            return

        # Run the command unmasked so its exit status is meaningful.
        rc, _, err = self.ssh.execute_local(
            f"ip addr add {rescue_ip}/{cidr} dev {self.bridge}"
        )
        if rc != 0:
            # "ip addr add" also fails when the address is already configured
            # (e.g. from an earlier run). That case is fine, so confirm it by
            # looking for the address on the bridge.
            rc, _, _ = self.ssh.execute_local(
                f"ip -o addr show dev {self.bridge} "
                f"| grep -qF ' {rescue_ip}/{cidr} '"
            )

        if rc == 0:
            print(f" -> {rescue_ip}/{cidr} auf {self.bridge} gesetzt")
        else:
            print(f" -> WARNUNG: {err}")

    def _test_connectivity(self, nodes) -> bool:
        """Probe every remote node via its rescue IP; return True if all answer."""
        print()
        print(" [Verbindungstest]")
        all_ok = True
        for node in nodes:
            if node.is_local:
                print(f" {node.name}: OK (lokal)")
                continue

            rescue_ip = self.rescue_ips[node.name]
            if self.ssh.is_reachable(rescue_ip):
                print(f" {node.name} ({rescue_ip}): OK")
                node.is_reachable = True
            else:
                print(f" {node.name} ({rescue_ip}): NICHT ERREICHBAR")
                all_ok = False
        return all_ok

    def cleanup(self, nodes: list[NodeInfo]):
        """Remove the emergency IPs from the given nodes.

        Does nothing when no rescue network was set up. The local node is
        handled with a local command. Reachable remote nodes are contacted
        via ``new_ip`` first (if set) and via the rescue IP as a fallback.
        Nodes that are neither local nor reachable are skipped and reported.
        The per-node status reflects the command's exit code. ``rescue_ips``
        is left untouched; ``active`` is reset to False.

        Args:
            nodes: Nodes as returned by :meth:`setup_interactive`.
        """
        if not self.active and not self.rescue_ips:
            return

        print("\n [Rescue] Emergency-IPs entfernen...")
        cidr = ipaddress.ip_network(self.rescue_subnet, strict=False).prefixlen

        all_removed = True
        for node in nodes:
            rescue_ip = self.rescue_ips.get(node.name)
            if rescue_ip is None:
                continue

            cmd = f"ip addr del {rescue_ip}/{cidr} dev {self.bridge} 2>/dev/null"

            rc = None  # None means "no attempt possible"
            if node.is_local:
                rc, _, _ = self.ssh.execute_local(cmd)
            elif node.is_reachable:
                # After a migration the node answers on its new IP; before it
                # only the rescue IP works, so try both in that order.
                targets = [node.new_ip, rescue_ip] if node.new_ip else [rescue_ip]
                for target in targets:
                    rc, _, _ = self.ssh.execute(target, cmd)
                    if rc == 0:
                        break

            if rc is None:
                status = "FEHLER (nicht erreichbar)"
            elif rc == 0:
                status = "entfernt"
            else:
                status = "FEHLER"
            all_removed = all_removed and rc == 0
            print(f" {node.name}: {rescue_ip}/{cidr} {status}")

        self.active = False
        if all_removed:
            print(" [Rescue] Emergency-IPs entfernt.")
        else:
            print(" [Rescue] Nicht alle Emergency-IPs konnten entfernt werden.")

    def get_rescue_commands(self, corosync: CorosyncConfig,
                            subnet: str, bridge: str = "vmbr0") -> list[dict]:
        """Generate rescue commands without interactive prompts.

        Nodes are paired with the subnet's host addresses in order. If the
        subnet has fewer host addresses than there are nodes, the surplus
        nodes are omitted from the result.

        Args:
            corosync: Parsed corosync configuration providing ``nodes``.
            subnet: Rescue subnet in CIDR notation (host bits are ignored).
            bridge: Interface name used in the generated commands.

        Returns:
            One dict per node with the keys ``name``, ``current_ip``, ``ip``,
            ``cidr``, ``command`` and ``remove_command``.

        Raises:
            ValueError: If ``subnet`` is not a valid network.
        """
        network = ipaddress.ip_network(subnet, strict=False)
        cidr = network.prefixlen
        commands = []

        # zip() stops at the shorter input and consumes hosts() lazily, so
        # even huge subnets are cheap.
        for cs_node, host in zip(corosync.nodes, network.hosts()):
            rescue_ip = str(host)
            commands.append({
                'name': cs_node.name,
                'current_ip': cs_node.ring0_addr,
                'ip': rescue_ip,
                'cidr': cidr,
                'command': f"ip addr add {rescue_ip}/{cidr} dev {bridge}",
                'remove_command': f"ip addr del {rescue_ip}/{cidr} dev {bridge}",
            })

        return commands
|