190 lines
7.1 KiB
Python
190 lines
7.1 KiB
Python
"""Phase 1: Discovery - Read current cluster configuration."""
|
|
|
|
import socket
|
|
from models import NodeInfo, CorosyncConfig, CephConfig
|
|
from config_parser import parse_corosync_conf, parse_ceph_conf, parse_network_interfaces
|
|
from ssh_manager import SSHManager
|
|
|
|
|
|
class Discovery:
    """Discovers current Proxmox cluster and Ceph configuration.

    Every file read, command execution and reachability probe goes through
    the injected SSH manager. Progress and findings are reported on stdout
    with ``print``.
    """

    def __init__(self, ssh: SSHManager) -> None:
        """Store the SSH manager and remember the local hostname.

        Args:
            ssh: Helper used for all file reads, command executions and
                reachability probes performed by this class.
        """
        self.ssh = ssh
        # Compared against corosync node names to identify the local node.
        self.local_hostname = socket.gethostname()

    def discover_corosync(self) -> CorosyncConfig | None:
        """Read and parse corosync.conf from the local node.

        Returns:
            The parsed configuration, or ``None`` when neither candidate
            path could be read.
        """
        # Try /etc/pve/corosync.conf first (cluster filesystem).
        ok, content = self.ssh.read_local_file("/etc/pve/corosync.conf")
        if not ok:
            # Fall back to the node-local corosync config.
            ok, content = self.ssh.read_local_file("/etc/corosync/corosync.conf")
        if not ok:
            # On failure the second tuple element is printed as the reason.
            print(f" [!] Corosync config nicht gefunden: {content}")
            return None

        config = parse_corosync_conf(content)
        print(f" Cluster: {config.cluster_name}")
        print(f" Transport: {config.transport}")
        print(f" Config Version: {config.config_version}")
        print(f" Nodes gefunden: {len(config.nodes)}")
        for node in config.nodes:
            print(f" - {node.name} (ID: {node.nodeid}) -> {node.ring0_addr}")
        return config

    def discover_ceph(self) -> CephConfig | None:
        """Read and parse ceph.conf.

        Returns:
            The parsed configuration, or ``None`` when neither candidate
            path could be read.
        """
        ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf")
        if not ok:
            ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf")
        if not ok:
            print(" [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)")
            return None

        config = parse_ceph_conf(content)
        print(f" FSID: {config.fsid}")
        print(f" Public Network: {config.public_network}")
        print(f" Cluster Network: {config.cluster_network}")
        if config.mon_hosts:
            print(f" MON Hosts: {', '.join(config.mon_hosts)}")
        if config.mon_sections:
            print(f" MON Sections: {', '.join(config.mon_sections.keys())}")
        return config

    def discover_nodes(self, corosync: CorosyncConfig) -> list[NodeInfo]:
        """Build node list from corosync config and check reachability.

        Each remote node is probed via its ``ring0_addr`` first and, if that
        fails, via its node name. Reachable nodes additionally get their
        network and hosts files read.

        Args:
            corosync: Parsed corosync configuration whose ``nodes`` are
                turned into ``NodeInfo`` objects.

        Returns:
            One ``NodeInfo`` per corosync node, in configuration order.
        """
        nodes = []
        for cs_node in corosync.nodes:
            # NOTE(review): exact string comparison - if gethostname() yields
            # an FQDN while corosync uses short names, the local node would
            # be treated as remote. Confirm against the deployment.
            is_local = cs_node.name == self.local_hostname
            node = NodeInfo(
                name=cs_node.name,
                current_ip=cs_node.ring0_addr,
                ssh_host=cs_node.ring0_addr,
                is_local=is_local,
            )

            if is_local:
                # The local node is never probed.
                node.is_reachable = True
            else:
                node.is_reachable = self.ssh.is_reachable(cs_node.ring0_addr)
                # Try to reach by hostname if the IP does not work.
                if not node.is_reachable and self.ssh.is_reachable(cs_node.name):
                    node.is_reachable = True
                    node.ssh_host = cs_node.name

            if node.is_reachable:
                self._read_node_configs(node)

            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if is_local else ""
            print(f" {node.name}: {node.current_ip} - {status}{local_tag}")

            nodes.append(node)

        return nodes

    def discover_nodes_with_overrides(self, corosync: CorosyncConfig,
                                      override_nodes: list[NodeInfo]) -> list[NodeInfo]:
        """Re-discover nodes using override SSH hosts (e.g. rescue IPs).

        Takes pre-configured nodes (with rescue IPs as ssh_host) and
        reads their configs.

        Args:
            corosync: Not used by this method; kept in the signature for
                existing callers.
            override_nodes: Nodes whose ``ssh_host`` and ``is_reachable``
                were already set by the caller.

        Returns:
            The same ``override_nodes`` list; reachable entries have been
            updated in place.
        """
        print("\n[Nodes - via Rescue-Netzwerk]")
        for node in override_nodes:
            if node.is_reachable:
                self._read_node_configs(node)

            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if node.is_local else ""
            via = f" via {node.ssh_host}" if not node.is_local else ""
            print(f" {node.name}: {node.current_ip}{via} - {status}{local_tag}")

        return override_nodes

    def _read_node_configs(self, node: NodeInfo) -> None:
        """Read network interfaces and hosts from a node.

        A file that cannot be read is skipped silently; the corresponding
        attributes on ``node`` are then left untouched.

        Args:
            node: Node to read from; updated in place.
        """
        # Read /etc/network/interfaces.
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/network/interfaces", node.is_local
        )
        if ok:
            node.network_interfaces_content = content
            node.interfaces = parse_network_interfaces(content)

        # Read /etc/hosts.
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/hosts", node.is_local
        )
        if ok:
            node.hosts_content = content

    def check_quorum(self) -> bool:
        """Check if the cluster currently has quorum.

        Quorum is only reported when ``pvecm status`` succeeds, its output
        does not indicate blocked activity, and a write probe on /etc/pve
        succeeds.

        Returns:
            ``True`` when /etc/pve turned out to be writable, else ``False``.
        """
        rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null")
        if rc != 0:
            print(" [!] pvecm status fehlgeschlagen - kein Quorum oder kein Cluster")
            return False

        # pvecm aligns its columns ("Quorate:          Yes"), so collapse
        # whitespace runs before looking for the key/value pair; a literal
        # single-space match would never hit the padded output.
        normalized = " ".join(stdout.split())
        if "Quorate: Yes" in normalized or "Activity blocked" not in stdout:
            # Also check if /etc/pve is writable.
            rc2, _, _ = self.ssh.execute_local(
                "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test"
            )
            if rc2 == 0:
                print(" Quorum: JA (/etc/pve ist beschreibbar)")
                return True

        print(" Quorum: NEIN (/etc/pve ist read-only!)")
        return False

    def check_ceph_health(self) -> str | None:
        """Get current Ceph health status.

        Returns:
            The stripped output of ``ceph health``, or ``None`` when the
            command exits with a non-zero status.
        """
        rc, stdout, _ = self.ssh.execute_local("ceph health 2>/dev/null")
        if rc == 0:
            status = stdout.strip()
            print(f" Ceph Health: {status}")
            return status
        return None

    def run(self) -> tuple[CorosyncConfig | None, CephConfig | None,
                           list[NodeInfo], bool]:
        """Run full discovery.

        Returns: (corosync_config, ceph_config, nodes, has_quorum)

        When no corosync configuration (or no node in it) is found, the
        remaining steps are skipped and ``(None, None, [], False)`` is
        returned.
        """
        print("\n=== Phase 1: Discovery ===\n")

        print("[Corosync]")
        corosync = self.discover_corosync()
        if not corosync or not corosync.nodes:
            print("FEHLER: Konnte keine Corosync-Konfiguration lesen!")
            return None, None, [], False

        print("\n[Ceph]")
        ceph = self.discover_ceph()

        print("\n[Nodes]")
        nodes = self.discover_nodes(corosync)

        print("\n[Cluster Status]")
        has_quorum = self.check_quorum()

        # Ceph health is only queried when a Ceph config was found.
        if ceph:
            print("\n[Ceph Health]")
            self.check_ceph_health()

        unreachable = [n for n in nodes if not n.is_reachable]
        if unreachable:
            print(f"\n[!] WARNUNG: {len(unreachable)} Node(s) nicht erreichbar:")
            for n in unreachable:
                print(f" - {n.name} ({n.current_ip})")
            print(" Diese Nodes wurden möglicherweise bereits manuell geändert.")
            print(" Das Tool wird versuchen, sie über ihren Hostnamen zu erreichen.")

        return corosync, ceph, nodes, has_quorum
|