proxmox-cluster-network-cha.../discovery.py

190 lines
7.1 KiB
Python

"""Phase 1: Discovery - Read current cluster configuration."""
import socket
from models import NodeInfo, CorosyncConfig, CephConfig
from config_parser import parse_corosync_conf, parse_ceph_conf, parse_network_interfaces
from ssh_manager import SSHManager
class Discovery:
    """Discovers the current Proxmox cluster and Ceph configuration.

    Every file read and command goes through the injected ``SSHManager``;
    progress and warnings are reported on stdout via ``print``.
    """

    def __init__(self, ssh: SSHManager):
        """Store the SSH helper and remember this machine's hostname.

        Args:
            ssh: Helper used for all local/remote file reads and commands.
        """
        self.ssh = ssh
        # Used by discover_nodes() to recognise the local node's entry.
        self.local_hostname = socket.gethostname()

    def _is_local_node(self, node_name: str) -> bool:
        """Return True if *node_name* names the machine we are running on.

        ``socket.gethostname()`` may return either a short name or a fully
        qualified one, so the node name is compared against the hostname
        as-is and against its first DNS label.
        """
        hostname = self.local_hostname
        return node_name in (hostname, hostname.split(".", 1)[0])

    def discover_corosync(self) -> CorosyncConfig | None:
        """Read and parse corosync.conf from the local node.

        Tries ``/etc/pve/corosync.conf`` (cluster filesystem) first and
        falls back to ``/etc/corosync/corosync.conf``.

        Returns:
            The parsed configuration, or ``None`` if neither file could be
            read.
        """
        ok, content = self.ssh.read_local_file("/etc/pve/corosync.conf")
        if not ok:
            # Fallback to the node-local corosync config.
            ok, content = self.ssh.read_local_file("/etc/corosync/corosync.conf")
        if not ok:
            # On failure the second tuple element is reported as the reason.
            print(f" [!] Corosync config nicht gefunden: {content}")
            return None

        config = parse_corosync_conf(content)
        print(f" Cluster: {config.cluster_name}")
        print(f" Transport: {config.transport}")
        print(f" Config Version: {config.config_version}")
        print(f" Nodes gefunden: {len(config.nodes)}")
        for node in config.nodes:
            print(f" - {node.name} (ID: {node.nodeid}) -> {node.ring0_addr}")
        return config

    def discover_ceph(self) -> CephConfig | None:
        """Read and parse ceph.conf.

        Tries ``/etc/pve/ceph.conf`` first and falls back to
        ``/etc/ceph/ceph.conf``.

        Returns:
            The parsed configuration, or ``None`` if neither file could be
            read (Ceph may simply not be installed).
        """
        ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf")
        if not ok:
            ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf")
        if not ok:
            print(" [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)")
            return None

        config = parse_ceph_conf(content)
        print(f" FSID: {config.fsid}")
        print(f" Public Network: {config.public_network}")
        print(f" Cluster Network: {config.cluster_network}")
        if config.mon_hosts:
            print(f" MON Hosts: {', '.join(config.mon_hosts)}")
        if config.mon_sections:
            print(f" MON Sections: {', '.join(config.mon_sections.keys())}")
        return config

    def discover_nodes(self, corosync: CorosyncConfig) -> list[NodeInfo]:
        """Build the node list from the corosync config and check reachability.

        The local node is always treated as reachable. Remote nodes are
        probed via their ``ring0_addr`` first and, if that fails, via their
        node name; ``ssh_host`` is switched to the name when only the name
        works. Network configs are read from every reachable node.

        Args:
            corosync: Parsed corosync configuration providing ``nodes``.

        Returns:
            One ``NodeInfo`` per corosync node, in corosync order.
        """
        nodes = []
        for cs_node in corosync.nodes:
            is_local = self._is_local_node(cs_node.name)
            node = NodeInfo(
                name=cs_node.name,
                current_ip=cs_node.ring0_addr,
                ssh_host=cs_node.ring0_addr,
                is_local=is_local,
            )

            if is_local:
                node.is_reachable = True
            else:
                node.is_reachable = self.ssh.is_reachable(cs_node.ring0_addr)
                # Try to reach the node by hostname if the IP doesn't work.
                if not node.is_reachable and self.ssh.is_reachable(cs_node.name):
                    node.is_reachable = True
                    node.ssh_host = cs_node.name

            if node.is_reachable:
                self._read_node_configs(node)

            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if is_local else ""
            print(f" {node.name}: {node.current_ip} - {status}{local_tag}")
            nodes.append(node)
        return nodes

    def discover_nodes_with_overrides(self, corosync: CorosyncConfig,
                                      override_nodes: list[NodeInfo]) -> list[NodeInfo]:
        """Re-discover nodes using override SSH hosts (e.g. rescue IPs).

        Takes pre-configured nodes (with rescue IPs as ``ssh_host``) and
        reads the configs of those already flagged reachable; reachability
        itself is not re-probed here.

        Args:
            corosync: Not used by this method; kept so the signature stays
                compatible with existing callers.
            override_nodes: Nodes to refresh in place.

        Returns:
            The same ``override_nodes`` list that was passed in.
        """
        print("\n[Nodes - via Rescue-Netzwerk]")
        for node in override_nodes:
            if node.is_reachable:
                self._read_node_configs(node)
            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if node.is_local else ""
            via = f" via {node.ssh_host}" if not node.is_local else ""
            print(f" {node.name}: {node.current_ip}{via} - {status}{local_tag}")
        return override_nodes

    def _read_node_configs(self, node: NodeInfo):
        """Read network interfaces and hosts from a node.

        Best effort: a file that cannot be read leaves the corresponding
        attributes of *node* untouched.
        """
        # Read /etc/network/interfaces
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/network/interfaces", node.is_local
        )
        if ok:
            node.network_interfaces_content = content
            node.interfaces = parse_network_interfaces(content)

        # Read /etc/hosts
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/hosts", node.is_local
        )
        if ok:
            node.hosts_content = content

    def check_quorum(self) -> bool:
        """Check if the cluster currently has quorum.

        Quorum is only confirmed when ``pvecm status`` succeeds, its output
        does not rule quorum out, and a write probe on ``/etc/pve``
        succeeds.

        Returns:
            ``True`` if the write probe succeeded, ``False`` otherwise.
        """
        rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null")
        if rc != 0:
            print(" [!] pvecm status fehlgeschlagen - kein Quorum oder kein Cluster")
            return False

        # pvecm pads its columns ("Quorate:          Yes"), so collapse
        # whitespace runs before matching; a literal single-space match
        # would never hit the aligned output.
        normalized = " ".join(stdout.split())
        reports_quorate = "Quorate: Yes" in normalized
        activity_blocked = "Activity blocked" in normalized

        # Deliberately lenient: either indicator lets us proceed, because
        # the write probe below is the decisive check.
        if reports_quorate or not activity_blocked:
            # Also check if /etc/pve is writable
            rc2, _, _ = self.ssh.execute_local(
                "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test"
            )
            if rc2 == 0:
                print(" Quorum: JA (/etc/pve ist beschreibbar)")
                return True

        print(" Quorum: NEIN (/etc/pve ist read-only!)")
        return False

    def check_ceph_health(self) -> str | None:
        """Get the current Ceph health status.

        Returns:
            The stripped output of ``ceph health``, or ``None`` if the
            command exited non-zero.
        """
        rc, stdout, _ = self.ssh.execute_local("ceph health 2>/dev/null")
        if rc != 0:
            return None
        status = stdout.strip()
        print(f" Ceph Health: {status}")
        return status

    def run(self) -> tuple[CorosyncConfig | None, CephConfig | None,
                           list[NodeInfo], bool]:
        """Run full discovery.

        Returns:
            ``(corosync_config, ceph_config, nodes, has_quorum)``. If the
            corosync configuration is missing or lists no nodes, discovery
            stops early and ``(None, None, [], False)`` is returned.
        """
        print("\n=== Phase 1: Discovery ===\n")

        print("[Corosync]")
        corosync = self.discover_corosync()
        if not corosync or not corosync.nodes:
            print("FEHLER: Konnte keine Corosync-Konfiguration lesen!")
            return None, None, [], False

        print("\n[Ceph]")
        ceph = self.discover_ceph()

        print("\n[Nodes]")
        nodes = self.discover_nodes(corosync)

        print("\n[Cluster Status]")
        has_quorum = self.check_quorum()

        if ceph:
            # Health is only reported on stdout here; the value is not returned.
            print("\n[Ceph Health]")
            self.check_ceph_health()

        unreachable = [n for n in nodes if not n.is_reachable]
        if unreachable:
            print(f"\n[!] WARNUNG: {len(unreachable)} Node(s) nicht erreichbar:")
            for n in unreachable:
                print(f" - {n.name} ({n.current_ip})")
            print(" Diese Nodes wurden möglicherweise bereits manuell geändert.")
            print(" Das Tool wird versuchen, sie über ihren Hostnamen zu erreichen.")

        return corosync, ceph, nodes, has_quorum