"""Phase 1: Discovery - Read current cluster configuration.""" import socket from models import NodeInfo, CorosyncConfig, CephConfig from config_parser import parse_corosync_conf, parse_ceph_conf, parse_network_interfaces from ssh_manager import SSHManager class Discovery: """Discovers current Proxmox cluster and Ceph configuration.""" def __init__(self, ssh: SSHManager): self.ssh = ssh self.local_hostname = socket.gethostname() def discover_corosync(self) -> CorosyncConfig | None: """Read and parse corosync.conf from the local node.""" # Try /etc/pve/corosync.conf first (cluster filesystem) ok, content = self.ssh.read_local_file("/etc/pve/corosync.conf") if not ok: # Fallback to local corosync config ok, content = self.ssh.read_local_file("/etc/corosync/corosync.conf") if not ok: print(f" [!] Corosync config nicht gefunden: {content}") return None config = parse_corosync_conf(content) print(f" Cluster: {config.cluster_name}") print(f" Transport: {config.transport}") print(f" Config Version: {config.config_version}") print(f" Nodes gefunden: {len(config.nodes)}") for node in config.nodes: print(f" - {node.name} (ID: {node.nodeid}) -> {node.ring0_addr}") return config def discover_ceph(self) -> CephConfig | None: """Read and parse ceph.conf.""" ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf") if not ok: ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf") if not ok: print(" [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)") return None config = parse_ceph_conf(content) print(f" FSID: {config.fsid}") print(f" Public Network: {config.public_network}") print(f" Cluster Network: {config.cluster_network}") if config.mon_hosts: print(f" MON Hosts: {', '.join(config.mon_hosts)}") if config.mon_sections: print(f" MON Sections: {', '.join(config.mon_sections.keys())}") return config def discover_nodes(self, corosync: CorosyncConfig) -> list[NodeInfo]: """Build node list from corosync config and check reachability.""" nodes = [] for cs_node in corosync.nodes: is_local = (cs_node.name == self.local_hostname) node = NodeInfo( name=cs_node.name, current_ip=cs_node.ring0_addr, ssh_host=cs_node.ring0_addr, is_local=is_local, ) # Check reachability if is_local: node.is_reachable = True else: node.is_reachable = self.ssh.is_reachable(cs_node.ring0_addr) # Try to reach by hostname if IP doesn't work if not node.is_reachable and not is_local: if self.ssh.is_reachable(cs_node.name): node.is_reachable = True node.ssh_host = cs_node.name if node.is_reachable: self._read_node_configs(node) status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR" local_tag = " (lokal)" if is_local else "" print(f" {node.name}: {node.current_ip} - {status}{local_tag}") nodes.append(node) return nodes def discover_nodes_with_overrides(self, corosync: CorosyncConfig, override_nodes: list[NodeInfo]) -> list[NodeInfo]: """Re-discover nodes using override SSH hosts (e.g. rescue IPs). Takes pre-configured nodes (with rescue IPs as ssh_host) and reads their configs. """ print("\n[Nodes - via Rescue-Netzwerk]") for node in override_nodes: if node.is_reachable: self._read_node_configs(node) status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR" local_tag = " (lokal)" if node.is_local else "" via = f" via {node.ssh_host}" if not node.is_local else "" print(f" {node.name}: {node.current_ip}{via} - {status}{local_tag}") return override_nodes def _read_node_configs(self, node: NodeInfo): """Read network interfaces and hosts from a node.""" # Read /etc/network/interfaces ok, content = self.ssh.read_node_file( node.ssh_host, "/etc/network/interfaces", node.is_local ) if ok: node.network_interfaces_content = content node.interfaces = parse_network_interfaces(content) # Read /etc/hosts ok, content = self.ssh.read_node_file( node.ssh_host, "/etc/hosts", node.is_local ) if ok: node.hosts_content = content def check_quorum(self) -> bool: """Check if the cluster currently has quorum.""" rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null") if rc != 0: print(" [!] pvecm status fehlgeschlagen - kein Quorum oder kein Cluster") return False if "Quorate: Yes" in stdout or "Activity blocked" not in stdout: # Also check if /etc/pve is writable rc2, _, _ = self.ssh.execute_local( "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test" ) if rc2 == 0: print(" Quorum: JA (/etc/pve ist beschreibbar)") return True print(" Quorum: NEIN (/etc/pve ist read-only!)") return False def check_ceph_health(self) -> str | None: """Get current Ceph health status.""" rc, stdout, _ = self.ssh.execute_local("ceph health 2>/dev/null") if rc == 0: status = stdout.strip() print(f" Ceph Health: {status}") return status return None def run(self) -> tuple[CorosyncConfig | None, CephConfig | None, list[NodeInfo], bool]: """Run full discovery. Returns: (corosync_config, ceph_config, nodes, has_quorum) """ print("\n=== Phase 1: Discovery ===\n") print("[Corosync]") corosync = self.discover_corosync() if not corosync or not corosync.nodes: print("FEHLER: Konnte keine Corosync-Konfiguration lesen!") return None, None, [], False print("\n[Ceph]") ceph = self.discover_ceph() print("\n[Nodes]") nodes = self.discover_nodes(corosync) print("\n[Cluster Status]") has_quorum = self.check_quorum() if ceph: print("\n[Ceph Health]") self.check_ceph_health() unreachable = [n for n in nodes if not n.is_reachable] if unreachable: print(f"\n[!] WARNUNG: {len(unreachable)} Node(s) nicht erreichbar:") for n in unreachable: print(f" - {n.name} ({n.current_ip})") print(" Diese Nodes wurden möglicherweise bereits manuell geändert.") print(" Das Tool wird versuchen, sie über ihren Hostnamen zu erreichen.") return corosync, ceph, nodes, has_quorum