proxmox-cluster-network-cha.../discovery.py

"""Phase 1: Discovery - Read current cluster configuration."""

import socket
from models import NodeInfo, CorosyncConfig, CephConfig
from config_parser import parse_corosync_conf, parse_ceph_conf, parse_network_interfaces
from ssh_manager import SSHManager


class Discovery:
    """Discovers current Proxmox cluster and Ceph configuration.

    All file reads and command executions go through the injected
    ``SSHManager``. Findings are printed to stdout as they are collected.
    """

    def __init__(self, ssh: SSHManager):
        """Store the SSH helper and remember this machine's hostname."""
        self.ssh = ssh
        self.local_hostname = socket.gethostname()

    def _is_local_node(self, node_name: str) -> bool:
        """Return True if *node_name* names the machine this tool runs on.

        The kernel hostname may be a fully qualified name while cluster
        node names are the short form, so both the exact hostname and the
        hostname with its domain part stripped are accepted.
        """
        local = self.local_hostname
        short_local = local.split(".", 1)[0]
        return node_name == local or node_name == short_local

    def discover_corosync(self) -> CorosyncConfig | None:
        """Read and parse corosync.conf from the local node.

        Returns the parsed config, or None if neither the cluster-filesystem
        copy nor the local copy could be read.
        """
        # Try /etc/pve/corosync.conf first (cluster filesystem)
        ok, content = self.ssh.read_local_file("/etc/pve/corosync.conf")
        if not ok:
            # Fallback to local corosync config
            ok, content = self.ssh.read_local_file("/etc/corosync/corosync.conf")
        if not ok:
            # On failure, content carries the detail of the last failed read.
            print(f"  [!] Corosync config nicht gefunden: {content}")
            return None

        config = parse_corosync_conf(content)
        print(f"  Cluster: {config.cluster_name}")
        print(f"  Transport: {config.transport}")
        print(f"  Config Version: {config.config_version}")
        print(f"  Nodes gefunden: {len(config.nodes)}")
        for node in config.nodes:
            print(f"    - {node.name} (ID: {node.nodeid}) -> {node.ring0_addr}")
        return config

    def discover_ceph(self) -> CephConfig | None:
        """Read and parse ceph.conf.

        Returns the parsed config, or None if neither /etc/pve/ceph.conf nor
        /etc/ceph/ceph.conf could be read.
        """
        ok, content = self.ssh.read_local_file("/etc/pve/ceph.conf")
        if not ok:
            ok, content = self.ssh.read_local_file("/etc/ceph/ceph.conf")
        if not ok:
            print("  [!] Ceph config nicht gefunden (Ceph evtl. nicht installiert)")
            return None

        config = parse_ceph_conf(content)
        print(f"  FSID: {config.fsid}")
        print(f"  Public Network: {config.public_network}")
        print(f"  Cluster Network: {config.cluster_network}")
        if config.mon_hosts:
            print(f"  MON Hosts: {', '.join(config.mon_hosts)}")
        if config.mon_sections:
            print(f"  MON Sections: {', '.join(config.mon_sections.keys())}")
        return config

    def discover_nodes(self, corosync: CorosyncConfig) -> list[NodeInfo]:
        """Build node list from corosync config and check reachability.

        Each corosync node becomes a ``NodeInfo`` whose ``ssh_host`` starts
        as its ring0 address. The local node is always considered reachable;
        remote nodes are probed by ring0 address first and by node name as a
        fallback. Reachable nodes get their network config and hosts file
        read.
        """
        nodes = []
        for cs_node in corosync.nodes:
            is_local = self._is_local_node(cs_node.name)
            node = NodeInfo(
                name=cs_node.name,
                current_ip=cs_node.ring0_addr,
                ssh_host=cs_node.ring0_addr,
                is_local=is_local,
            )

            # Check reachability
            if is_local:
                node.is_reachable = True
            else:
                node.is_reachable = self.ssh.is_reachable(cs_node.ring0_addr)

            # Try to reach by hostname if IP doesn't work
            if not node.is_reachable and not is_local:
                if self.ssh.is_reachable(cs_node.name):
                    node.is_reachable = True
                    # Later accesses to this node must use the working name.
                    node.ssh_host = cs_node.name

            if node.is_reachable:
                self._read_node_configs(node)

            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if is_local else ""
            print(f"  {node.name}: {node.current_ip} - {status}{local_tag}")

            nodes.append(node)

        return nodes

    def discover_nodes_with_overrides(self, corosync: CorosyncConfig,
                                       override_nodes: list[NodeInfo]) -> list[NodeInfo]:
        """Re-discover nodes using override SSH hosts (e.g. rescue IPs).

        Takes pre-configured nodes (with rescue IPs as ssh_host) and
        reads their configs. Reachability is not re-probed here; the
        ``is_reachable`` flag already set on each node is trusted.

        ``corosync`` is currently unused and kept for interface
        compatibility. The same list object is returned after its nodes
        have been updated in place.
        """
        print("\n[Nodes - via Rescue-Netzwerk]")
        for node in override_nodes:
            if node.is_reachable:
                self._read_node_configs(node)

            status = "erreichbar" if node.is_reachable else "NICHT ERREICHBAR"
            local_tag = " (lokal)" if node.is_local else ""
            via = f" via {node.ssh_host}" if not node.is_local else ""
            print(f"  {node.name}: {node.current_ip}{via} - {status}{local_tag}")

        return override_nodes

    def _read_node_configs(self, node: NodeInfo) -> None:
        """Read network interfaces and hosts from a node.

        Populates ``network_interfaces_content``, ``interfaces`` and
        ``hosts_content`` on *node*. A failed read leaves the corresponding
        fields untouched and prints a warning instead of failing silently.
        """
        # Read /etc/network/interfaces
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/network/interfaces", node.is_local
        )
        if ok:
            node.network_interfaces_content = content
            node.interfaces = parse_network_interfaces(content)
        else:
            # Assumes the second return value carries error detail on
            # failure, as with read_local_file - TODO confirm.
            print(f"  [!] {node.name}: /etc/network/interfaces nicht lesbar: {content}")

        # Read /etc/hosts
        ok, content = self.ssh.read_node_file(
            node.ssh_host, "/etc/hosts", node.is_local
        )
        if ok:
            node.hosts_content = content
        else:
            print(f"  [!] {node.name}: /etc/hosts nicht lesbar: {content}")

    def check_quorum(self) -> bool:
        """Check if the cluster currently has quorum.

        Quorum is only reported when ``pvecm status`` succeeds, its output
        does not indicate blocked activity, and a write test on /etc/pve
        succeeds.
        """
        rc, stdout, _ = self.ssh.execute_local("pvecm status 2>/dev/null")
        if rc != 0:
            print("  [!] pvecm status fehlgeschlagen - kein Quorum oder kein Cluster")
            return False

        # Deliberately lenient: output that neither confirms quorum nor
        # reports blocked activity still proceeds to the write test below,
        # which is the decisive check.
        if "Quorate:          Yes" in stdout or "Activity blocked" not in stdout:
            # Also check if /etc/pve is writable
            rc2, _, _ = self.ssh.execute_local(
                "touch /etc/pve/.migration_test && rm -f /etc/pve/.migration_test"
            )
            if rc2 == 0:
                print("  Quorum: JA (/etc/pve ist beschreibbar)")
                return True

        print("  Quorum: NEIN (/etc/pve ist read-only!)")
        return False

    def check_ceph_health(self) -> str | None:
        """Get current Ceph health status.

        Returns the stripped output of ``ceph health``, or None if the
        command exits with a non-zero status.
        """
        rc, stdout, _ = self.ssh.execute_local("ceph health 2>/dev/null")
        if rc == 0:
            status = stdout.strip()
            print(f"  Ceph Health: {status}")
            return status
        return None

    def run(self) -> tuple[CorosyncConfig | None, CephConfig | None,
                           list[NodeInfo], bool]:
        """Run full discovery.

        Returns: (corosync_config, ceph_config, nodes, has_quorum)

        If the corosync configuration cannot be read or lists no nodes,
        discovery stops early and ``(None, None, [], False)`` is returned.
        """
        print("\n=== Phase 1: Discovery ===\n")

        print("[Corosync]")
        corosync = self.discover_corosync()
        if not corosync or not corosync.nodes:
            print("FEHLER: Konnte keine Corosync-Konfiguration lesen!")
            return None, None, [], False

        print("\n[Ceph]")
        ceph = self.discover_ceph()

        print("\n[Nodes]")
        nodes = self.discover_nodes(corosync)

        print("\n[Cluster Status]")
        has_quorum = self.check_quorum()

        # Ceph health is only queried when a Ceph config was found.
        if ceph:
            print("\n[Ceph Health]")
            self.check_ceph_health()

        unreachable = [n for n in nodes if not n.is_reachable]
        if unreachable:
            print(f"\n[!] WARNUNG: {len(unreachable)} Node(s) nicht erreichbar:")
            for n in unreachable:
                print(f"    - {n.name} ({n.current_ip})")
            print("    Diese Nodes wurden möglicherweise bereits manuell geändert.")
            print("    Das Tool wird versuchen, sie über ihren Hostnamen zu erreichen.")

        return corosync, ceph, nodes, has_quorum