From 23c16c00ecf2cddf67f8134167035ea5189704f0 Mon Sep 17 00:00:00 2001 From: xin liang Date: Fri, 14 Jun 2024 14:40:31 +0800 Subject: [PATCH] Dev: ui_sbd: Add new 'crm sbd' sublevel (jsc#PED-8256) ** Motivation The main configurations for sbd use cases are scattered among sysconfig, on-disk metadata, CIB, and could even be related to other OS components, e.g. coredump, SCSI, multipath. It's desirable to reduce the management complexity among them and to streamline the workflow for the main use case scenarios. ** Changes include **** Disk-based SBD scenarios 1. Show usage on syntax error 2. Completion 3. Display SBD-related configuration (UC4 in PED-8256) 4. Change the on-disk metadata of the existing sbd disks (UC2.1 in PED-8256) 5. Add a sbd disk with the existing sbd configuration (UC2.2 in PED-8256) 6. Remove a sbd disk (UC2.3 in PED-8256) 7. Remove sbd from cluster 8. Replace the storage for a sbd disk (UC2.4 in PED-8256) 9. Display status (focusing on the runtime information only) (UC5 in PED-8256) **** Diskless SBD scenarios 1. Show usage on syntax error (diskless) 2. Completion (diskless) 3. Display SBD-related configuration (UC4 in PED-8256, diskless) 4. 
Manipulate the basic diskless sbd configuration (UC3.1 in PED-8256) --- crmsh/bootstrap.py | 7 +- crmsh/sbd.py | 139 ++++++++++---- crmsh/ui_root.py | 5 + crmsh/ui_sbd.py | 452 +++++++++++++++++++++++++++++++++++++++++++++ crmsh/utils.py | 21 ++- crmsh/watchdog.py | 22 ++- 6 files changed, 603 insertions(+), 43 deletions(-) create mode 100644 crmsh/ui_sbd.py diff --git a/crmsh/bootstrap.py b/crmsh/bootstrap.py index f68ffaa9ec..40d227a0dd 100644 --- a/crmsh/bootstrap.py +++ b/crmsh/bootstrap.py @@ -2785,7 +2785,12 @@ def sync_file(path): """ Sync files between cluster nodes """ - if _context.skip_csync2: + if _context: + skip_csync2 = _context.skip_csync2 + else: + skip_csync2 = not ServiceManager().service_is_active(CSYNC2_SERVICE) + + if skip_csync2: utils.cluster_copy_file(path, nodes=_context.node_list_in_cluster, output=False) else: csync2_update(path) diff --git a/crmsh/sbd.py b/crmsh/sbd.py index d7f569e687..cf25bc4547 100644 --- a/crmsh/sbd.py +++ b/crmsh/sbd.py @@ -1,5 +1,6 @@ import os import re +import typing from . import utils, sh from . 
import bootstrap from .bootstrap import SYSCONFIG_SBD, SBD_SYSTEMD_DELAY_START_DIR @@ -68,6 +69,17 @@ def _set_sbd_msgwait(self): sbd_msgwait = sbd_msgwait_default self.sbd_msgwait = sbd_msgwait + @classmethod + def get_advised_sbd_timeout(cls) -> typing.Tuple[int, int]: + """ + Get suitable sbd_watchdog_timeout and sbd_msgwait + """ + ctx = bootstrap.Context() + ctx.load_profiles() + time_inst = cls(ctx) + time_inst.initialize_timeout() + return time_inst.sbd_watchdog_timeout, time_inst.sbd_msgwait + def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self): """ When using diskless SBD with Qdevice, adjust value of sbd_watchdog_timeout @@ -85,17 +97,34 @@ def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self): logger.warning("sbd_watchdog_timeout is set to {} for qdevice, it was {}".format(self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE, self.sbd_watchdog_timeout)) self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE + @staticmethod + def get_sbd_device_metadata(dev, timeout_only=False) -> dict: + """ + Extract metadata from sbd device header + """ + sbd_info = {} + try: + out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev)) + except: + return sbd_info + pattern = r"UUID\s+:\s+(\S+)|Timeout\s+\((\w+)\)\s+:\s+(\d+)" + matches = re.findall(pattern, out) + for uuid, timeout_type, timeout_value in matches: + if uuid and not timeout_only: + sbd_info["uuid"] = uuid + elif timeout_type and timeout_value: + sbd_info[timeout_type] = int(timeout_value) + return sbd_info + @staticmethod def get_sbd_msgwait(dev): """ Get msgwait for sbd device """ - out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev)) - # Format like "Timeout (msgwait) : 30" - res = re.search("\(msgwait\)\s+:\s+(\d+)", out) + res = SBDTimeout.get_sbd_device_metadata(dev).get("msgwait") if not res: - raise ValueError("Cannot get sbd msgwait for {}".format(dev)) - return int(res.group(1)) + raise ValueError(f"Cannot 
get sbd msgwait for {dev}") + return res @staticmethod def get_sbd_watchdog_timeout(): @@ -195,6 +224,12 @@ def is_sbd_delay_start(): res = SBDManager.get_sbd_value_from_config("SBD_DELAY_START") return res and res != "no" + @staticmethod + def get_sbd_systemd_start_timeout() -> int: + cmd = "systemctl show -p TimeoutStartUSec sbd --value" + out = sh.cluster_shell().get_stdout_or_raise_error(cmd) + return utils.get_systemd_timeout_start_in_sec(out) + def adjust_systemd_start_timeout(self): """ Adjust start timeout for sbd when set SBD_DELAY_START @@ -203,9 +238,7 @@ def adjust_systemd_start_timeout(self): if sbd_delay_start_value == "no": return - cmd = "systemctl show -p TimeoutStartUSec sbd --value" - out = sh.cluster_shell().get_stdout_or_raise_error(cmd) - start_timeout = utils.get_systemd_timeout_start_in_sec(out) + start_timeout = SBDTimeout.get_sbd_systemd_start_timeout() if start_timeout > int(sbd_delay_start_value): return @@ -269,6 +302,7 @@ class SBDManager(object): DISKLESS_CRM_CMD = "crm configure property stonith-enabled=true stonith-watchdog-timeout={} stonith-timeout={}" SBD_RA = "stonith:fence_sbd" SBD_RA_ID = "stonith-sbd" + SBD_DEVICE_MAX = 3 def __init__(self, context): """ @@ -292,11 +326,10 @@ def _get_device_uuid(dev, node=None): """ Get UUID for specific device and node """ - out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev), node) - res = re.search("UUID\s*:\s*(.*)\n", out) + res = SBDTimeout.get_sbd_device_metadata(dev).get("uuid") if not res: raise ValueError("Cannot find sbd device UUID for {}".format(dev)) - return res.group(1) + return res def _compare_device_uuid(self, dev, node_list): """ @@ -314,8 +347,8 @@ def _verify_sbd_device(self, dev_list, compare_node_list=[]): """ Verify sbd device """ - if len(dev_list) > 3: - raise ValueError("Maximum number of SBD device is 3") + if len(dev_list) > self.SBD_DEVICE_MAX: + raise ValueError(f"Maximum number of SBD device is {self.SBD_DEVICE_MAX}") for dev in 
dev_list: if not utils.is_block_device(dev): raise ValueError("{} doesn't look like a block device".format(dev)) @@ -402,26 +435,32 @@ def _initialize_sbd(self): For diskless-sbd, set sbd_watchdog_timeout then return; For disk-based-sbd, also calculate the msgwait value, then initialize the SBD device. """ - msg = "" if self.diskless_sbd: - msg = "Configuring diskless SBD" - elif not all(self.no_overwrite_map.values()): - msg = "Initializing SBD" - if msg: - logger.info(msg) + logger.info("Configuring diskless SBD") self.timeout_inst = SBDTimeout(self._context) self.timeout_inst.initialize_timeout() if self.diskless_sbd: return - opt = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout) + opt_str = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout) + device_list = [ + dev for dev in self._sbd_devices + if dev not in self.no_overwrite_map + or not self.no_overwrite_map[dev] + ] + SBDManager.initialize_sbd_device(device_list, opt_str) - for dev in self._sbd_devices: - if dev in self.no_overwrite_map and self.no_overwrite_map[dev]: - continue - rc, _, err = bootstrap.invoke("sbd {} -d {} create".format(opt, dev)) - if not rc: - utils.fatal("Failed to initialize SBD device {}: {}".format(dev, err)) + @staticmethod + def initialize_sbd_device(device_list: typing.List[str], opt_str: str) -> None: + """ + Initialize sbd device with options + """ + shell = sh.cluster_shell() + for dev in device_list: + logger.info("Initializing SBD device %s", dev) + cmd = f"sbd {opt_str} -d {dev} create" + logger.debug("Running command: %s", cmd) + shell.get_stdout_or_raise_error(cmd) def _update_sbd_configuration(self): """ @@ -438,8 +477,7 @@ def _update_sbd_configuration(self): } if self._sbd_devices: sbd_config_dict["SBD_DEVICE"] = ';'.join(self._sbd_devices) - utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict) - bootstrap.sync_file(SYSCONFIG_SBD) + SBDManager.update_configuration(sbd_config_dict) def 
_get_sbd_device_from_config(self): """ @@ -568,16 +606,16 @@ def join_sbd(self, remote_user, peer_host): bootstrap.invoke("systemctl enable sbd.service") @classmethod - def verify_sbd_device(cls): + def verify_sbd_device(cls, device_list=[], compare_node_list=[]): """ This classmethod is for verifying sbd device on a running cluster Raise ValueError for exceptions """ inst = cls(bootstrap.Context()) - dev_list = inst._get_sbd_device_from_config() + dev_list = device_list or inst._get_sbd_device_from_config() if not dev_list: raise ValueError("No sbd device configured") - inst._verify_sbd_device(dev_list, utils.list_cluster_nodes_except_me()) + inst._verify_sbd_device(dev_list, compare_node_list) @classmethod def get_sbd_device_from_config(cls): @@ -599,10 +637,12 @@ def is_using_diskless_sbd(cls): return False @staticmethod - def update_configuration(sbd_config_dict): + def update_configuration(sbd_config_dict: typing.Dict[str, str]) -> None: """ Update and sync sbd configuration """ + for key, value in sbd_config_dict.items(): + logger.info("Update %s in %s: %s", key, SYSCONFIG_SBD, value) utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict) bootstrap.sync_file(SYSCONFIG_SBD) @@ -630,5 +670,40 @@ def clean_up_existing_sbd_resource(): sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(SBDManager.SBD_RA) if xmlutil.CrmMonXmlParser().is_resource_started(SBDManager.SBD_RA): for sbd_id in sbd_id_list: + logger.info("Stop sbd resource '%s'(%s)", sbd_id, SBDManager.SBD_RA) utils.ext_cmd("crm resource stop {}".format(sbd_id)) + logger.info("Remove sbd resource '%s'", ';' .join(sbd_id_list)) utils.ext_cmd("crm configure delete {}".format(' '.join(sbd_id_list))) + + +def enable_sbd_on_cluster(): + cluster_nodes = utils.list_cluster_nodes() + service_manager = ServiceManager() + for node in cluster_nodes: + if not service_manager.service_is_enabled("sbd.service", node): + logger.info("Enable sbd.service on node %s", node) + 
service_manager.enable_service("sbd.service", node) + + +def disable_sbd_from_cluster(): + ''' + Disable SBD from cluster, the process includes: + - stop and remove sbd agent + - disable sbd.service + - adjust cluster attributes + - adjust related timeout values + ''' + clean_up_existing_sbd_resource() + + cluster_nodes = utils.list_cluster_nodes() + service_manager = ServiceManager() + for node in cluster_nodes: + if service_manager.service_is_enabled("sbd.service", node): + logger.info("Disable sbd.service on node %s", node) + service_manager.disable_service("sbd.service", node) + + out = sh.cluster_shell().get_stdout_or_raise_error("stonith_admin -L") + res = re.search("([0-9]+) fence device[s]* found", out) + # after disable sbd.service, check if sbd is the last stonith device + if res and int(res.group(1)) <= 1: + utils.cleanup_stonith_related_properties() diff --git a/crmsh/ui_root.py b/crmsh/ui_root.py index 12d0f2e1ff..19dd5bd109 100644 --- a/crmsh/ui_root.py +++ b/crmsh/ui_root.py @@ -33,6 +33,7 @@ from . import ui_resource from . import ui_script from . import ui_site +from . 
import ui_sbd class Root(command.UI): @@ -150,6 +151,10 @@ def do_report(self, context, *args): def do_resource(self): pass + @command.level(ui_sbd.SBD) + def do_sbd(self): + pass + @command.level(ui_script.Script) @command.help('''Cluster scripts Cluster scripts can perform cluster-wide configuration, diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py new file mode 100644 index 0000000000..34107cb0fa --- /dev/null +++ b/crmsh/ui_sbd.py @@ -0,0 +1,452 @@ +import logging +import typing +import re + +from crmsh import sbd +from crmsh import watchdog +from crmsh import command +from crmsh import utils +from crmsh import bootstrap +from crmsh import completers +from crmsh import sh +from crmsh import xmlutil +from crmsh.service_manager import ServiceManager +from crmsh.bootstrap import SYSCONFIG_SBD + + +logger = logging.getLogger(__name__) + + +def sbd_devices_completer(completed_list: typing.List[str]) -> typing.List[str]: + ''' + completion for sbd devices + ''' + if not ServiceManager().service_is_active("sbd.service"): + return [] + dev_list = sbd.SBDManager.get_sbd_device_from_config() + if dev_list: + return [dev for dev in dev_list if dev not in completed_list] + return [] + + +def sbd_configure_completer(completed_list: typing.List[str]) -> typing.List[str]: + ''' + completion for sbd configure command + ''' + service_manager = ServiceManager() + if not service_manager.service_is_active("pacemaker.service"): + return [] + sbd_service_is_enabled = service_manager.service_is_enabled("sbd.service") + dev_list = sbd.SBDManager.get_sbd_device_from_config() + # Show disk-based sbd configure options + # if there are devices in config or sbd.service is not enabled + is_diskbased = bool(dev_list) or not sbd_service_is_enabled + + parameters_pool = [] + if completed_list[1] == '': + parameters_pool = ["show"] + elif completed_list[1] == "show": + if len(completed_list) == 3: + show_types = SBD.SHOW_TYPES if is_diskbased else SBD.DISKLESS_SHOW_TYPES + return [t for t in 
show_types if t not in completed_list] + else: + return [] + if completed_list[-1] == "device=": + return [] + + timeout_types = SBD.TIMEOUT_TYPES if is_diskbased else SBD.DISKLESS_TIMEOUT_TYPES + parameters_pool.extend([f"{t}-timeout=" for t in timeout_types]) + parameters_pool.append("watchdog-device=") + parameters_pool = [ + p + for p in parameters_pool + if not any(c.startswith(p) for c in completed_list) + ] + + if is_diskbased: + dev_count = sum(1 for c in completed_list if c.startswith("device=")) + if dev_count < sbd.SBDManager.SBD_DEVICE_MAX: + parameters_pool.append("device=") + + return parameters_pool + + +class SBD(command.UI): + ''' + Class for sbd sub-level + + Includes commands: + - sbd configure + - sbd remove + - sbd status + ''' + name = "sbd" + TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait") + DISKLESS_TIMEOUT_TYPES = ("watchdog",) + SHOW_TYPES = ("disk_metadata", "sysconfig", "property") + DISKLESS_SHOW_TYPES = ("sysconfig", "property") + SYNCED_INFO = f"Already synced {SYSCONFIG_SBD} to all nodes" + RESTART_INFO = "Requires to restart cluster service to take effect" + PCMK_ATTRS = ( + "have-watchdog", + "stonith-timeout", + "stonith-watchdog-timeout", + "stonith-enabled", + "priority-fencing-delay", + "pcmk_delay_max" + ) + PARSE_RE = re.compile( + # Match "device" key with any value, including empty + r'(device)=("[^"]*"|[\w/\d;]*)' + # Match other keys with non-empty values, capturing possible suffix + r'|(\w+)(?:-(\w+))?=("[^"]+"|[\w/\d;]+)' + # Match standalone device path + r'|(/dev/[\w\d]+)' + ) + + class SyntaxError(Exception): + pass + + def __init__(self): + command.UI.__init__(self) + + self.device_list_from_config = sbd.SBDManager.get_sbd_device_from_config() + self.device_meta_dict_runtime = {} + if self.device_list_from_config: + self.device_meta_dict_runtime = sbd.SBDTimeout.get_sbd_device_metadata(self.device_list_from_config[0], timeout_only=True) + else: + try: + self.watchdog_timeout_from_config = 
sbd.SBDTimeout.get_sbd_watchdog_timeout() + except: + self.watchdog_timeout_from_config = None + self.watchdog_device_from_config = watchdog.Watchdog.get_watchdog_device_from_sbd_config() + + self.service_manager = ServiceManager() + self.cluster_shell = sh.cluster_shell() + self.cluster_nodes = utils.list_cluster_nodes() + + def _pre_check(self, need_sbd_service=False) -> bool: + if not self.service_manager.service_is_active("pacemaker.service"): + logger.error("pacemaker.service is not active") + return False + if not utils.package_is_installed("sbd"): + logger.error("sbd is not installed") + return False + if need_sbd_service and not self.service_manager.service_is_active("sbd.service"): + logger.error("sbd.service is not active") + return False + return True + + @property + def configure_usage(self) -> str: + ''' + Build usage string for sbd configure command, + including disk-based and diskless sbd cases + ''' + timeout_types = self.TIMEOUT_TYPES if self.device_list_from_config else self.DISKLESS_TIMEOUT_TYPES + timeout_usage_str = " ".join([f"[{t}-timeout=]" for t in timeout_types]) + show_types = self.SHOW_TYPES if self.device_list_from_config else self.DISKLESS_SHOW_TYPES + show_usage_str = f"[{'|'.join(show_types)}]" + return ("Usage:\n" + f"crm sbd configure show {show_usage_str}\n" + f"crm sbd configure [device=]... 
[watchdog-device=] {timeout_usage_str}\n") + + @staticmethod + def _show_sysconfig() -> None: + with open(SYSCONFIG_SBD) as f: + content_list = [line.strip() for line in f.readlines() + if not line.startswith("#") + and line.strip()] + for line in content_list: + print(line) + + def _show_disk_metadata(self) -> None: + for dev in self.device_list_from_config: + print(self.cluster_shell.get_stdout_or_raise_error(f"sbd -d {dev} dump")) + print() + + def _show_property(self) -> None: + out = self.cluster_shell.get_stdout_or_raise_error("crm configure show") + regex = f"({'|'.join(self.PCMK_ATTRS)})=([^\s]+)" + matches = re.findall(regex, out) + for match in matches: + print(f"{match[0]}={match[1]}") + systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout() + print(f"TimeoutStartUSec={systemd_start_timeout}") + + def _handle_show(self, args) -> bool: + if len(args) > 2: + raise self.SyntaxError("Invalid argument") + elif len(args) == 2: + match args[1]: + case "disk_metadata": + self._show_disk_metadata() + case "sysconfig": + SBD._show_sysconfig() + case "property": + self._show_property() + case _: + raise self.SyntaxError(f"Unknown argument: {args[1]}") + else: + self._show_disk_metadata() + if self.device_list_from_config: + print() + SBD._show_sysconfig() + print() + self._show_property() + return True + + def _parse_args(self, args: typing.List[str]) -> dict[str, int|str|list[str]]: + """ + Parse arguments and verify them + + Possible arguments format like: + device="/dev/sdb5;/dev/sda6" + device="" watchdog-timeout=10 + /dev/sda5 watchdog-timeout=10 watchdog-device=/dev/watchdog + device=/dev/sdb5 device=/dev/sda6 watchdog-timeout=10 msgwait-timeout=20 + """ + parameter_dict = {"device-list": []} + + for arg in args: + match = self.PARSE_RE.match(arg) + if not match: + raise self.SyntaxError(f"Invalid argument: {arg}") + device_key, device_value, key, suffix, value, device_path = match.groups() + + # device= parameter + if device_key: + if 
device_value: + parameter_dict.setdefault("device-list", []).extend(device_value.split(";")) + # explicitly set empty value, stands for diskless sbd + elif not parameter_dict.get("device-list"): + parameter_dict.pop("device-list", None) + # standalone device parameter + elif device_path: + parameter_dict.setdefault("device-list", []).append(device_path) + # timeout related parameters + elif key in self.TIMEOUT_TYPES and suffix and suffix == "timeout": + if not value.isdigit(): + raise self.SyntaxError(f"Invalid timeout value: {value}") + parameter_dict[key] = int(value) + # watchdog device parameter + elif key == "watchdog" and suffix == "device": + parameter_dict["watchdog-device"] = value + else: + raise self.SyntaxError(f"Unknown argument: {arg}") + + # disk-based sbd case, need to verify device list + if "device-list" in parameter_dict: + device_list = parameter_dict["device-list"] + if device_list: + if len(device_list) > len(set(device_list)): + raise self.SyntaxError("Duplicate device") + sbd.SBDManager.verify_sbd_device(list(set(device_list)-set(self.device_list_from_config))) + if len(set(device_list)|set(self.device_list_from_config)) > sbd.SBDManager.SBD_DEVICE_MAX: + raise self.SyntaxError(f"Exceed max device number: {sbd.SBDManager.SBD_DEVICE_MAX}") + # no device specified and no device in sysconfig + elif not self.device_list_from_config: + raise self.SyntaxError("No device specified") + + watchdog_device = parameter_dict.get("watchdog-device") + parameter_dict["watchdog-device"] = watchdog.Watchdog.get_watchdog_device(watchdog_device) + + logger.debug("Parsed arguments: %s", parameter_dict) + return parameter_dict + + def _has_specified_timeout(self, timeout_dict: dict) -> bool: + return timeout_dict and timeout_dict != self.device_meta_dict_runtime + + @staticmethod + def _check_and_adjust_timeout(timeout_dict: typing.Dict[str, int]) -> typing.Dict[str, int]: + watchdog_timeout = timeout_dict.get("watchdog") + if not watchdog_timeout: + 
watchdog_timeout, _ = sbd.SBDTimeout.get_advised_sbd_timeout() + logger.info("No watchdog-timeout specified, use advised value: %d", watchdog_timeout) + timeout_dict["watchdog"] = watchdog_timeout + + msgwait_timeout = timeout_dict.get("msgwait") + if not msgwait_timeout: + msgwait_timeout = 2*watchdog_timeout + logger.info("No msgwait-timeout specified, use 2*watchdog-timeout: %d", msgwait_timeout) + timeout_dict["msgwait"] = msgwait_timeout + + if msgwait_timeout < 2*watchdog_timeout: + logger.warning("It's recommended that msgwait-timeout(now:%d) should be at least 2*watchdog-timeout(now:%d)", + msgwait_timeout, watchdog_timeout) + + return timeout_dict + + def _configure_diskbase(self, parameter_dict: dict): + ''' + ''' + device_list = parameter_dict.get("device-list", []) + all_device_list = list(dict.fromkeys(self.device_list_from_config + device_list)) + new_device_list = list(set(device_list) - set(self.device_list_from_config)) + timeout_dict = {k: v for k, v in parameter_dict.items() if k in self.TIMEOUT_TYPES} + + device_list_to_init = [] + # initialize new devices if timeout parameters are not specified + # or it is a subset of runtime metadata + if not timeout_dict or utils.is_subdict(timeout_dict, self.device_meta_dict_runtime): + device_list_to_init = new_device_list + # else initialize all devices + else: + device_list_to_init = all_device_list + # merge runtime metadata with new timeout parameters + timeout_dict = self.device_meta_dict_runtime | timeout_dict + timeout_dict = SBD._check_and_adjust_timeout(timeout_dict) + + timeout_opt_str = SBD._convert_meta_dict_to_str(timeout_dict) + sbd.SBDManager.initialize_sbd_device(device_list=device_list_to_init, opt_str=timeout_opt_str) + + update_dict = {} + if new_device_list: + update_dict = {"SBD_DEVICE": ";".join(all_device_list)} + watchdog_device = parameter_dict.get("watchdog-device") + if watchdog_device and watchdog_device != self.watchdog_device_from_config: + update_dict["SBD_WATCHDOG_DEV"] = 
watchdog_device + watchdog_timeout = parameter_dict.get("watchdog") + if watchdog_timeout and watchdog_timeout != self.device_meta_dict_runtime.get("watchdog"): + update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout) + if update_dict: + sbd.SBDManager.update_configuration(update_dict) + + msgwait_timeout = parameter_dict.get("msgwait") + if msgwait_timeout and msgwait_timeout != self.device_meta_dict_runtime.get("msgwait"): + sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration() + + sbd.enable_sbd_on_cluster() + if update_dict: + logger.info(self.SYNCED_INFO) + logger.info(self.RESTART_INFO) + + def _configure_diskless(self, parameter_dict: dict): + ''' + ''' + update_dict = {} + watchdog_timeout = parameter_dict.get("watchdog") + if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config: + update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout) + watchdog_device = parameter_dict.get("watchdog-device") + if watchdog_device and watchdog_device != self.watchdog_device_from_config: + update_dict["SBD_WATCHDOG_DEV"] = watchdog_device + if update_dict: + sbd.SBDManager.update_configuration(update_dict) + + if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config: + sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration() + if update_dict: + logger.info(self.SYNCED_INFO) + + @staticmethod + def _convert_meta_dict_to_str(meta_dict: dict) -> str: + timeout_option_mapping = { + "watchdog": "-1", + "allocate": "-2", + "loop": "-3", + "msgwait": "-4" + } + timeout_opt_list = [f"{timeout_option_mapping[k]} {v}" for k, v in meta_dict.items() + if k in timeout_option_mapping] + return ' '.join(timeout_opt_list) + + @command.completers_repeating(sbd_configure_completer) + def do_configure(self, context, *args) -> bool: + ''' + Implement sbd configure command + ''' + if not self._pre_check(): + return False + + try: + if not args: + raise self.SyntaxError("No argument") + + if args[0] == "show": + return 
self._handle_show(args) + parameter_dict = self._parse_args(args) + # disk-based sbd case + if "device-list" in parameter_dict: + return self._configure_diskbase(parameter_dict) + # diskless sbd case + else: + return self._configure_diskless(parameter_dict) + + except self.SyntaxError as e: + logger.error(str(e)) + print(self.configure_usage) + return False + + @command.completers_repeating(sbd_devices_completer) + def do_remove(self, context, *args) -> bool: + ''' + Implement sbd remove command + ''' + if not self._pre_check(need_sbd_service=True): + return False + + parameter_dict = self._parse_args(args) + dev_list = parameter_dict.get("device-list", []) + if dev_list: + if not self.device_list_from_config: + logger.error("No sbd device found in config") + return False + for dev in dev_list: + if dev not in self.device_list_from_config: + logger.error("Device %s is not in config", dev) + return False + changed_dev_list = set(self.device_list_from_config) - set(dev_list) + # remove part of devices from config + if changed_dev_list: + logger.info("Remove '%s' from %s", ";".join(dev_list), SYSCONFIG_SBD) + sbd.SBDManager.update_configuration({"SBD_DEVICE": ";".join(changed_dev_list)}) + logger.info(self.SYNCED_INFO) + # remove all devices, equivalent to stop sbd.service + else: + sbd.disable_sbd_from_cluster() + else: + sbd.disable_sbd_from_cluster() + + logger.info(self.RESTART_INFO) + return True + + def do_status(self, context) -> bool: + ''' + Implement sbd status command + ''' + if not self._pre_check(): + return False + + print("sbd.service status: (active|enabled|since)") + for node in self.cluster_nodes: + is_active = self.service_manager.service_is_active("sbd.service", node) + is_active_str = "YES" if is_active else "NO" + is_enabled = self.service_manager.service_is_enabled("sbd.service", node) + is_enabled_str = "YES" if is_enabled else "NO" + systemd_property = "ActiveEnterTimestamp" if is_active else "ActiveExitTimestamp" + since_str_prefix = "active 
since" if is_active else "disactive since" + systemctl_show_cmd = f"systemctl show sbd.service --property={systemd_property} --value" + since = self.cluster_shell.get_stdout_or_raise_error(systemctl_show_cmd, node) + print(f"{node}: {is_active_str:<4}|{is_enabled_str:<4}|{since_str_prefix} {since}") + print() + + print("watchdog info: (device|driver|kernel timeout)") + watchdog_sbd_re = "\[[0-9]+\] (/dev/.*)\nIdentity: Busy: .*sbd.*\nDriver: (.*)" + for node in self.cluster_nodes: + out = self.cluster_shell.get_stdout_or_raise_error("sbd query-watchdog", node) + res = re.search(watchdog_sbd_re, out) + if res: + device, driver = res.groups() + kernel_timeout = self.cluster_shell.get_stdout_or_raise_error("cat /proc/sys/kernel/watchdog_thresh", node) + print(f"{node}: {device}|{driver}|{kernel_timeout}") + else: + logger.error("Failed to get watchdog info from %s", node) + print() + + if xmlutil.CrmMonXmlParser().is_resource_configured(sbd.SBDManager.SBD_RA): + print("fence_sbd status: ") + sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(sbd.SBDManager.SBD_RA) + for sbd_id in sbd_id_list: + out = self.cluster_shell.get_stdout_or_raise_error(f"crm resource status {sbd_id}") + print(out) diff --git a/crmsh/utils.py b/crmsh/utils.py index 605bcacaa9..666cb8c740 100644 --- a/crmsh/utils.py +++ b/crmsh/utils.py @@ -2781,13 +2781,15 @@ def get_pcmk_delay_max(two_node_without_qdevice=False): return 0 -def get_property(name, property_type="crm_config", peer=None): +def get_property(name, property_type="crm_config", peer=None, get_default=True): """ Get cluster properties "property_type" can be crm_config|rsc_defaults|op_defaults + "get_default" is used to get the default value from cluster metadata, + when it is False, the property value will be got from cib """ - if property_type == "crm_config": + if property_type == "crm_config" and get_default: cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE) cmd = "CIB_file={} sudo --preserve-env=CIB_file 
crm configure get_property {}".format(cib_path, name) else: @@ -3137,4 +3139,19 @@ def time_value_with_unit(time_value): Check if the time value contains unit """ return re.search(r'^\d+[a-z]+$', time_value) is not None + + +def cleanup_stonith_related_properties(): + for p in ("stonith-watchdog-timeout", "stonith-timeout", "priority-fencing-delay"): + if get_property(p, get_default=False): + delete_property(p) + if get_property("stonith-enabled") == "true": + set_property("stonith-enabled", "false") + + +def is_subdict(sub_dict, main_dict): + """ + Check if sub_dict is a sub-dictionary of main_dict + """ + return all(item in main_dict.items() for item in sub_dict.items()) # vim:ts=4:sw=4:et: diff --git a/crmsh/watchdog.py b/crmsh/watchdog.py index 6d0d2cff44..00e0f60a5e 100644 --- a/crmsh/watchdog.py +++ b/crmsh/watchdog.py @@ -27,7 +27,7 @@ def watchdog_device_name(self): return self._watchdog_device_name @staticmethod - def _verify_watchdog_device(dev, ignore_error=False): + def verify_watchdog_device(dev, ignore_error=False): """ Use wdctl to verify watchdog device """ @@ -48,7 +48,7 @@ def _load_watchdog_driver(driver): invoke("systemctl restart systemd-modules-load") @staticmethod - def _get_watchdog_device_from_sbd_config(): + def get_watchdog_device_from_sbd_config(): """ Try to get watchdog device name from sbd config file """ @@ -81,7 +81,7 @@ def _get_device_through_driver(self, driver_name): Get watchdog device name which has driver_name """ for device, driver in self._watchdog_info_dict.items(): - if driver == driver_name and self._verify_watchdog_device(device): + if driver == driver_name and self.verify_watchdog_device(device): return device return None @@ -108,7 +108,7 @@ def _get_first_unused_device(self): Get first unused watchdog device name """ for dev in self._watchdog_info_dict: - if self._verify_watchdog_device(dev, ignore_error=True): + if self.verify_watchdog_device(dev, ignore_error=True): return dev return None @@ -120,8 +120,8 @@ def 
_set_input(self): 3. Set the self._input as softdog """ if not self._input: - dev = self._get_watchdog_device_from_sbd_config() - if dev and self._verify_watchdog_device(dev, ignore_error=True): + dev = self.get_watchdog_device_from_sbd_config() + if dev and self.verify_watchdog_device(dev, ignore_error=True): self._input = dev return first_unused = self._get_first_unused_device() @@ -131,7 +131,7 @@ def _valid_device(self, dev): """ Is an unused watchdog device """ - if dev in self._watchdog_info_dict and self._verify_watchdog_device(dev): + if dev in self._watchdog_info_dict and self.verify_watchdog_device(dev): return True return False @@ -142,7 +142,7 @@ def join_watchdog(self): """ self._set_watchdog_info() - res = self._get_watchdog_device_from_sbd_config() + res = self.get_watchdog_device_from_sbd_config() if not res: utils.fatal("Failed to get watchdog device from {}".format(SYSCONFIG_SBD)) self._input = res @@ -177,3 +177,9 @@ def init_watchdog(self): if res: self._watchdog_device_name = res return + + @classmethod + def get_watchdog_device(cls, dev_or_driver=None): + w = cls(_input=dev_or_driver) + w.init_watchdog() + return w.watchdog_device_name