diff --git a/crmsh/bootstrap.py b/crmsh/bootstrap.py
index f68ffaa9e..40d227a0d 100644
--- a/crmsh/bootstrap.py
+++ b/crmsh/bootstrap.py
@@ -2785,7 +2785,15 @@ def sync_file(path):
     """
     Sync files between cluster nodes
     """
-    if _context.skip_csync2:
-        utils.cluster_copy_file(path, nodes=_context.node_list_in_cluster, output=False)
+    if _context:
+        skip_csync2 = _context.skip_csync2
+        node_list = _context.node_list_in_cluster
+    else:
+        # No bootstrap context (e.g. invoked via "crm sbd"): probe the running cluster
+        skip_csync2 = not ServiceManager().service_is_active(CSYNC2_SERVICE)
+        node_list = utils.list_cluster_nodes_except_me()
+
+    if skip_csync2:
+        utils.cluster_copy_file(path, nodes=node_list, output=False)
     else:
         csync2_update(path)
diff --git a/crmsh/sbd.py b/crmsh/sbd.py
index d7f569e68..cf25bc454 100644
--- a/crmsh/sbd.py
+++ b/crmsh/sbd.py
@@ -1,5 +1,6 @@
 import os
 import re
+import typing
 from . import utils, sh
 from . import bootstrap
 from .bootstrap import SYSCONFIG_SBD, SBD_SYSTEMD_DELAY_START_DIR
@@ -68,6 +69,17 @@ def _set_sbd_msgwait(self):
             sbd_msgwait = sbd_msgwait_default
         self.sbd_msgwait = sbd_msgwait
 
+    @classmethod
+    def get_advised_sbd_timeout(cls) -> typing.Tuple[int, int]:
+        """
+        Get suitable sbd_watchdog_timeout and sbd_msgwait
+        """
+        ctx = bootstrap.Context()
+        ctx.load_profiles()
+        time_inst = cls(ctx)
+        time_inst.initialize_timeout()
+        return time_inst.sbd_watchdog_timeout, time_inst.sbd_msgwait
+
     def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self):
         """
         When using diskless SBD with Qdevice, adjust value of sbd_watchdog_timeout
@@ -85,17 +97,34 @@ def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self):
             logger.warning("sbd_watchdog_timeout is set to {} for qdevice, it was {}".format(self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE, self.sbd_watchdog_timeout))
             self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
 
+    @staticmethod
+    def get_sbd_device_metadata(dev, timeout_only=False, remote=None) -> dict:
+        """
+        Extract metadata from sbd device header; run the dump on node "remote" when given
+        """
+        sbd_info = {}
+        try:
+            out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev), remote)
+        except Exception:
+            return sbd_info
+        pattern = r"UUID\s+:\s+(\S+)|Timeout\s+\((\w+)\)\s+:\s+(\d+)"
+        matches = re.findall(pattern, out)
+        for uuid, timeout_type, timeout_value in matches:
+            if uuid and not timeout_only:
+                sbd_info["uuid"] = uuid
+            elif timeout_type and timeout_value:
+                sbd_info[timeout_type] = int(timeout_value)
+        return sbd_info
+
     @staticmethod
     def get_sbd_msgwait(dev):
         """
         Get msgwait for sbd device
         """
-        out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev))
-        # Format like "Timeout (msgwait) : 30"
-        res = re.search("\(msgwait\)\s+:\s+(\d+)", out)
+        res = SBDTimeout.get_sbd_device_metadata(dev).get("msgwait")
         if not res:
-            raise ValueError("Cannot get sbd msgwait for {}".format(dev))
-        return int(res.group(1))
+            raise ValueError(f"Cannot get sbd msgwait for {dev}")
+        return res
 
     @staticmethod
     def get_sbd_watchdog_timeout():
@@ -195,6 +224,12 @@ def is_sbd_delay_start():
         res = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
         return res and res != "no"
 
+    @staticmethod
+    def get_sbd_systemd_start_timeout() -> int:
+        cmd = "systemctl show -p TimeoutStartUSec sbd --value"
+        out = sh.cluster_shell().get_stdout_or_raise_error(cmd)
+        return utils.get_systemd_timeout_start_in_sec(out)
+
     def adjust_systemd_start_timeout(self):
         """
         Adjust start timeout for sbd when set SBD_DELAY_START
@@ -203,9 +238,7 @@ def adjust_systemd_start_timeout(self):
         if sbd_delay_start_value == "no":
             return
 
-        cmd = "systemctl show -p TimeoutStartUSec sbd --value"
-        out = sh.cluster_shell().get_stdout_or_raise_error(cmd)
-        start_timeout = utils.get_systemd_timeout_start_in_sec(out)
+        start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()
         if start_timeout > int(sbd_delay_start_value):
             return
 
@@ -269,6 +302,7 @@ class SBDManager(object):
     DISKLESS_CRM_CMD = "crm configure property stonith-enabled=true stonith-watchdog-timeout={} stonith-timeout={}"
     SBD_RA = "stonith:fence_sbd"
     SBD_RA_ID = "stonith-sbd"
+    SBD_DEVICE_MAX = 3
 
     def __init__(self, context):
         """
@@ -292,11 +326,10 @@ def _get_device_uuid(dev, node=None):
         """
         Get UUID for specific device and node
         """
-        out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev), node)
-        res = re.search("UUID\s*:\s*(.*)\n", out)
+        res = SBDTimeout.get_sbd_device_metadata(dev, remote=node).get("uuid")
         if not res:
             raise ValueError("Cannot find sbd device UUID for {}".format(dev))
-        return res.group(1)
+        return res
 
     def _compare_device_uuid(self, dev, node_list):
         """
@@ -314,8 +347,8 @@ def _verify_sbd_device(self, dev_list, compare_node_list=[]):
         """
         Verify sbd device
         """
-        if len(dev_list) > 3:
-            raise ValueError("Maximum number of SBD device is 3")
+        if len(dev_list) > self.SBD_DEVICE_MAX:
+            raise ValueError(f"Maximum number of SBD device is {self.SBD_DEVICE_MAX}")
         for dev in dev_list:
             if not utils.is_block_device(dev):
                 raise ValueError("{} doesn't look like a block device".format(dev))
@@ -402,26 +435,32 @@ def _initialize_sbd(self):
         For diskless-sbd, set sbd_watchdog_timeout then return;
         For disk-based-sbd, also calculate the msgwait value, then initialize the SBD device.
         """
-        msg = ""
         if self.diskless_sbd:
-            msg = "Configuring diskless SBD"
-        elif not all(self.no_overwrite_map.values()):
-            msg = "Initializing SBD"
-        if msg:
-            logger.info(msg)
+            logger.info("Configuring diskless SBD")
         self.timeout_inst = SBDTimeout(self._context)
         self.timeout_inst.initialize_timeout()
         if self.diskless_sbd:
             return
 
-        opt = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout)
+        opt_str = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout)
+        device_list = [
+            dev for dev in self._sbd_devices
+            if dev not in self.no_overwrite_map
+            or not self.no_overwrite_map[dev]
+        ]
+        SBDManager.initialize_sbd_device(device_list, opt_str)
 
-        for dev in self._sbd_devices:
-            if dev in self.no_overwrite_map and self.no_overwrite_map[dev]:
-                continue
-            rc, _, err = bootstrap.invoke("sbd {} -d {} create".format(opt, dev))
-            if not rc:
-                utils.fatal("Failed to initialize SBD device {}: {}".format(dev, err))
+    @staticmethod
+    def initialize_sbd_device(device_list: typing.List[str], opt_str: str) -> None:
+        """
+        Initialize sbd device with options
+        """
+        shell = sh.cluster_shell()
+        for dev in device_list:
+            logger.info("Initializing SBD device %s", dev)
+            cmd = f"sbd {opt_str} -d {dev} create"
+            logger.debug("Running command: %s", cmd)
+            shell.get_stdout_or_raise_error(cmd)
 
     def _update_sbd_configuration(self):
         """
@@ -438,8 +477,7 @@ def _update_sbd_configuration(self):
         }
         if self._sbd_devices:
             sbd_config_dict["SBD_DEVICE"] = ';'.join(self._sbd_devices)
-        utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
-        bootstrap.sync_file(SYSCONFIG_SBD)
+        SBDManager.update_configuration(sbd_config_dict)
 
     def _get_sbd_device_from_config(self):
         """
@@ -568,16 +606,16 @@ def join_sbd(self, remote_user, peer_host):
         bootstrap.invoke("systemctl enable sbd.service")
 
     @classmethod
-    def verify_sbd_device(cls):
+    def verify_sbd_device(cls, device_list=None, compare_node_list=None):
         """
         This classmethod is for verifying sbd device on a running cluster
         Raise ValueError for exceptions
         """
         inst = cls(bootstrap.Context())
-        dev_list = inst._get_sbd_device_from_config()
+        dev_list = device_list or inst._get_sbd_device_from_config()
         if not dev_list:
             raise ValueError("No sbd device configured")
-        inst._verify_sbd_device(dev_list, utils.list_cluster_nodes_except_me())
+        inst._verify_sbd_device(dev_list, compare_node_list or [])
 
     @classmethod
     def get_sbd_device_from_config(cls):
@@ -599,10 +637,12 @@ def is_using_diskless_sbd(cls):
         return False
 
     @staticmethod
-    def update_configuration(sbd_config_dict):
+    def update_configuration(sbd_config_dict: typing.Dict[str, str]) -> None:
         """
         Update and sync sbd configuration
         """
+        for key, value in sbd_config_dict.items():
+            logger.info("Update %s in %s: %s", key, SYSCONFIG_SBD, value)
         utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
         bootstrap.sync_file(SYSCONFIG_SBD)
 
@@ -630,5 +670,40 @@ def clean_up_existing_sbd_resource():
         sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(SBDManager.SBD_RA)
         if xmlutil.CrmMonXmlParser().is_resource_started(SBDManager.SBD_RA):
             for sbd_id in sbd_id_list:
+                logger.info("Stop sbd resource '%s'(%s)", sbd_id, SBDManager.SBD_RA)
                 utils.ext_cmd("crm resource stop {}".format(sbd_id))
+        logger.info("Remove sbd resource '%s'", ';'.join(sbd_id_list))
         utils.ext_cmd("crm configure delete {}".format(' '.join(sbd_id_list)))
+
+
+def enable_sbd_on_cluster():
+    cluster_nodes = utils.list_cluster_nodes()
+    service_manager = ServiceManager()
+    for node in cluster_nodes:
+        if not service_manager.service_is_enabled("sbd.service", node):
+            logger.info("Enable sbd.service on node %s", node)
+            service_manager.enable_service("sbd.service", node)
+
+
+def disable_sbd_from_cluster():
+    '''
+    Disable SBD from cluster, the process includes:
+    - stop and remove sbd agent
+    - disable sbd.service
+    - adjust cluster attributes
+    - adjust related timeout values
+    '''
+    clean_up_existing_sbd_resource()
+
+    cluster_nodes = utils.list_cluster_nodes()
+    service_manager = ServiceManager()
+    for node in cluster_nodes:
+        if service_manager.service_is_enabled("sbd.service", node):
+            logger.info("Disable sbd.service on node %s", node)
+            service_manager.disable_service("sbd.service", node)
+
+    out = sh.cluster_shell().get_stdout_or_raise_error("stonith_admin -L")
+    res = re.search("([0-9]+) fence device[s]* found", out)
+    # after disable sbd.service, check if sbd is the last stonith device
+    if res and int(res.group(1)) <= 1:
+        utils.cleanup_stonith_related_properties()
diff --git a/crmsh/ui_root.py b/crmsh/ui_root.py
index 12d0f2e1f..19dd5bd10 100644
--- a/crmsh/ui_root.py
+++ b/crmsh/ui_root.py
@@ -33,6 +33,7 @@
 from . import ui_resource
 from . import ui_script
 from . import ui_site
+from . import ui_sbd
 
 
 class Root(command.UI):
@@ -150,6 +151,10 @@ def do_report(self, context, *args):
     def do_resource(self):
         pass
 
+    @command.level(ui_sbd.SBD)
+    def do_sbd(self):
+        pass
+
     @command.level(ui_script.Script)
     @command.help('''Cluster scripts
 Cluster scripts can perform cluster-wide configuration,
diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py
new file mode 100644
index 000000000..34107cb0f
--- /dev/null
+++ b/crmsh/ui_sbd.py
@@ -0,0 +1,463 @@
+import logging
+import typing
+import re
+
+from crmsh import sbd
+from crmsh import watchdog
+from crmsh import command
+from crmsh import utils
+from crmsh import bootstrap
+from crmsh import completers
+from crmsh import sh
+from crmsh import xmlutil
+from crmsh.service_manager import ServiceManager
+from crmsh.bootstrap import SYSCONFIG_SBD
+
+
+logger = logging.getLogger(__name__)
+
+
+def sbd_devices_completer(completed_list: typing.List[str]) -> typing.List[str]:
+    '''
+    completion for sbd devices
+    '''
+    if not ServiceManager().service_is_active("sbd.service"):
+        return []
+    dev_list = sbd.SBDManager.get_sbd_device_from_config()
+    if dev_list:
+        return [dev for dev in dev_list if dev not in completed_list]
+    return []
+
+
+def sbd_configure_completer(completed_list: typing.List[str]) -> typing.List[str]:
+    '''
+    completion for sbd configure command
+    '''
+    service_manager = ServiceManager()
+    if not service_manager.service_is_active("pacemaker.service"):
+        return []
+    sbd_service_is_enabled = service_manager.service_is_enabled("sbd.service")
+    dev_list = sbd.SBDManager.get_sbd_device_from_config()
+    # Show disk-based sbd configure options
+    # if there are devices in config or sbd.service is not enabled
+    is_diskbased = bool(dev_list) or not sbd_service_is_enabled
+
+    parameters_pool = []
+    if completed_list[1] == '':
+        parameters_pool = ["show"]
+    elif completed_list[1] == "show":
+        if len(completed_list) == 3:
+            show_types = SBD.SHOW_TYPES if is_diskbased else SBD.DISKLESS_SHOW_TYPES
+            return [t for t in show_types if t not in completed_list]
+        else:
+            return []
+    if completed_list[-1] == "device=":
+        return []
+
+    timeout_types = SBD.TIMEOUT_TYPES if is_diskbased else SBD.DISKLESS_TIMEOUT_TYPES
+    parameters_pool.extend([f"{t}-timeout=" for t in timeout_types])
+    parameters_pool.append("watchdog-device=")
+    parameters_pool = [
+        p
+        for p in parameters_pool
+        if not any(c.startswith(p) for c in completed_list)
+    ]
+
+    if is_diskbased:
+        dev_count = sum(1 for c in completed_list if c.startswith("device="))
+        if dev_count < sbd.SBDManager.SBD_DEVICE_MAX:
+            parameters_pool.append("device=")
+
+    return parameters_pool
+
+
+class SBD(command.UI):
+    '''
+    Class for sbd sub-level
+
+    Includes commands:
+    - sbd configure
+    - sbd remove
+    - sbd status
+    '''
+    name = "sbd"
+    TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait")
+    DISKLESS_TIMEOUT_TYPES = ("watchdog",)
+    SHOW_TYPES = ("disk_metadata", "sysconfig", "property")
+    DISKLESS_SHOW_TYPES = ("sysconfig", "property")
+    SYNCED_INFO = f"Already synced {SYSCONFIG_SBD} to all nodes"
+    RESTART_INFO = "Requires to restart cluster service to take effect"
+    PCMK_ATTRS = (
+        "have-watchdog",
+        "stonith-timeout",
+        "stonith-watchdog-timeout",
+        "stonith-enabled",
+        "priority-fencing-delay",
+        "pcmk_delay_max"
+    )
+    PARSE_RE = re.compile(
+        # Match "device" key with any value, including empty
+        r'(device)=("[^"]*"|[\w/\d;]*)'
+        # Match other keys with non-empty values, capturing possible suffix
+        r'|(\w+)(?:-(\w+))?=("[^"]+"|[\w/\d;]+)'
+        # Match standalone device path
+        r'|(/dev/[\w\d]+)'
+    )
+
+    class SyntaxError(Exception):
+        pass
+
+    def __init__(self):
+        command.UI.__init__(self)
+
+        self.device_list_from_config = sbd.SBDManager.get_sbd_device_from_config()
+        self.device_meta_dict_runtime = {}
+        # always defined: _configure_diskless reads it even when devices are configured
+        self.watchdog_timeout_from_config = None
+        if self.device_list_from_config:
+            self.device_meta_dict_runtime = sbd.SBDTimeout.get_sbd_device_metadata(self.device_list_from_config[0], timeout_only=True)
+        else:
+            try:
+                self.watchdog_timeout_from_config = sbd.SBDTimeout.get_sbd_watchdog_timeout()
+            except Exception:
+                self.watchdog_timeout_from_config = None
+        self.watchdog_device_from_config = watchdog.Watchdog.get_watchdog_device_from_sbd_config()
+
+        self.service_manager = ServiceManager()
+        self.cluster_shell = sh.cluster_shell()
+        self.cluster_nodes = utils.list_cluster_nodes()
+
+    def _pre_check(self, need_sbd_service=False) -> bool:
+        if not self.service_manager.service_is_active("pacemaker.service"):
+            logger.error("pacemaker.service is not active")
+            return False
+        if not utils.package_is_installed("sbd"):
+            logger.error("sbd is not installed")
+            return False
+        if need_sbd_service and not self.service_manager.service_is_active("sbd.service"):
+            logger.error("sbd.service is not active")
+            return False
+        return True
+
+    @property
+    def configure_usage(self) -> str:
+        '''
+        Build usage string for sbd configure command,
+        including disk-based and diskless sbd cases
+        '''
+        timeout_types = self.TIMEOUT_TYPES if self.device_list_from_config else self.DISKLESS_TIMEOUT_TYPES
+        timeout_usage_str = " ".join([f"[{t}-timeout=]" for t in timeout_types])
+        show_types = self.SHOW_TYPES if self.device_list_from_config else self.DISKLESS_SHOW_TYPES
+        show_usage_str = f"[{'|'.join(show_types)}]"
+        return ("Usage:\n"
+                f"crm sbd configure show {show_usage_str}\n"
+                f"crm sbd configure [device=]... [watchdog-device=] {timeout_usage_str}\n")
+
+    @staticmethod
+    def _show_sysconfig() -> None:
+        with open(SYSCONFIG_SBD) as f:
+            content_list = [line.strip() for line in f.readlines()
+                            if not line.startswith("#")
+                            and line.strip()]
+        for line in content_list:
+            print(line)
+
+    def _show_disk_metadata(self) -> None:
+        for dev in self.device_list_from_config:
+            print(self.cluster_shell.get_stdout_or_raise_error(f"sbd -d {dev} dump"))
+            print()
+
+    def _show_property(self) -> None:
+        out = self.cluster_shell.get_stdout_or_raise_error("crm configure show")
+        regex = rf"({'|'.join(self.PCMK_ATTRS)})=([^\s]+)"
+        matches = re.findall(regex, out)
+        for match in matches:
+            print(f"{match[0]}={match[1]}")
+        systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout()
+        print(f"TimeoutStartUSec={systemd_start_timeout}")
+
+    def _handle_show(self, args) -> bool:
+        if len(args) > 2:
+            raise self.SyntaxError("Invalid argument")
+        elif len(args) == 2:
+            match args[1]:
+                case "disk_metadata":
+                    self._show_disk_metadata()
+                case "sysconfig":
+                    SBD._show_sysconfig()
+                case "property":
+                    self._show_property()
+                case _:
+                    raise self.SyntaxError(f"Unknown argument: {args[1]}")
+        else:
+            self._show_disk_metadata()
+            if self.device_list_from_config:
+                print()
+            SBD._show_sysconfig()
+            print()
+            self._show_property()
+        return True
+
+    def _parse_args(self, args: typing.List[str]) -> dict[str, int|str|list[str]]:
+        """
+        Parse arguments and verify them
+
+        Possible arguments format like:
+        device="/dev/sdb5;/dev/sda6"
+        device="" watchdog-timeout=10
+        /dev/sda5 watchdog-timeout=10 watchdog-device=/dev/watchdog
+        device=/dev/sdb5 device=/dev/sda6 watchdog-timeout=10 msgwait-timeout=20
+        """
+        parameter_dict = {"device-list": []}
+
+        for arg in args:
+            match = self.PARSE_RE.match(arg)
+            if not match:
+                raise self.SyntaxError(f"Invalid argument: {arg}")
+            device_key, device_value, key, suffix, value, device_path = match.groups()
+
+            # device= parameter
+            if device_key:
+                if device_value:
+                    parameter_dict.setdefault("device-list", []).extend(device_value.split(";"))
+                # explicitly set empty value, stands for diskless sbd
+                elif not parameter_dict.get("device-list"):
+                    parameter_dict.pop("device-list", None)
+            # standalone device parameter
+            elif device_path:
+                parameter_dict.setdefault("device-list", []).append(device_path)
+            # timeout related parameters
+            elif key in self.TIMEOUT_TYPES and suffix and suffix == "timeout":
+                if not value.isdigit():
+                    raise self.SyntaxError(f"Invalid timeout value: {value}")
+                parameter_dict[key] = int(value)
+            # watchdog device parameter
+            elif key == "watchdog" and suffix == "device":
+                parameter_dict["watchdog-device"] = value
+            else:
+                raise self.SyntaxError(f"Unknown argument: {arg}")
+
+        # disk-based sbd case, need to verify device list
+        if "device-list" in parameter_dict:
+            device_list = parameter_dict["device-list"]
+            if device_list:
+                if len(device_list) > len(set(device_list)):
+                    raise self.SyntaxError("Duplicate device")
+                sbd.SBDManager.verify_sbd_device(list(set(device_list)-set(self.device_list_from_config)))
+                if len(set(device_list)|set(self.device_list_from_config)) > sbd.SBDManager.SBD_DEVICE_MAX:
+                    raise self.SyntaxError(f"Exceed max device number: {sbd.SBDManager.SBD_DEVICE_MAX}")
+            # no device specified and no device in sysconfig
+            elif not self.device_list_from_config:
+                raise self.SyntaxError("No device specified")
+
+        watchdog_device = parameter_dict.get("watchdog-device")
+        parameter_dict["watchdog-device"] = watchdog.Watchdog.get_watchdog_device(watchdog_device)
+
+        logger.debug("Parsed arguments: %s", parameter_dict)
+        return parameter_dict
+
+    def _has_specified_timeout(self, timeout_dict: dict) -> bool:
+        return timeout_dict and timeout_dict != self.device_meta_dict_runtime
+
+    @staticmethod
+    def _check_and_adjust_timeout(timeout_dict: typing.Dict[str, int]) -> typing.Dict[str, int]:
+        watchdog_timeout = timeout_dict.get("watchdog")
+        if not watchdog_timeout:
+            watchdog_timeout, _ = sbd.SBDTimeout.get_advised_sbd_timeout()
+            logger.info("No watchdog-timeout specified, use advised value: %d", watchdog_timeout)
+            timeout_dict["watchdog"] = watchdog_timeout
+
+        msgwait_timeout = timeout_dict.get("msgwait")
+        if not msgwait_timeout:
+            msgwait_timeout = 2*watchdog_timeout
+            logger.info("No msgwait-timeout specified, use 2*watchdog-timeout: %d", msgwait_timeout)
+            timeout_dict["msgwait"] = msgwait_timeout
+
+        if msgwait_timeout < 2*watchdog_timeout:
+            logger.warning("It's recommended that msgwait-timeout(now:%d) should be at least 2*watchdog-timeout(now:%d)",
+                           msgwait_timeout, watchdog_timeout)
+
+        return timeout_dict
+
+    def _configure_diskbase(self, parameter_dict: dict):
+        '''
+        Initialize sbd devices as needed, then update and sync the sbd configuration
+        '''
+        device_list = parameter_dict.get("device-list", [])
+        all_device_list = list(dict.fromkeys(self.device_list_from_config + device_list))
+        # keep the order given on the command line, a set difference would shuffle it
+        new_device_list = [dev for dev in device_list if dev not in self.device_list_from_config]
+        timeout_dict = {k: v for k, v in parameter_dict.items() if k in self.TIMEOUT_TYPES}
+
+        device_list_to_init = []
+        # initialize new devices if timeout parameters are not specified
+        # or it is a subset of runtime metadata
+        if not timeout_dict or utils.is_subdict(timeout_dict, self.device_meta_dict_runtime):
+            device_list_to_init = new_device_list
+        # else initialize all devices
+        else:
+            device_list_to_init = all_device_list
+        # merge runtime metadata with new timeout parameters in both cases,
+        # so newly added devices get the same header timeouts as the existing ones
+        timeout_dict = self.device_meta_dict_runtime | timeout_dict
+        timeout_dict = SBD._check_and_adjust_timeout(timeout_dict)
+
+        timeout_opt_str = SBD._convert_meta_dict_to_str(timeout_dict)
+        sbd.SBDManager.initialize_sbd_device(device_list=device_list_to_init, opt_str=timeout_opt_str)
+
+        update_dict = {}
+        if new_device_list:
+            update_dict = {"SBD_DEVICE": ";".join(all_device_list)}
+        watchdog_device = parameter_dict.get("watchdog-device")
+        if watchdog_device and watchdog_device != self.watchdog_device_from_config:
+            update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
+        watchdog_timeout = parameter_dict.get("watchdog")
+        if watchdog_timeout and watchdog_timeout != self.device_meta_dict_runtime.get("watchdog"):
+            update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout)
+        if update_dict:
+            sbd.SBDManager.update_configuration(update_dict)
+
+        msgwait_timeout = parameter_dict.get("msgwait")
+        if msgwait_timeout and msgwait_timeout != self.device_meta_dict_runtime.get("msgwait"):
+            sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
+
+        sbd.enable_sbd_on_cluster()
+        if update_dict:
+            logger.info(self.SYNCED_INFO)
+        logger.info(self.RESTART_INFO)
+
+    def _configure_diskless(self, parameter_dict: dict):
+        '''
+        Update and sync watchdog settings for diskless sbd
+        '''
+        update_dict = {}
+        watchdog_timeout = parameter_dict.get("watchdog")
+        if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config:
+            update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout)
+        watchdog_device = parameter_dict.get("watchdog-device")
+        if watchdog_device and watchdog_device != self.watchdog_device_from_config:
+            update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
+        if update_dict:
+            sbd.SBDManager.update_configuration(update_dict)
+
+        if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config:
+            sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
+        if update_dict:
+            logger.info(self.SYNCED_INFO)
+
+    @staticmethod
+    def _convert_meta_dict_to_str(meta_dict: dict) -> str:
+        timeout_option_mapping = {
+            "watchdog": "-1",
+            "allocate": "-2",
+            "loop": "-3",
+            "msgwait": "-4"
+        }
+        timeout_opt_list = [f"{timeout_option_mapping[k]} {v}" for k, v in meta_dict.items()
+                            if k in timeout_option_mapping]
+        return ' '.join(timeout_opt_list)
+
+    @command.completers_repeating(sbd_configure_completer)
+    def do_configure(self, context, *args) -> bool:
+        '''
+        Implement sbd configure command
+        '''
+        if not self._pre_check():
+            return False
+
+        try:
+            if not args:
+                raise self.SyntaxError("No argument")
+
+            if args[0] == "show":
+                return self._handle_show(args)
+            parameter_dict = self._parse_args(args)
+            # disk-based sbd case
+            if "device-list" in parameter_dict:
+                return self._configure_diskbase(parameter_dict)
+            # diskless sbd case
+            else:
+                return self._configure_diskless(parameter_dict)
+
+        except self.SyntaxError as e:
+            logger.error(str(e))
+            print(self.configure_usage)
+            return False
+
+    @command.completers_repeating(sbd_devices_completer)
+    def do_remove(self, context, *args) -> bool:
+        '''
+        Implement sbd remove command
+        '''
+        if not self._pre_check(need_sbd_service=True):
+            return False
+
+        try:
+            # without arguments sbd is removed entirely, there is nothing to parse or verify
+            parameter_dict = self._parse_args(args) if args else {}
+        except self.SyntaxError as e:
+            logger.error(str(e))
+            return False
+        dev_list = parameter_dict.get("device-list", [])
+        if dev_list:
+            if not self.device_list_from_config:
+                logger.error("No sbd device found in config")
+                return False
+            for dev in dev_list:
+                if dev not in self.device_list_from_config:
+                    logger.error("Device %s is not in config", dev)
+                    return False
+            changed_dev_list = [dev for dev in self.device_list_from_config if dev not in dev_list]
+            # remove part of devices from config
+            if changed_dev_list:
+                logger.info("Remove '%s' from %s", ";".join(dev_list), SYSCONFIG_SBD)
+                sbd.SBDManager.update_configuration({"SBD_DEVICE": ";".join(changed_dev_list)})
+                logger.info(self.SYNCED_INFO)
+            # remove all devices, equivalent to stop sbd.service
+            else:
+                sbd.disable_sbd_from_cluster()
+        else:
+            sbd.disable_sbd_from_cluster()
+
+        logger.info(self.RESTART_INFO)
+        return True
+
+    def do_status(self, context) -> bool:
+        '''
+        Implement sbd status command
+        '''
+        if not self._pre_check():
+            return False
+
+        print("sbd.service status: (active|enabled|since)")
+        for node in self.cluster_nodes:
+            is_active = self.service_manager.service_is_active("sbd.service", node)
+            is_active_str = "YES" if is_active else "NO"
+            is_enabled = self.service_manager.service_is_enabled("sbd.service", node)
+            is_enabled_str = "YES" if is_enabled else "NO"
+            systemd_property = "ActiveEnterTimestamp" if is_active else "ActiveExitTimestamp"
+            since_str_prefix = "active since" if is_active else "inactive since"
+            systemctl_show_cmd = f"systemctl show sbd.service --property={systemd_property} --value"
+            since = self.cluster_shell.get_stdout_or_raise_error(systemctl_show_cmd, node)
+            print(f"{node}: {is_active_str:<4}|{is_enabled_str:<4}|{since_str_prefix} {since}")
+        print()
+
+        print("watchdog info: (device|driver|kernel timeout)")
+        watchdog_sbd_re = r"\[[0-9]+\] (/dev/.*)\nIdentity: Busy: .*sbd.*\nDriver: (.*)"
+        for node in self.cluster_nodes:
+            out = self.cluster_shell.get_stdout_or_raise_error("sbd query-watchdog", node)
+            res = re.search(watchdog_sbd_re, out)
+            if res:
+                device, driver = res.groups()
+                kernel_timeout = self.cluster_shell.get_stdout_or_raise_error("cat /proc/sys/kernel/watchdog_thresh", node)
+                print(f"{node}: {device}|{driver}|{kernel_timeout}")
+            else:
+                logger.error("Failed to get watchdog info from %s", node)
+        print()
+
+        if xmlutil.CrmMonXmlParser().is_resource_configured(sbd.SBDManager.SBD_RA):
+            print("fence_sbd status: ")
+            sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(sbd.SBDManager.SBD_RA)
+            for sbd_id in sbd_id_list:
+                out = self.cluster_shell.get_stdout_or_raise_error(f"crm resource status {sbd_id}")
+                print(out)
diff --git a/crmsh/utils.py b/crmsh/utils.py
index 605bcacaa..666cb8c74 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -2781,13 +2781,15 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
     return 0
 
 
-def get_property(name, property_type="crm_config", peer=None):
+def get_property(name, property_type="crm_config", peer=None, get_default=True):
     """
     Get cluster properties
 
     "property_type" can be crm_config|rsc_defaults|op_defaults
+    "get_default" is used to get the default value from cluster metadata,
+    when it is False, the property value will be got from cib
     """
-    if property_type == "crm_config":
+    if property_type == "crm_config" and get_default:
         cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
         cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
     else:
@@ -3137,4 +3139,19 @@ def time_value_with_unit(time_value):
     Check if the time value contains unit
     """
     return re.search(r'^\d+[a-z]+$', time_value) is not None
+
+
+def cleanup_stonith_related_properties():
+    for p in ("stonith-watchdog-timeout", "stonith-timeout", "priority-fencing-delay"):
+        if get_property(p, get_default=False):
+            delete_property(p)
+    if get_property("stonith-enabled") == "true":
+        set_property("stonith-enabled", "false")
+
+
+def is_subdict(sub_dict, main_dict):
+    """
+    Check if sub_dict is a sub-dictionary of main_dict
+    """
+    return all(item in main_dict.items() for item in sub_dict.items())
 # vim:ts=4:sw=4:et:
diff --git a/crmsh/watchdog.py b/crmsh/watchdog.py
index 6d0d2cff4..00e0f60a5 100644
--- a/crmsh/watchdog.py
+++ b/crmsh/watchdog.py
@@ -27,7 +27,7 @@ def watchdog_device_name(self):
         return self._watchdog_device_name
 
     @staticmethod
-    def _verify_watchdog_device(dev, ignore_error=False):
+    def verify_watchdog_device(dev, ignore_error=False):
         """
         Use wdctl to verify watchdog device
         """
@@ -48,7 +48,7 @@ def _load_watchdog_driver(driver):
         invoke("systemctl restart systemd-modules-load")
 
     @staticmethod
-    def _get_watchdog_device_from_sbd_config():
+    def get_watchdog_device_from_sbd_config():
         """
         Try to get watchdog device name from sbd config file
         """
@@ -81,7 +81,7 @@ def _get_device_through_driver(self, driver_name):
         Get watchdog device name which has driver_name
         """
         for device, driver in self._watchdog_info_dict.items():
-            if driver == driver_name and self._verify_watchdog_device(device):
+            if driver == driver_name and self.verify_watchdog_device(device):
                 return device
         return None
 
@@ -108,7 +108,7 @@ def _get_first_unused_device(self):
         Get first unused watchdog device name
         """
         for dev in self._watchdog_info_dict:
-            if self._verify_watchdog_device(dev, ignore_error=True):
+            if self.verify_watchdog_device(dev, ignore_error=True):
                 return dev
         return None
 
@@ -120,8 +120,8 @@ def _set_input(self):
         3. Set the self._input as softdog
         """
         if not self._input:
-            dev = self._get_watchdog_device_from_sbd_config()
-            if dev and self._verify_watchdog_device(dev, ignore_error=True):
+            dev = self.get_watchdog_device_from_sbd_config()
+            if dev and self.verify_watchdog_device(dev, ignore_error=True):
                 self._input = dev
                 return
             first_unused = self._get_first_unused_device()
@@ -131,7 +131,7 @@ def _valid_device(self, dev):
         """
         Is an unused watchdog device
         """
-        if dev in self._watchdog_info_dict and self._verify_watchdog_device(dev):
+        if dev in self._watchdog_info_dict and self.verify_watchdog_device(dev):
             return True
         return False
 
@@ -142,7 +142,7 @@ def join_watchdog(self):
         """
         self._set_watchdog_info()
 
-        res = self._get_watchdog_device_from_sbd_config()
+        res = self.get_watchdog_device_from_sbd_config()
         if not res:
             utils.fatal("Failed to get watchdog device from {}".format(SYSCONFIG_SBD))
         self._input = res
@@ -177,3 +177,9 @@ def init_watchdog(self):
         if res:
             self._watchdog_device_name = res
             return
+
+    @classmethod
+    def get_watchdog_device(cls, dev_or_driver=None):
+        w = cls(_input=dev_or_driver)
+        w.init_watchdog()
+        return w.watchdog_device_name