Skip to content

Commit

Permalink
Dev: ui_sbd: Add new 'crm sbd' sublevel (jsc#PED-8256)
Browse files Browse the repository at this point in the history
** Motivation
The main configurations for sbd use cases are scattered among sysconfig,
on-disk metadata and the CIB, and may even involve other OS components,
e.g. coredump, SCSI, multipath.

It's desirable to reduce the management complexity among them and to
streamline the workflow for the main use case scenarios.

** Changes include
**** Disk-based SBD scenarios
1. Show usage when syntax error
2. Completion
3. Display SBD related configuration (UC4 in PED-8256)
4. Change the on-disk meta data of the existing sbd disks (UC2.1 in
   PED-8256)
5. Add a sbd disk with the existing sbd configuration (UC2.2 in
   PED-8256)
6. Remove a sbd disk (UC2.3 in PED-8256)
7. Remove sbd from cluster
8. Replace the storage for a sbd disk (UC2.4 in PED-8256)
9. Display status (focusing on the runtime information only) (UC5 in
   PED-8256)

**** Disk-less SBD scenarios
1. Show usage when syntax error (diskless)
2. Completion (diskless)
3. Display SBD related configuration (UC4 in PED-8256, diskless)
4. Manipulate the basic diskless sbd configuration (UC3.1 in PED-8256)
  • Loading branch information
liangxin1300 committed Jul 26, 2024
1 parent f638738 commit 23c16c0
Show file tree
Hide file tree
Showing 6 changed files with 603 additions and 43 deletions.
7 changes: 6 additions & 1 deletion crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2785,7 +2785,12 @@ def sync_file(path):
"""
Sync files between cluster nodes
"""
if _context.skip_csync2:
if _context:
skip_csync2 = _context.skip_csync2
else:
skip_csync2 = not ServiceManager().service_is_active(CSYNC2_SERVICE)

if skip_csync2:
utils.cluster_copy_file(path, nodes=_context.node_list_in_cluster, output=False)
else:
csync2_update(path)
Expand Down
139 changes: 107 additions & 32 deletions crmsh/sbd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import re
import typing
from . import utils, sh
from . import bootstrap
from .bootstrap import SYSCONFIG_SBD, SBD_SYSTEMD_DELAY_START_DIR
Expand Down Expand Up @@ -68,6 +69,17 @@ def _set_sbd_msgwait(self):
sbd_msgwait = sbd_msgwait_default
self.sbd_msgwait = sbd_msgwait

@classmethod
def get_advised_sbd_timeout(cls) -> typing.Tuple[int, int]:
    """
    Work out advised sbd timeouts from a freshly created bootstrap context

    A new bootstrap.Context is created, its profiles are loaded, and an
    instance of this class built on that context runs initialize_timeout().

    Return a tuple of (sbd_watchdog_timeout, sbd_msgwait) read from that instance
    """
    context = bootstrap.Context()
    context.load_profiles()
    advisor = cls(context)
    advisor.initialize_timeout()
    return (advisor.sbd_watchdog_timeout, advisor.sbd_msgwait)

def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self):
"""
When using diskless SBD with Qdevice, adjust value of sbd_watchdog_timeout
Expand All @@ -85,17 +97,34 @@ def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self):
logger.warning("sbd_watchdog_timeout is set to {} for qdevice, it was {}".format(self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE, self.sbd_watchdog_timeout))
self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE

@staticmethod
def get_sbd_device_metadata(dev, timeout_only=False) -> dict:
    """
    Extract metadata from sbd device header

    Runs "sbd -d <dev> dump" and parses the "UUID : <value>" line as well as
    every "Timeout (<name>) : <number>" line of its output.

    dev: the sbd device to dump
    timeout_only: when true, the "uuid" entry is left out of the result

    Return a dict which may contain "uuid" (str) and one integer entry per
    timeout name found in the dump. An empty dict is returned when the dump
    command fails.
    """
    try:
        out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev))
    except Exception:
        # Best effort: a device that cannot be dumped simply has no metadata.
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # are not swallowed the way a bare "except:" would swallow them.
        return {}

    sbd_info = {}
    # Each match fills either the uuid group or the two timeout groups;
    # the groups of the alternative that did not match are empty strings.
    pattern = r"UUID\s+:\s+(\S+)|Timeout\s+\((\w+)\)\s+:\s+(\d+)"
    for uuid, timeout_type, timeout_value in re.findall(pattern, out):
        if uuid and not timeout_only:
            sbd_info["uuid"] = uuid
        elif timeout_type and timeout_value:
            sbd_info[timeout_type] = int(timeout_value)
    return sbd_info

@staticmethod
def get_sbd_msgwait(dev):
"""
Get msgwait for sbd device
"""
out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev))
# Format like "Timeout (msgwait) : 30"
res = re.search("\(msgwait\)\s+:\s+(\d+)", out)
res = SBDTimeout.get_sbd_device_metadata(dev).get("msgwait")
if not res:
raise ValueError("Cannot get sbd msgwait for {}".format(dev))
return int(res.group(1))
raise ValueError(f"Cannot get sbd msgwait for {dev}")
return res

@staticmethod
def get_sbd_watchdog_timeout():
Expand Down Expand Up @@ -195,6 +224,12 @@ def is_sbd_delay_start():
res = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
return res and res != "no"

@staticmethod
def get_sbd_systemd_start_timeout() -> int:
    """
    Query the TimeoutStartUSec property of the sbd systemd unit

    The raw "systemctl show" output is converted by
    utils.get_systemd_timeout_start_in_sec and that result is returned.
    """
    raw_timeout = sh.cluster_shell().get_stdout_or_raise_error(
        "systemctl show -p TimeoutStartUSec sbd --value"
    )
    return utils.get_systemd_timeout_start_in_sec(raw_timeout)

def adjust_systemd_start_timeout(self):
"""
Adjust start timeout for sbd when set SBD_DELAY_START
Expand All @@ -203,9 +238,7 @@ def adjust_systemd_start_timeout(self):
if sbd_delay_start_value == "no":
return

cmd = "systemctl show -p TimeoutStartUSec sbd --value"
out = sh.cluster_shell().get_stdout_or_raise_error(cmd)
start_timeout = utils.get_systemd_timeout_start_in_sec(out)
start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()
if start_timeout > int(sbd_delay_start_value):
return

Expand Down Expand Up @@ -269,6 +302,7 @@ class SBDManager(object):
DISKLESS_CRM_CMD = "crm configure property stonith-enabled=true stonith-watchdog-timeout={} stonith-timeout={}"
SBD_RA = "stonith:fence_sbd"
SBD_RA_ID = "stonith-sbd"
SBD_DEVICE_MAX = 3

def __init__(self, context):
"""
Expand All @@ -292,11 +326,10 @@ def _get_device_uuid(dev, node=None):
"""
Get UUID for specific device and node
"""
out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev), node)
res = re.search("UUID\s*:\s*(.*)\n", out)
res = SBDTimeout.get_sbd_device_metadata(dev).get("uuid")
if not res:
raise ValueError("Cannot find sbd device UUID for {}".format(dev))
return res.group(1)
return res

def _compare_device_uuid(self, dev, node_list):
"""
Expand All @@ -314,8 +347,8 @@ def _verify_sbd_device(self, dev_list, compare_node_list=[]):
"""
Verify sbd device
"""
if len(dev_list) > 3:
raise ValueError("Maximum number of SBD device is 3")
if len(dev_list) > self.SBD_DEVICE_MAX:
raise ValueError(f"Maximum number of SBD device is {self.SBD_DEVICE_MAX}")
for dev in dev_list:
if not utils.is_block_device(dev):
raise ValueError("{} doesn't look like a block device".format(dev))
Expand Down Expand Up @@ -402,26 +435,32 @@ def _initialize_sbd(self):
For diskless-sbd, set sbd_watchdog_timeout then return;
For disk-based-sbd, also calculate the msgwait value, then initialize the SBD device.
"""
msg = ""
if self.diskless_sbd:
msg = "Configuring diskless SBD"
elif not all(self.no_overwrite_map.values()):
msg = "Initializing SBD"
if msg:
logger.info(msg)
logger.info("Configuring diskless SBD")
self.timeout_inst = SBDTimeout(self._context)
self.timeout_inst.initialize_timeout()
if self.diskless_sbd:
return

opt = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout)
opt_str = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout)
device_list = [
dev for dev in self._sbd_devices
if dev not in self.no_overwrite_map
or not self.no_overwrite_map[dev]
]
SBDManager.initialize_sbd_device(device_list, opt_str)

for dev in self._sbd_devices:
if dev in self.no_overwrite_map and self.no_overwrite_map[dev]:
continue
rc, _, err = bootstrap.invoke("sbd {} -d {} create".format(opt, dev))
if not rc:
utils.fatal("Failed to initialize SBD device {}: {}".format(dev, err))
@staticmethod
def initialize_sbd_device(device_list: typing.List[str], opt_str: str) -> None:
    """
    Run "sbd <opt_str> -d <dev> create" for each device in device_list

    device_list: the sbd devices to create
    opt_str: option string placed between "sbd" and "-d" on the command line

    Failures are reported by get_stdout_or_raise_error of the cluster shell.
    """
    cluster_sh = sh.cluster_shell()
    for device in device_list:
        logger.info("Initializing SBD device %s", device)
        create_cmd = f"sbd {opt_str} -d {device} create"
        logger.debug("Running command: %s", create_cmd)
        cluster_sh.get_stdout_or_raise_error(create_cmd)

def _update_sbd_configuration(self):
"""
Expand All @@ -438,8 +477,7 @@ def _update_sbd_configuration(self):
}
if self._sbd_devices:
sbd_config_dict["SBD_DEVICE"] = ';'.join(self._sbd_devices)
utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
bootstrap.sync_file(SYSCONFIG_SBD)
SBDManager.update_configuration(sbd_config_dict)

def _get_sbd_device_from_config(self):
"""
Expand Down Expand Up @@ -568,16 +606,16 @@ def join_sbd(self, remote_user, peer_host):
bootstrap.invoke("systemctl enable sbd.service")

@classmethod
def verify_sbd_device(cls):
def verify_sbd_device(cls, device_list=[], compare_node_list=[]):
"""
This classmethod is for verifying sbd device on a running cluster
Raise ValueError for exceptions
"""
inst = cls(bootstrap.Context())
dev_list = inst._get_sbd_device_from_config()
dev_list = device_list or inst._get_sbd_device_from_config()
if not dev_list:
raise ValueError("No sbd device configured")
inst._verify_sbd_device(dev_list, utils.list_cluster_nodes_except_me())
inst._verify_sbd_device(dev_list, compare_node_list)

@classmethod
def get_sbd_device_from_config(cls):
Expand All @@ -599,10 +637,12 @@ def is_using_diskless_sbd(cls):
return False

@staticmethod
def update_configuration(sbd_config_dict):
def update_configuration(sbd_config_dict: typing.Dict[str, str]) -> None:
"""
Update and sync sbd configuration
"""
for key, value in sbd_config_dict.items():
logger.info("Update %s in %s: %s", key, SYSCONFIG_SBD, value)
utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
bootstrap.sync_file(SYSCONFIG_SBD)

Expand Down Expand Up @@ -630,5 +670,40 @@ def clean_up_existing_sbd_resource():
sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(SBDManager.SBD_RA)
if xmlutil.CrmMonXmlParser().is_resource_started(SBDManager.SBD_RA):
for sbd_id in sbd_id_list:
logger.info("Stop sbd resource '%s'(%s)", sbd_id, SBDManager.SBD_RA)
utils.ext_cmd("crm resource stop {}".format(sbd_id))
logger.info("Remove sbd resource '%s'", ';' .join(sbd_id_list))
utils.ext_cmd("crm configure delete {}".format(' '.join(sbd_id_list)))


def enable_sbd_on_cluster():
    """
    Enable sbd.service on every cluster node where it is not enabled yet
    """
    nodes = utils.list_cluster_nodes()
    svc_manager = ServiceManager()
    for node in nodes:
        # Nodes which already have the service enabled are left untouched
        if svc_manager.service_is_enabled("sbd.service", node):
            continue
        logger.info("Enable sbd.service on node %s", node)
        svc_manager.enable_service("sbd.service", node)


def disable_sbd_from_cluster():
    '''
    Disable SBD on the whole cluster, in this order:
    - call clean_up_existing_sbd_resource()
    - disable sbd.service on each cluster node where it is enabled
    - when "stonith_admin -L" reports at most one fence device,
      call utils.cleanup_stonith_related_properties()
    '''
    clean_up_existing_sbd_resource()

    nodes = utils.list_cluster_nodes()
    svc_manager = ServiceManager()
    for node in nodes:
        if not svc_manager.service_is_enabled("sbd.service", node):
            continue
        logger.info("Disable sbd.service on node %s", node)
        svc_manager.disable_service("sbd.service", node)

    fence_report = sh.cluster_shell().get_stdout_or_raise_error("stonith_admin -L")
    found = re.search("([0-9]+) fence device[s]* found", fence_report)
    if not found:
        return
    # With sbd.service disabled, a count above one means other stonith
    # devices remain, so the stonith related properties are kept
    if int(found.group(1)) > 1:
        return
    utils.cleanup_stonith_related_properties()
5 changes: 5 additions & 0 deletions crmsh/ui_root.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from . import ui_resource
from . import ui_script
from . import ui_site
from . import ui_sbd


class Root(command.UI):
Expand Down Expand Up @@ -150,6 +151,10 @@ def do_report(self, context, *args):
def do_resource(self):
pass

@command.level(ui_sbd.SBD)
def do_sbd(self):
    # Intentionally empty: ui_sbd.SBD is handed to the command.level
    # decorator; presumably that class implements the 'sbd' sublevel
    # commands (confirm in ui_sbd and command.level).
    pass

@command.level(ui_script.Script)
@command.help('''Cluster scripts
Cluster scripts can perform cluster-wide configuration,
Expand Down
Loading

0 comments on commit 23c16c0

Please sign in to comment.