diff --git a/changelogs/fragments/11750-pacemaker-wait-race-condition.yml b/changelogs/fragments/11750-pacemaker-wait-race-condition.yml new file mode 100644 index 0000000000..117907adcb --- /dev/null +++ b/changelogs/fragments/11750-pacemaker-wait-race-condition.yml @@ -0,0 +1,2 @@ +bugfixes: + - pacemaker_resource, pacemaker_stonith - fix resource and stonith creation race condition by polling PCS status (https://github.com/ansible-collections/community.general/issues/11574, https://github.com/ansible-collections/community.general/pull/11750). diff --git a/plugins/module_utils/pacemaker.py b/plugins/module_utils/pacemaker.py index 01f675f9f6..32617623be 100644 --- a/plugins/module_utils/pacemaker.py +++ b/plugins/module_utils/pacemaker.py @@ -5,6 +5,7 @@ from __future__ import annotations import re +import time import typing as t from ansible_collections.community.general.plugins.module_utils.cmd_runner import CmdRunner, cmd_runner_fmt @@ -59,6 +60,22 @@ def get_pacemaker_maintenance_mode(runner: CmdRunner) -> bool: return bool(maintenance_mode_output) +def wait_for_resource(runner: CmdRunner, cli_noun: str, name: str, wait: int, sleep_interval: int = 5) -> None: + """Poll ``pcs <cli_noun> status <name>`` until the resource reports Started or the wait budget expires. + + Raises an exception if the resource does not reach the Started state within *wait* seconds. 
+ """ + deadline = time.monotonic() + wait + while True: + with runner("cli_action state name") as ctx: + rc, out, err = ctx.run(cli_action=cli_noun, state="status") + if out and "Started" in out: + return + if time.monotonic() >= deadline: + raise Exception(f"Timed out waiting {wait}s for {cli_noun} resource '{name}' to start") + time.sleep(sleep_interval) + + def pacemaker_runner(module: AnsibleModule, **kwargs) -> CmdRunner: runner_command = ["pcs"] runner = CmdRunner( diff --git a/plugins/modules/pacemaker_resource.py b/plugins/modules/pacemaker_resource.py index 42edcbf2d6..dc7c2a7d28 100644 --- a/plugins/modules/pacemaker_resource.py +++ b/plugins/modules/pacemaker_resource.py @@ -149,6 +149,7 @@ from ansible_collections.community.general.plugins.module_utils.module_helper im from ansible_collections.community.general.plugins.module_utils.pacemaker import ( get_pacemaker_maintenance_mode, pacemaker_runner, + wait_for_resource, ) @@ -237,7 +238,7 @@ class PacemakerResource(StateModuleHelper): def state_present(self): with self.runner( "cli_action state name resource_type resource_option resource_operation resource_meta resource_argument " - "resource_clone_ids resource_clone_meta wait", + "resource_clone_ids resource_clone_meta", output_process=self._process_command_output( not get_pacemaker_maintenance_mode(self.runner), "already exists" ), @@ -247,10 +248,12 @@ class PacemakerResource(StateModuleHelper): cli_action="resource", resource_clone_ids=self.fmt_as_stack_argument(self.module.params["resource_clone_ids"], "clone"), ) + if not self.module.check_mode and self.vars.wait and not get_pacemaker_maintenance_mode(self.runner): + wait_for_resource(self.runner, "resource", self.vars.name, self.vars.wait) def state_cloned(self): with self.runner( - "cli_action state name resource_clone_ids resource_clone_meta wait", + "cli_action state name resource_clone_ids resource_clone_meta", output_process=self._process_command_output( not 
get_pacemaker_maintenance_mode(self.runner), "already a clone resource" ), @@ -260,6 +263,8 @@ class PacemakerResource(StateModuleHelper): cli_action="resource", resource_clone_meta=self.fmt_as_stack_argument(self.module.params["resource_clone_meta"], "meta"), ) + if not self.module.check_mode and self.vars.wait and not get_pacemaker_maintenance_mode(self.runner): + wait_for_resource(self.runner, "resource", self.vars.name, self.vars.wait) def state_enabled(self): with self.runner( diff --git a/plugins/modules/pacemaker_stonith.py b/plugins/modules/pacemaker_stonith.py index d4eb229694..7e1d20fecd 100644 --- a/plugins/modules/pacemaker_stonith.py +++ b/plugins/modules/pacemaker_stonith.py @@ -125,7 +125,7 @@ value: """ from ansible_collections.community.general.plugins.module_utils.module_helper import StateModuleHelper -from ansible_collections.community.general.plugins.module_utils.pacemaker import pacemaker_runner +from ansible_collections.community.general.plugins.module_utils.pacemaker import pacemaker_runner, wait_for_resource class PacemakerStonith(StateModuleHelper): @@ -206,7 +206,7 @@ class PacemakerStonith(StateModuleHelper): def state_present(self): with self.runner( - "cli_action state name resource_type resource_option resource_operation resource_meta resource_argument agent_validation wait", + "cli_action state name resource_type resource_option resource_operation resource_meta resource_argument agent_validation", output_process=self._process_command_output(True, "already exists"), check_mode_skip=True, ) as ctx: @@ -218,6 +218,8 @@ class PacemakerStonith(StateModuleHelper): resource_meta=self.vars.stonith_metas, resource_argument=self.vars.stonith_argument, ) + if not self.module.check_mode and self.vars.wait: + wait_for_resource(self.runner, "stonith", self.vars.name, self.vars.wait) def state_enabled(self): with self.runner( diff --git a/tests/unit/plugins/modules/test_pacemaker_resource.py b/tests/unit/plugins/modules/test_pacemaker_resource.py 
index 1e2c7cdf9b..a0fe178298 100644 --- a/tests/unit/plugins/modules/test_pacemaker_resource.py +++ b/tests/unit/plugins/modules/test_pacemaker_resource.py @@ -10,8 +10,138 @@ from __future__ import annotations +import json + +import pytest + from ansible_collections.community.general.plugins.modules import pacemaker_resource from .uthelper import RunCommandMock, UTHelper UTHelper.from_module(pacemaker_resource, __name__, mocks=[RunCommandMock]) + + +# --------------------------------------------------------------------------- +# Race condition tests: resource starts after one or more Stopped polls +# --------------------------------------------------------------------------- + +NO_MAINTENANCE_OUT = ( + "Cluster Properties: cib-bootstrap-options\n" + "cluster-infrastructure=corosync\n" + "cluster-name=hacluster\n" + "dc-version=2.1.9-1.fc41-7188dbf\n" + "have-watchdog=false\n" +) + + +@pytest.fixture +def patch_bin(mocker): + def mockie(self_, path, *args, **kwargs): + return f"/testbin/{path}" + + mocker.patch("ansible.module_utils.basic.AnsibleModule.get_bin_path", mockie) + + +@pytest.mark.usefixtures("patch_bin") +def test_present_race_condition_stopped_then_started(mocker, capfd): + """Resource reports Stopped on the first poll then Started on the second — must succeed.""" + mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep") + + # Sequence of run_command calls: + # 1. initial _get(): resource status → not found (rc=1) + # 2. state_present: property config → no maintenance + # 3. state_present: resource create → rc=0 + # 4. post-create maintenance check → no maintenance + # 5. wait_for_resource poll 1: status → Stopped (not yet running) + # 6. wait_for_resource poll 2: status → Started + # 7. 
__quit_module__ _get(): status → Started + run_command_calls = [ + (1, "", "Error: resource or tag id 'virtual-ip' not found"), + (1, NO_MAINTENANCE_OUT, ""), + (0, "Assumed agent name 'ocf:heartbeat:IPaddr2'", ""), + (1, NO_MAINTENANCE_OUT, ""), + (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Stopped", ""), + (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started", ""), + (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started", ""), + ] + + def side_effect(self_, **kwargs): + return run_command_calls.pop(0) + + mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect) + + with pytest.raises(SystemExit): + with pytest.MonkeyPatch().context() as mp: + mp.setattr( + "ansible.module_utils.basic._ANSIBLE_ARGS", + json.dumps( + { + "ANSIBLE_MODULE_ARGS": { + "state": "present", + "name": "virtual-ip", + "resource_type": {"resource_name": "IPaddr2"}, + "resource_option": ["ip=192.168.2.1"], + "wait": 30, + } + } + ).encode(), + ) + mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False) + pacemaker_resource.main() + + out, _err = capfd.readouterr() + result = json.loads(out) + assert result["changed"] is True + assert result.get("failed") is not True + assert "Started" in result["value"] + + +@pytest.mark.usefixtures("patch_bin") +def test_present_wait_timeout_raises(mocker, capfd): + """Resource never starts within the wait window — must fail with a timeout message.""" + mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep") + + # Simulate time advancing past the deadline immediately on the first poll + monotonic_values = iter([0.0, 999.0]) + mocker.patch( + "ansible_collections.community.general.plugins.module_utils.pacemaker.time.monotonic", + side_effect=lambda: next(monotonic_values), + ) + + run_command_calls = [ + (1, "", "Error: resource or tag id 'virtual-ip' not found"), + (1, NO_MAINTENANCE_OUT, ""), + (0, "Assumed agent name 'ocf:heartbeat:IPaddr2'", ""), 
+ (1, NO_MAINTENANCE_OUT, ""), + (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Stopped", ""), + ] + + def side_effect(self_, **kwargs): + return run_command_calls.pop(0) + + mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect) + + with pytest.raises(SystemExit): + with pytest.MonkeyPatch().context() as mp: + mp.setattr( + "ansible.module_utils.basic._ANSIBLE_ARGS", + json.dumps( + { + "ANSIBLE_MODULE_ARGS": { + "state": "present", + "name": "virtual-ip", + "resource_type": {"resource_name": "IPaddr2"}, + "resource_option": ["ip=192.168.2.1"], + "wait": 10, + } + } + ).encode(), + ) + mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False) + pacemaker_resource.main() + + out, _err = capfd.readouterr() + result = json.loads(out) + assert result.get("failed") is True + assert "Timed out" in result["msg"] + assert "virtual-ip" in result["msg"] diff --git a/tests/unit/plugins/modules/test_pacemaker_resource.yaml b/tests/unit/plugins/modules/test_pacemaker_resource.yaml index 25953c40a1..f755a26913 100644 --- a/tests/unit/plugins/modules/test_pacemaker_resource.yaml +++ b/tests/unit/plugins/modules/test_pacemaker_resource.yaml @@ -40,11 +40,26 @@ test_cases: dc-version=2.1.9-1.fc41-7188dbf have-watchdog=false err: "" - - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300] + - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"] environ: *env-def rc: 0 out: "Assumed agent name 'ocf:heartbeat:IPaddr2' (deduced from 'IPAddr2')" err: "" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 1 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure=corosync + cluster-name=hacluster + dc-version=2.1.9-1.fc41-7188dbf + have-watchdog=false + err: "" + - command: [/testbin/pcs, resource, status, virtual-ip] + environ: *env-def + rc: 0 + out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started" + err: "" 
- command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 @@ -103,11 +118,26 @@ test_cases: dc-version=2.1.9-1.fc41-7188dbf have-watchdog=false err: "" - - command: [/testbin/pcs, resource, create, virtual-ip, ocf:heartbeat:IPaddr2, "ip=[192.168.2.1]", op, start, timeout=1200, op, stop, timeout=1200, op, monitor, timeout=1200, meta, test_meta1=123, meta, test_meta2=456, --group, test_group, clone, test_clone, test_clone_meta1=789, --wait=200] + - command: [/testbin/pcs, resource, create, virtual-ip, ocf:heartbeat:IPaddr2, "ip=[192.168.2.1]", op, start, timeout=1200, op, stop, timeout=1200, op, monitor, timeout=1200, meta, test_meta1=123, meta, test_meta2=456, --group, test_group, clone, test_clone, test_clone_meta1=789] environ: *env-def rc: 0 out: "Assumed agent name 'ocf:heartbeat:IPaddr2'" err: "" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 1 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure=corosync + cluster-name=hacluster + dc-version=2.1.9-1.fc41-7188dbf + have-watchdog=false + err: "" + - command: [/testbin/pcs, resource, status, virtual-ip] + environ: *env-def + rc: 0 + out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started" + err: "" - command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 @@ -142,11 +172,26 @@ test_cases: dc-version=2.1.9-1.fc41-7188dbf have-watchdog=false err: "" - - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300] + - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"] environ: *env-def rc: 1 out: "" err: "Error: 'virtual-ip' already exists\n" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 1 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure=corosync + cluster-name=hacluster + dc-version=2.1.9-1.fc41-7188dbf + have-watchdog=false + err: "" + - command: [/testbin/pcs, resource, status, virtual-ip] + environ: 
*env-def + rc: 0 + out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started" + err: "" - command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 @@ -182,11 +227,22 @@ test_cases: have-watchdog=false maintenance-mode=true err: "" - - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300] + - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"] environ: *env-def - rc: 1 + rc: 0 out: "" - err: "Error: resource 'virtual-ip' is not running on any node" + err: "" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 0 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure=corosync + cluster-name=hacluster + dc-version=2.1.9-1.fc41-7188dbf + have-watchdog=false + maintenance-mode=true + err: "" - command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 @@ -222,11 +278,22 @@ test_cases: have-watchdog: false maintenance-mode: true err: "" - - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300] + - command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"] environ: *env-def - rc: 1 + rc: 0 out: "" - err: "Error: resource 'virtual-ip' is not running on any node" + err: "" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 0 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure: corosync + cluster-name: hacluster + dc-version: 2.1.9-1.fc41-7188dbf + have-watchdog: false + maintenance-mode: true + err: "" - command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 @@ -523,7 +590,7 @@ test_cases: dc-version=2.1.9-1.fc41-7188dbf have-watchdog=false err: "" - - command: [/testbin/pcs, resource, clone, virtual-ip, --wait=300] + - command: [/testbin/pcs, resource, clone, virtual-ip] environ: *env-def rc: 1 out: "" @@ -558,11 +625,26 @@ test_cases: dc-version=2.1.9-1.fc41-7188dbf have-watchdog=false err: 
"" - - command: [/testbin/pcs, resource, clone, virtual-ip, test_clone, meta, test_clone_meta1=789, --wait=200] + - command: [/testbin/pcs, resource, clone, virtual-ip, test_clone, meta, test_clone_meta1=789] environ: *env-def rc: 0 out: "" err: "" + - command: [/testbin/pcs, property, config] + environ: *env-def + rc: 1 + out: | + Cluster Properties: cib-bootstrap-options + cluster-infrastructure=corosync + cluster-name=hacluster + dc-version=2.1.9-1.fc41-7188dbf + have-watchdog=false + err: "" + - command: [/testbin/pcs, resource, status, virtual-ip] + environ: *env-def + rc: 0 + out: " * Clone Set: virtual-ip-clone [virtual-ip]\t(ocf:heartbeat:IPAddr2):\t Started" + err: "" - command: [/testbin/pcs, resource, status, virtual-ip] environ: *env-def rc: 0 diff --git a/tests/unit/plugins/modules/test_pacemaker_stonith.py b/tests/unit/plugins/modules/test_pacemaker_stonith.py index 7b32b245df..891c9ac90a 100644 --- a/tests/unit/plugins/modules/test_pacemaker_stonith.py +++ b/tests/unit/plugins/modules/test_pacemaker_stonith.py @@ -10,8 +10,125 @@ from __future__ import annotations +import json + +import pytest + from ansible_collections.community.general.plugins.modules import pacemaker_stonith from .uthelper import RunCommandMock, UTHelper UTHelper.from_module(pacemaker_stonith, __name__, mocks=[RunCommandMock]) + + +# --------------------------------------------------------------------------- +# Race condition tests: resource starts after one or more Stopped polls +# --------------------------------------------------------------------------- + + +@pytest.fixture +def patch_bin(mocker): + def mockie(self_, path, *args, **kwargs): + return f"/testbin/{path}" + + mocker.patch("ansible.module_utils.basic.AnsibleModule.get_bin_path", mockie) + + +@pytest.mark.usefixtures("patch_bin") +def test_present_race_condition_stopped_then_started(mocker, capfd): + """Resource reports Stopped on the first poll then Started on the second — must succeed.""" + 
mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep") + + # Sequence of run_command calls: + # 1. initial _get(): stonith status → not found (rc=1) + # 2. state_present create → rc=0 + # 3. wait_for_resource poll 1: status → Stopped (not yet running) + # 4. wait_for_resource poll 2: status → Started + # 5. __quit_module__ _get(): status → Started + run_command_calls = [ + (1, "", ""), + (0, "", ""), + (0, " * virtual-stonith\t(stonith:fence_virt):\t Stopped", ""), + (0, " * virtual-stonith\t(stonith:fence_virt):\t Started", ""), + (0, " * virtual-stonith\t(stonith:fence_virt):\t Started", ""), + ] + + def side_effect(self_, **kwargs): + return run_command_calls.pop(0) + + mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect) + + with pytest.raises(SystemExit): + with pytest.MonkeyPatch().context() as mp: + mp.setattr( + "ansible.module_utils.basic._ANSIBLE_ARGS", + json.dumps( + { + "ANSIBLE_MODULE_ARGS": { + "state": "present", + "name": "virtual-stonith", + "stonith_type": "fence_virt", + "stonith_options": ["pcmk_host_list=f1"], + "wait": 30, + } + } + ).encode(), + ) + mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False) + pacemaker_stonith.main() + + out, _err = capfd.readouterr() + result = json.loads(out) + assert result["changed"] is True + assert result.get("failed") is not True + assert "Started" in result["value"] + + +@pytest.mark.usefixtures("patch_bin") +def test_present_wait_timeout_raises(mocker, capfd): + """Resource never starts within the wait window — must fail with a timeout message.""" + mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep") + + # Simulate time advancing past the deadline immediately on the first poll + monotonic_values = iter([0.0, 999.0]) + mocker.patch( + "ansible_collections.community.general.plugins.module_utils.pacemaker.time.monotonic", + side_effect=lambda: 
next(monotonic_values), + ) + + # Sequence: initial _get() → not found, create → rc=0, poll → Stopped (deadline exceeded) + run_command_calls = [ + (1, "", ""), + (0, "", ""), + (0, " * virtual-stonith\t(stonith:fence_virt):\t Stopped", ""), + ] + + def side_effect(self_, **kwargs): + return run_command_calls.pop(0) + + mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect) + + with pytest.raises(SystemExit): + with pytest.MonkeyPatch().context() as mp: + mp.setattr( + "ansible.module_utils.basic._ANSIBLE_ARGS", + json.dumps( + { + "ANSIBLE_MODULE_ARGS": { + "state": "present", + "name": "virtual-stonith", + "stonith_type": "fence_virt", + "stonith_options": ["pcmk_host_list=f1"], + "wait": 10, + } + } + ).encode(), + ) + mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False) + pacemaker_stonith.main() + + out, _err = capfd.readouterr() + result = json.loads(out) + assert result.get("failed") is True + assert "Timed out" in result["msg"] + assert "virtual-stonith" in result["msg"] diff --git a/tests/unit/plugins/modules/test_pacemaker_stonith.yaml b/tests/unit/plugins/modules/test_pacemaker_stonith.yaml index cf6bdbc7ff..cf2c71a8b0 100644 --- a/tests/unit/plugins/modules/test_pacemaker_stonith.yaml +++ b/tests/unit/plugins/modules/test_pacemaker_stonith.yaml @@ -33,7 +33,7 @@ test_cases: rc: 1 out: "" err: "" - - command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s", "--wait=300"] + - command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s"] environ: *env-def rc: 0 out: "" @@ -43,6 +43,11 @@ test_cases: rc: 0 out: " * virtual-stonith\t(stonith:fence_virt):\t Started" err: "" + - command: ["/testbin/pcs", stonith, status, virtual-stonith] + environ: *env-def + rc: 0 + out: " * virtual-stonith\t(stonith:fence_virt):\t Started" + err: "" - id: 
test_present_minimal_input_stonith_exists input: state: present @@ -65,7 +70,7 @@ test_cases: rc: 0 out: " * virtual-stonith\t(stonith:fence_virt):\t Started" err: "" - - command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s", "--wait=300"] + - command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s"] environ: *env-def rc: 0 out: "" @@ -75,6 +80,11 @@ test_cases: rc: 0 out: " * virtual-stonith\t(stonith:fence_virt):\t Started" err: "" + - command: ["/testbin/pcs", stonith, status, virtual-stonith] + environ: *env-def + rc: 0 + out: " * virtual-stonith\t(stonith:fence_virt):\t Started" + err: "" - id: test_absent_minimal_input_stonith_not_exists input: state: absent