Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test_coredump_check_for_ceph_daemon_crash coredump not getting generated for ceph #10989

Open
DanielOsypenko opened this issue Dec 4, 2024 · 1 comment
Assignees

Comments

@DanielOsypenko
Copy link
Contributor

Test is unstable

[2024-12-03T22:09:11.681Z] _________ TestKillCephDaemon.test_coredump_check_for_ceph_daemon_crash _________
[2024-12-03T22:09:11.681Z] 
[2024-12-03T22:09:11.681Z] self = <tests.functional.z_cluster.test_coredump_check_for_ceph_daemon_crash.TestKillCephDaemon object at 0x7fe5255419a0>
[2024-12-03T22:09:11.681Z] 
[2024-12-03T22:09:11.681Z]     def test_coredump_check_for_ceph_daemon_crash(self):
[2024-12-03T22:09:11.681Z]         """
[2024-12-03T22:09:11.681Z]         Verify coredumpctl list updated after killing daemon
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         """
[2024-12-03T22:09:11.681Z]         log.info("Get Node name where mon pod running")
[2024-12-03T22:09:11.681Z]         mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
[2024-12-03T22:09:11.681Z]         mon_pod_node_names = [node.name for node in mon_pod_nodes]
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         log.info("Get Node name where mgr pod running")
[2024-12-03T22:09:11.681Z]         mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
[2024-12-03T22:09:11.681Z]         mgr_pod_node_names = [node.name for node in mgr_pod_nodes]
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         log.info("Get Node name where osd pod running")
[2024-12-03T22:09:11.681Z]         osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
[2024-12-03T22:09:11.681Z]         osd_pod_node_names = [node.name for node in osd_pod_nodes]
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection(
[2024-12-03T22:09:11.681Z]             osd_pod_node_names, mon_pod_node_names
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         node_mgr_osd_names = set(mgr_pod_node_names).intersection(osd_pod_node_names)
[2024-12-03T22:09:11.681Z]         node_mgr_mon_names = set(mgr_pod_node_names).intersection(mon_pod_node_names)
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         if len(node_mgr_mon_osd_names) > 0:
[2024-12-03T22:09:11.681Z]             daemon_types = ["mgr", "osd", "mon"]
[2024-12-03T22:09:11.681Z]             node_name = list(node_mgr_mon_osd_names)[0]
[2024-12-03T22:09:11.681Z]         elif len(node_mgr_osd_names) > 0:
[2024-12-03T22:09:11.681Z]             daemon_types = ["mgr", "osd"]
[2024-12-03T22:09:11.681Z]             node_name = list(node_mgr_osd_names)[0]
[2024-12-03T22:09:11.681Z]         elif len(node_mgr_mon_names) > 0:
[2024-12-03T22:09:11.681Z]             daemon_types = ["mgr", "mon"]
[2024-12-03T22:09:11.681Z]             node_name = list(node_mgr_mon_names)[0]
[2024-12-03T22:09:11.681Z]         else:
[2024-12-03T22:09:11.681Z]             daemon_types = ["mgr"]
[2024-12-03T22:09:11.681Z]             node_name = mgr_pod_node_names[0]
[2024-12-03T22:09:11.681Z]         log.info(f"Test the daemon_types {daemon_types} on node {node_name}")
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         log.info(
[2024-12-03T22:09:11.681Z]             "Delete the contents of 'posted' directory "
[2024-12-03T22:09:11.681Z]             "`/var/lib/rook/openshift-storage/crash/posted/`"
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         cmd_bash = (
[2024-12-03T22:09:11.681Z]             f"oc debug nodes/{node_name} --to-namespace={config.ENV_DATA['cluster_namespace']} "
[2024-12-03T22:09:11.681Z]             "-- chroot /host /bin/bash -c "
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
[2024-12-03T22:09:11.681Z]         cmd = cmd_bash + cmd_delete_files
[2024-12-03T22:09:11.681Z]         run_cmd(cmd=cmd)
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         for daemon_type in daemon_types:
[2024-12-03T22:09:11.681Z]             log.info(f"find ceph-{daemon_type} process-id")
[2024-12-03T22:09:11.681Z]             cmd_pid = f"pidof ceph-{daemon_type}"
[2024-12-03T22:09:11.681Z]             cmd_gen = (
[2024-12-03T22:09:11.681Z]                 "oc debug node/"
[2024-12-03T22:09:11.681Z]                 + node_name
[2024-12-03T22:09:11.681Z]                 + f" --to-namespace={config.ENV_DATA['cluster_namespace']} -- chroot /host "
[2024-12-03T22:09:11.681Z]             )
[2024-12-03T22:09:11.681Z]             cmd = cmd_gen + cmd_pid
[2024-12-03T22:09:11.681Z]             out = run_cmd(cmd=cmd)
[2024-12-03T22:09:11.681Z]             pids = out.strip().split()
[2024-12-03T22:09:11.681Z]             pid = pids[0]
[2024-12-03T22:09:11.681Z]             if not pid.isnumeric():
[2024-12-03T22:09:11.681Z]                 raise Exception(f"The ceph-{daemon_type} process-id was not found.")
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]             log.info(f"Kill ceph-{daemon_type} process-id {pid}")
[2024-12-03T22:09:11.681Z]             disruptions_obj = Disruptions()
[2024-12-03T22:09:11.681Z]             disruptions_obj.daemon_pid = pid
[2024-12-03T22:09:11.681Z]             disruptions_obj.kill_daemon(
[2024-12-03T22:09:11.681Z]                 node_name=node_name, check_new_pid=False, kill_signal="11"
[2024-12-03T22:09:11.681Z]             )
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         log.info(
[2024-12-03T22:09:11.681Z]             f"Verify that we have a crash event for ceph-{daemon_types} crash (tool pod)"
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         sample = TimeoutSampler(
[2024-12-03T22:09:11.681Z]             timeout=600,
[2024-12-03T22:09:11.681Z]             sleep=10,
[2024-12-03T22:09:11.681Z]             func=run_cmd_verify_cli_output,
[2024-12-03T22:09:11.681Z]             cmd="ceph crash ls",
[2024-12-03T22:09:11.681Z]             expected_output_lst=daemon_types,
[2024-12-03T22:09:11.681Z]             cephtool_cmd=True,
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         if not sample.wait_for_func_status(True):
[2024-12-03T22:09:11.681Z]             raise Exception(
[2024-12-03T22:09:11.681Z]                 f"ceph-{daemon_types} process does not exist on crash list (tool pod)"
[2024-12-03T22:09:11.681Z]             )
[2024-12-03T22:09:11.681Z]     
[2024-12-03T22:09:11.681Z]         log.info(
[2024-12-03T22:09:11.681Z]             f"Verify coredumpctl list updated after killing {daemon_types} daemons on {node_name}"
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         sample = TimeoutSampler(
[2024-12-03T22:09:11.681Z]             timeout=600,
[2024-12-03T22:09:11.681Z]             sleep=10,
[2024-12-03T22:09:11.681Z]             func=run_cmd_verify_cli_output,
[2024-12-03T22:09:11.681Z]             cmd="coredumpctl list",
[2024-12-03T22:09:11.681Z]             expected_output_lst=daemon_types,
[2024-12-03T22:09:11.681Z]             debug_node=node_name,
[2024-12-03T22:09:11.681Z]         )
[2024-12-03T22:09:11.681Z]         if not sample.wait_for_func_status(True):
[2024-12-03T22:09:11.681Z] >           raise Exception(
[2024-12-03T22:09:11.681Z]                 f"coredump not getting generated for ceph-{daemon_types} daemon crash"
[2024-12-03T22:09:11.681Z]             )
[2024-12-03T22:09:11.681Z] E           Exception: coredump not getting generated for ceph-['mgr', 'osd', 'mon'] daemon crash
[2024-12-03T22:09:11.681Z] 
[2024-12-03T22:09:11.681Z] /home/jenkins/workspace/qe-deploy-ocs-cluster/ocs-ci/tests/functional/z_cluster/test_coredump_check_for_ceph_daemon_crash.py:165: Exception
@DanielOsypenko
Copy link
Contributor Author

@prsurve the test is failing with the output shown above.
Could you please check whether this looks suspicious and whether we should open a bug?

The test has never passed on 4.17 (8 of 8 runs failed).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants