[Stretch cluster] Stabilize stretch cluster tests and support CNV workloads in MON & OSD failure test cases #10580

Open · wants to merge 7 commits into base: master
Changes from 5 commits
345 changes: 245 additions & 100 deletions ocs_ci/helpers/stretchcluster_helper.py
@@ -1,22 +1,52 @@
import re
import logging

from ocs_ci.ocs.utils import retry
from ocs_ci.helpers.helpers import (
modify_deployment_replica_count,
modify_statefulset_replica_count,
modify_job_parallelism_count,
)
from ocs_ci.ocs.constants import LOGREADER_CEPHFS_LABEL, LOGWRITER_RBD_LABEL
from ocs_ci.ocs.exceptions import CommandFailed, UnexpectedBehaviour
from ocs_ci.ocs.resources.pod import (
wait_for_pods_deletion,
get_not_running_pods,
get_pod_node,
)
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP


logger = logging.getLogger(__name__)


def check_for_logwriter_workload_pods(sc_obj, nodes=None):
"""
    Check if the logwriter workload pods are in a healthy state

Args:
sc_obj (StretchCluster): Stretch cluster object
nodes (Fixture): Nodes fixture identifying the platform nodes

"""
try:
sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGREADER_CEPHFS_LABEL,
statuses=[constants.STATUS_RUNNING, constants.STATUS_COMPLETED],
)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
)
except UnexpectedBehaviour:
logger.info("some pods are not running, so trying the work-around")
recover_by_zone_restart(sc_obj, nodes=nodes)
logger.info("All the workloads pods are successfully up and running")


@retry(UnexpectedBehaviour, tries=5, delay=10, backoff=1)
def recover_workload_pods_post_recovery(sc_obj, pods_not_running=None):
"""
    There is a known issue (https://bugzilla.redhat.com/show_bug.cgi?id=2244353);
    this function applies the workaround to resolve it.
@@ -27,6 +57,14 @@ def recover_workload_pods_post_recovery(sc_obj, pods_not_running):
pods_not_running (List): A list of Pod objects that are not in Running state

"""

# fetch the not running pods
if not pods_not_running:
logger.info("Fetching pods that are not running or terminating")
pods_not_running = get_not_running_pods(
namespace=constants.STRETCH_CLUSTER_NAMESPACE
)

# try to scale down and scale up the deployment/sts
# if any of the mentioned errors are found
error_messages = [
@@ -44,116 +82,223 @@ def check_errors_regex(desc_out, err_msgs):
return bool(re.search(pattern, desc_out))

pod_names = [pod.name for pod in pods_not_running]
logger.info(f"Pods not running: {pod_names}")
scaled_down = []
logger.info(f"These are the pods not running: {pod_names}")

    workload_labels = [
        constants.LOGWRITER_CEPHFS_LABEL,
        LOGREADER_CEPHFS_LABEL,
        LOGWRITER_RBD_LABEL,
    ]

for app_label in workload_labels:
for pod in pods_not_running:

            # Delete any pod that is in Error or ContainerStatusUnknown status
            try:
                if pod.status() in [
                    constants.STATUS_CONTAINER_STATUS_UNKNOWN,
                    constants.STATUS_ERROR,
                ]:
                    logger.info(
                        f"Pod {pod.name} is in either {constants.STATUS_CONTAINER_STATUS_UNKNOWN} "
                        f"or {constants.STATUS_ERROR} state, hence deleting the pod"
                    )
                    pod.delete()
                    continue

                # Get the pod describe output to verify the error
                logger.info(f"Fetching the `oc describe` output for pod {pod.name}")
                desc_out = OCP(
                    namespace=constants.STRETCH_CLUSTER_NAMESPACE,
                ).exec_oc_cmd(command=f"describe pod {pod.name}", out_yaml_format=False)
            except CommandFailed as e:
                if "NotFound" in e.args[0]:
                    continue
                else:
                    raise e

# checks for errors in the pod describe output
if check_errors_regex(desc_out, error_messages):

if (
app_label.split("=")[1] in str(pod.get_labels())
and app_label == constants.LOGWRITER_CEPHFS_LABEL
):

logger.info("Scaling down the deployment for logwriter")
modify_deployment_replica_count(
deployment_name=constants.LOGWRITER_CEPHFS_NAME,
replica_count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
wait_for_pods_deletion(
constants.LOGWRITER_CEPHFS_LABEL,
timeout=300,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
modify_deployment_replica_count(
deployment_name=constants.LOGWRITER_CEPHFS_NAME,
replica_count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
break
Reviewer comment (Contributor): why do we need a break after every elif? Wouldn't a single break at the end of the if/elif chain be sufficient?

elif (
app_label.split("=")[1] in str(pod.get_labels())
and app_label == constants.LOGREADER_CEPHFS_LABEL
):

logger.info("Scaling down the job for logreader")
modify_job_parallelism_count(
job_name=constants.LOGREADER_CEPHFS_NAME,
count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
wait_for_pods_deletion(
constants.LOGREADER_CEPHFS_LABEL,
timeout=300,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
modify_job_parallelism_count(
job_name=constants.LOGREADER_CEPHFS_NAME,
count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
break

elif (
app_label.split("=")[1] in str(pod.get_labels())
and app_label == constants.LOGWRITER_RBD_LABEL
):

logger.info("Scaling down logwriter rbd statefulset")
modify_statefulset_replica_count(
statefulset_name=constants.LOGWRITER_RBD_NAME,
replica_count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
wait_for_pods_deletion(
constants.LOGWRITER_RBD_LABEL,
timeout=300,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
modify_statefulset_replica_count(
statefulset_name=constants.LOGWRITER_RBD_NAME,
replica_count=2,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
break

# fetch workload pod details now and make sure all of them are running
logger.info("Checking if the logwriter pods are up and running now")
sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGREADER_CEPHFS_LABEL, statuses=["Running", "Completed"]
)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
)
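
As an aside, the error matching used by both recovery helpers reduces to one alternation regex built from escaped literal strings. A standalone sketch follows; the sample describe output is invented purely for illustration:

import re

# Known error snippets, as listed in this diff
error_messages = [
    "is not a mountpoint",
    "not found in the list of registered CSI drivers",
    "timed out waiting for the condition",
    "Error: failed to resolve symlink",
    "permission denied",
]


def check_errors_regex(desc_out, err_msgs):
    # Escape each message so regex metacharacters match literally,
    # then OR them together into a single pattern
    pattern = "|".join(map(re.escape, err_msgs))
    return bool(re.search(pattern, desc_out))


# Invented sample output, purely for illustration
sample_describe = "MountVolume.SetUp failed: staging path is not a mountpoint"
assert check_errors_regex(sample_describe, error_messages)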


@retry(UnexpectedBehaviour, tries=3, delay=10, backoff=1)
def recover_by_zone_restart(sc_obj, nodes):
"""
    Recover the logwriter workload pods by restarting the nodes in a data zone
    when any of the known errors is found in the pods

Args:
sc_obj (StretchCluster): StretchCluster Object
nodes (Fixture): Nodes fixture identifying the platform nodes

"""
logger.info("Fetching pods that are not running or terminating")
pods_not_running = get_not_running_pods(
namespace=constants.STRETCH_CLUSTER_NAMESPACE
)

# restart the pod nodes if any of the mentioned errors are found
error_messages = [
"is not a mountpoint",
"not found in the list of registered CSI drivers",
"timed out waiting for the condition",
"Error: failed to resolve symlink",
"permission denied",
]

    # function that will return True if any of the error messages
    # is present in the describe output
def check_errors_regex(desc_out, err_msgs):
pattern = "|".join(map(re.escape, err_msgs))
return bool(re.search(pattern, desc_out))

restarted = False

for pod in pods_not_running:

# Delete any pod that is in Error or ContainerStatusUnknown status
try:
if pod.status() in [
constants.STATUS_CONTAINER_STATUS_UNKNOWN,
constants.STATUS_ERROR,
]:
logger.info(
f"Pod {pod.name} in either {constants.STATUS_CONTAINER_STATUS_UNKNOWN} "
f"or {constants.STATUS_ERROR}. hence deleting the pod"
)
scaled_down.append(constants.LOGREADER_CEPHFS_LABEL)

# for all the scaled down workloads we scale them up
# one by one
for label in scaled_down:
if label == constants.LOGWRITER_CEPHFS_LABEL:
modify_deployment_replica_count(
deployment_name=constants.LOGWRITER_CEPHFS_NAME,
replica_count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
elif label == constants.LOGWRITER_RBD_LABEL:
modify_statefulset_replica_count(
statefulset_name=constants.LOGWRITER_RBD_NAME,
replica_count=2,
pod.delete()
continue

# Get the pod describe output to verify the error
logger.info(f"Fetching the `oc describe` output for pod {pod.name}")
desc_out = OCP(
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
).exec_oc_cmd(command=f"describe pod {pod.name}", out_yaml_format=False)
except CommandFailed as e:
if "NotFound" in e.args[0]:
continue
else:
raise e

if check_errors_regex(desc_out, error_messages) and not restarted:

pod_node = get_pod_node(pod)
logger.info(
f"We need to restart the all the nodes in the zone of node {pod_node.name}"
)
elif label == constants.LOGREADER_CEPHFS_LABEL:
modify_job_parallelism_count(
job_name=constants.LOGREADER_CEPHFS_NAME,
count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
node_labels = pod_node.get()["metadata"]["labels"]

logger.info(f"Identifying the zone of the node {pod_node.name}")
for zone in constants.DATA_ZONE_LABELS:
if (
constants.ZONE_LABEL in node_labels.keys()
and node_labels[constants.ZONE_LABEL] == zone
):
zone_to_restart = zone
break

logger.info(
f"We need to restart all the worker nodes in zone {zone_to_restart}"
)
nodes_in_zone = sc_obj.get_nodes_in_zone(zone_to_restart)
nodes_to_restart = list()
for node_obj in nodes_in_zone:
node_labels = node_obj.get()["metadata"]["labels"]
if constants.WORKER_LABEL in node_labels.keys():
nodes_to_restart.append(node_obj)

nodes.restart_nodes(nodes=nodes_to_restart)
restarted = True

if not restarted:
logger.error(
"Raising exception because none of the pods are failing "
"because of known errors and no nodes restart was done."
"Please check..."
)
raise Exception

# fetch workload pod details now and make sure all of them are running
logger.info("Checking if the logwriter pods are up and running now")
sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGREADER_CEPHFS_LABEL, statuses=["Running", "Completed"]
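To make the zone-restart path easier to follow, here is a self-contained sketch of the node selection performed by recover_by_zone_restart, with plain dicts standing in for node objects; the label keys and zone names below are assumptions standing in for constants.ZONE_LABEL, constants.WORKER_LABEL and constants.DATA_ZONE_LABELS:

# Standalone sketch of the zone/worker selection in recover_by_zone_restart
ZONE_LABEL = "topology.kubernetes.io/zone"       # assumption
WORKER_LABEL = "node-role.kubernetes.io/worker"  # assumption
DATA_ZONE_LABELS = ["data-1", "data-2"]          # assumption


def pick_zone_to_restart(pod_node_labels):
    """Return the data zone of the node hosting the stuck pod, if any."""
    for zone in DATA_ZONE_LABELS:
        if pod_node_labels.get(ZONE_LABEL) == zone:
            return zone
    return None


def worker_nodes_in_zone(all_node_labels, zone):
    """Filter node label dicts down to worker nodes in the given zone."""
    return [
        labels
        for labels in all_node_labels
        if labels.get(ZONE_LABEL) == zone and WORKER_LABEL in labels
    ]


# Example: only the worker in the stuck pod's zone is selected for restart
node_label_dicts = [
    {ZONE_LABEL: "data-1", WORKER_LABEL: ""},
    {ZONE_LABEL: "data-2", WORKER_LABEL: ""},
    {ZONE_LABEL: "data-1"},  # e.g. a control-plane node, no worker label
]
zone = pick_zone_to_restart({ZONE_LABEL: "data-1"})
assert worker_nodes_in_zone(node_label_dicts, zone) == [node_label_dicts[0]]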
1 change: 1 addition & 0 deletions ocs_ci/ocs/resources/pod.py
@@ -2488,6 +2488,7 @@ def get_not_running_pods(selector=None, namespace=config.ENV_DATA["cluster_names
if (
status != constants.STATUS_RUNNING
and status != constants.STATUS_TERMINATING
and status != constants.STATUS_COMPLETED
):
pods_not_running.append(pod)
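
The pod.py change is a one-line filter tweak: pods in Completed state (e.g. finished logreader jobs) are no longer reported as "not running". A minimal sketch of the predicate before and after, with status strings assumed to mirror the constants:

# Sketch of the filter change in get_not_running_pods
RUNNING, TERMINATING, COMPLETED = "Running", "Terminating", "Completed"


def is_not_running_old(status):
    # Before this PR: Completed pods were also treated as "not running"
    return status not in (RUNNING, TERMINATING)


def is_not_running_new(status):
    # After this PR: Completed pods are ignored by the recovery helpers
    return status not in (RUNNING, TERMINATING, COMPLETED)


assert is_not_running_old("Completed") is True
assert is_not_running_new("Completed") is False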
