diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 3d57a507b..9ac7fc923 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -21,6 +21,22 @@ on:
         description: 'Name of test to run. Empty to run all'
         required: false
         default: ''
+      run_failed_tests_only:
+        description: "Run only the failed tests from the last run"
+        required: false
+        default: false
+        type: boolean
+      run_unexecuted_tests:
+        description: "Run tests that were not executed in the last run"
+        required: false
+        default: false
+        type: boolean
+      retry_count:
+        description: "Number of retries for failed test cases"
+        required: false
+        default: 1
+        type: number
+
 jobs:
   e2e:
     runs-on: self-hosted
@@ -70,7 +86,7 @@ jobs:
             -backend-config="dynamodb_table=${TFSTATE_DYNAMODB_TABLE}" \
             -backend-config="encrypt=true"
 
-      - name: select or create workspace
+      - name: Select or Create Workspace
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
           terraform workspace select -or-create ghiaction-sbclie2e
@@ -83,8 +99,10 @@
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
           terraform plan \
             -var "mgmt_nodes=1" -var "storage_nodes=3" -var "volumes_per_storage_nodes=3" \
-            -var "extra_nodes=1" -var "extra_nodes_instance_type=m6id.large" \
-            -var "region=us-east-2" -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" -out=tfplan
+            -var mgmt_nodes_instance_type="m6id.xlarge" -var storage_nodes_instance_type="m6id.xlarge" \
+            -var "extra_nodes=1" -var "extra_nodes_instance_type=m6id.xlarge" \
+            -var storage_nodes_ebs_size2=100 -var "region=us-east-2" \
+            -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" -out=tfplan
       - name: Apply Terraform Changes
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
@@ -100,13 +118,13 @@
       - name: Bootstrap Cluster
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
-          ./bootstrap-cluster.sh --max-lvol 10 --max-snap 10 --max-prov 450g --number-of-devices 3
+          ./bootstrap-cluster.sh --max-lvol 100 --max-snap 100 --max-prov 360G --number-of-devices 3
         id: bootstrap_cluster
         env:
           SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}
 
       - name: Setup Tests & Run Tests
-        timeout-minutes: 60
+        timeout-minutes: 120
         run: |
           cd $GITHUB_WORKSPACE/e2e
           sudo apt-get install -y python3.12-venv
@@ -114,6 +132,12 @@
           source myenv/bin/activate
           python3 -m pip install -r requirements.txt
           echo "Running tests in namespace ${{ steps.get-namespace.outputs.namespace }}"
+
+          # Clean branch name to remove slashes for filename
+          BRANCH_NAME_SAFE=$(echo "${{ github.ref_name }}" | tr '/' '_')
+          export BRANCH_NAME=${BRANCH_NAME_SAFE}
+          export FAILED_CASES_FILE="failed_cases_${BRANCH_NAME_SAFE}.json"
+          export EXECUTED_CASES_FILE="executed_cases_${BRANCH_NAME_SAFE}.json"
           export CLUSTER_ID=${{ steps.bootstrap_cluster.outputs.cluster_id }}
           export CLUSTER_SECRET=${{ steps.bootstrap_cluster.outputs.cluster_secret }}
           export CLUSTER_IP=${{ steps.bootstrap_cluster.outputs.cluster_ip }}
@@ -128,7 +152,13 @@
           if [ -n "${{ github.event.inputs.testname }}" ]; then
             TESTNAME="--testname ${{ github.event.inputs.testname }}"
           fi
-          python3 e2e.py $TESTNAME
+          if [ "${{ github.event.inputs.run_failed_tests_only }}" = "true" ]; then
+            python3 e2e.py --failed_only --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          elif [ "${{ github.event.inputs.run_unexecuted_tests }}" = "true" ]; then
+            python3 e2e.py --unexecuted_only --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          else
+            python3 e2e.py --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          fi
       - name: Upload automation and docker logs to s3
         run: |
           cd $GITHUB_WORKSPACE/e2e/logs
@@ -243,4 +273,4 @@
           ls -la ./
           rm -rf ./* || true
           rm -rf ./.??* || true
-          ls -la ./
\ No newline at end of file
+          ls -la ./
diff --git a/e2e/e2e.py b/e2e/e2e.py
index b224469c2..fde2ae966 100644
--- a/e2e/e2e.py
+++ b/e2e/e2e.py
@@ -1,58 +1,105 @@
 ### simplyblock e2e tests
 import argparse
+import os
+import json
 import traceback
 
 from __init__ import get_all_tests
 from logger_config import setup_logger
-from exceptions.custom_exception import (
-    TestNotFoundException,
-    MultipleExceptions
-)
+from exceptions.custom_exception import TestNotFoundException, MultipleExceptions
 from e2e_tests.cluster_test_base import TestClusterBase
 from utils.sbcli_utils import SbcliUtils
 from utils.ssh_utils import SshUtils
 
 
 def main():
-    """Run complete test suite
-    """
+    """Run the complete test suite or specific tests."""
     parser = argparse.ArgumentParser(description="Run simplyBlock's E2E Test Framework")
     parser.add_argument('--testname', type=str, help="The name of the test to run", default=None)
     parser.add_argument('--fio_debug', type=bool, help="Add debug flag to fio", default=False)
+    parser.add_argument('--failed_only', action='store_true', help="Run only failed tests from last run", default=False)
+    parser.add_argument('--unexecuted_only', action='store_true', help="Run only unexecuted tests from last run", default=False)
+    parser.add_argument('--branch', type=str, help="Branch name to uniquely store test results", required=True)
+    parser.add_argument('--retry', type=int, help="Number of retries for failed cases", default=1)
 
     args = parser.parse_args()
 
     tests = get_all_tests()
-    # Find the test class based on the provided test name
-    test_class_run = []
-    if args.testname is None or len(args.testname.strip()) == 0:
-        test_class_run = tests
+
+    # Files to store failed/executed test cases for the specific branch
+    base_dir = os.path.join(os.path.expanduser('~'), 'e2e_test_runs_fail_unexec_json')
+    if not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+    failed_cases_file = os.path.join(base_dir,
+                                     f'failed_cases_{args.branch}.json')
+    executed_cases_file = os.path.join(base_dir,
+                                       f'executed_cases_{args.branch}.json')
+
+    logger.info(f"Failed only: {args.failed_only}")
+    logger.info(f"Unexecuted only: {args.unexecuted_only}")
+    logger.info(f"Failed case file: {failed_cases_file}")
+    logger.info(f"File exists: {os.path.exists(failed_cases_file)}")
+
+    onlyfiles = [f for f in os.listdir(base_dir) if os.path.isfile(os.path.join(base_dir, f))]
+    logger.info(f"List of files: {onlyfiles}")
+
+    # Load previously failed cases if '--failed_only' is set
+    if args.failed_only and os.path.exists(failed_cases_file):
+        logger.info("Running failed cases only")
+        with open(failed_cases_file, 'r', encoding='utf-8') as file:
+            failed_tests = json.load(file)
+        test_class_run = [cls for cls in tests
+                          if any(ft in f'{cls.__name__}' for ft in failed_tests)]
+
+        logger.info(f"Running failed cases only: {test_class_run}")
+    elif args.unexecuted_only and os.path.exists(executed_cases_file):
+        logger.info("Running unexecuted cases only")
+        with open(executed_cases_file, 'r', encoding='utf-8') as file:
+            executed_tests = json.load(file)
+        test_class_run = [cls for cls in tests
+                          if all(unet not in f'{cls.__name__}' for unet in executed_tests)]
+        logger.info(f"Running unexecuted cases only: {test_class_run}")
     else:
-        for cls in tests:
-            if args.testname.lower() in cls.__name__.lower():
-                test_class_run.append(cls)
+        # Run all tests or selected ones
+        logger.info("Running all or selected cases")
+        test_class_run = []
+        if args.testname is None or len(args.testname.strip()) == 0:
+            test_class_run = tests
+        else:
+            for cls in tests:
+                if args.testname.lower() in cls.__name__.lower():
+                    test_class_run.append(cls)
+        logger.info(f"List of tests to run: {test_class_run}")
 
     if not test_class_run:
         available_tests = ', '.join(cls.__name__ for cls in tests)
         logger.info(f"Test '{args.testname}' not found. Available tests are: {available_tests}")
         raise TestNotFoundException(args.testname, available_tests)
-    
+
     errors = {}
+    executed_tests = []
     for test in test_class_run:
         logger.info(f"Running Test {test}")
         test_obj = test(fio_debug=args.fio_debug)
-        try:
-            test_obj.setup()
-            test_obj.run()
-        except Exception as exp:
-            logger.error(traceback.format_exc())
-            errors[f"{test.__name__}"] = [exp]
+
+        for attempt in range(args.retry):
+            try:
+                test_obj.setup()
+                executed_tests.append(test.__name__)
+                test_obj.run()
+                logger.info(f"Test {test.__name__} passed on attempt {attempt + 1}")
+                if f"{test.__name__}" in errors:
+                    del errors[f"{test.__name__}"]
+                break  # Test passed, no need for more retries
+            except Exception as exp:
+                logger.error(f"Attempt {attempt + 1} failed for test {test.__name__}")
+                logger.error(traceback.format_exc())
+                errors[f"{test.__name__}"] = [exp]
+
         try:
             test_obj.teardown()
-            # pass
         except Exception as _:
             logger.error(f"Error During Teardown for test: {test.__name__}")
             logger.error(traceback.format_exc())
-            # errors[f"{test.__name__}"].append(exp)
         finally:
             if check_for_dumps():
                 logger.info("Found a core dump during test execution. "
@@ -60,24 +107,33 @@ def main():
                 break
 
     failed_cases = list(errors.keys())
+
+    # Save failed cases for next run
+    if failed_cases:
+        with open(failed_cases_file, 'w') as file:
+            json.dump(failed_cases, file)
+    else:
+        if os.path.exists(failed_cases_file):
+            os.remove(failed_cases_file)  # Clear file if all tests passed
+
+    # Save executed cases for next run
+    if executed_tests:
+        with open(executed_cases_file, 'w') as file:
+            json.dump(executed_tests, file)
+    else:
+        if os.path.exists(executed_cases_file):
+            os.remove(executed_cases_file)  # Clear file if no tests were executed this run
+
     logger.info(f"Number of Total Cases: {len(test_class_run)}")
     logger.info(f"Number of Passed Cases: {len(test_class_run) - len(failed_cases)}")
    logger.info(f"Number of Failed Cases: {len(failed_cases)}")
-
-    logger.info("Test Wise run status:")
-    for test in test_class_run:
-        if test.__name__ not in failed_cases:
-            logger.info(f"{test.__name__} PASSED CASE.")
-        else:
-            logger.info(f"{test.__name__} FAILED CASE.")
-
     if errors:
         raise MultipleExceptions(errors)
-    
+

 def check_for_dumps():
-    """Validates whether core dumps present on machines
+    """Validates whether core dumps are present on machines
 
     Returns:
         bool: If there are core dumps or not
diff --git a/e2e/e2e_tests/single_node_failure.py b/e2e/e2e_tests/single_node_failure.py
index 9f0394935..9ca5b1aa7 100644
--- a/e2e/e2e_tests/single_node_failure.py
+++ b/e2e/e2e_tests/single_node_failure.py
@@ -65,6 +65,7 @@ def run(self):
             distr_ndcs=2,
             distr_npcs=1
         )
+        # raise Exception("Testing failure runs and retry runs")
         lvols = self.sbcli_utils.list_lvols()
         assert self.lvol_name in list(lvols.keys()), \
             f"Lvol {self.lvol_name} not present in list of lvols post add: {lvols}"
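
With the changes above, the retry flow can also be driven directly through e2e.py's new flags, outside the workflow. A rough usage sketch (the branch name `main` is only an illustrative value; the per-branch JSON state files live under ~/e2e_test_runs_fail_unexec_json, as created in e2e.py):

    # Full run; records executed and failed cases for the branch
    python3 e2e.py --retry 1 --branch main

    # Re-run only the previously failed cases, retrying each up to 2 times
    python3 e2e.py --failed_only --retry 2 --branch main

    # Run only the cases the previous run never executed (e.g. after an aborted run)
    python3 e2e.py --unexecuted_only --retry 1 --branch main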