
on behalf of @vitaly: rebased hot backup topology change #555

Draft · wants to merge 4 commits into main

6 changes: 4 additions & 2 deletions release_tester/arangodb/starter/deployments/activefailover.py
@@ -336,8 +336,10 @@ def jam_attempt_impl(self):
         args = ["--skip", "802_"]
         self.checkdata_args = args
         ret = curr_leader.arangosh.check_test_data(
-            "checking active failover new leader node", True, args, log_debug=True
-        )
+            "checking active failover new leader node",
+            True,
+            args,
+            log_debug=True)
         if not ret[0]:
             raise Exception("check data failed " + ret[1])

137 changes: 122 additions & 15 deletions release_tester/arangodb/starter/deployments/cluster.py
@@ -88,6 +88,7 @@ def __init__(
         if ver_found < len(versions):
             print("One deployment doesn't support starters with more nodes!")
             self.props.cluster_nodes = 3
+        self.backup_instance_count = self.props.cluster_nodes

     def starter_prepare_env_impl(self, sm=None, more_opts=None):
         # pylint: disable=invalid-name
@@ -146,7 +147,9 @@ def add_starter(name, port, opts, sm, hasAgency):
         self.create_tls_ca_cert()
         port = 9528
         count = 0
-        for this_node in list(range(1, self.props.cluster_nodes + 1)):
+        # we need 2 additional nodes for hotbackup testing
+        full_node_count = self.props.cluster_nodes + 2 if self.hot_backup else self.props.cluster_nodes
+        for this_node in list(range(1, full_node_count + 1)):
             node = []
             node_opts.append(node)
             if this_node != 1:
@@ -158,44 +163,48 @@ def add_starter(name, port, opts, sm, hasAgency):
add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
port += 100
count += 1
self.backup_instance_count = count
for instance in self.starter_instances:
instance.is_leader = True

def starter_run_impl(self):
lh.subsection("instance setup")
for manager in self.starter_instances:
for manager in self.starter_instances[: self.props.cluster_nodes]:
logging.info("Spawning instance")
manager.run_starter()

logging.info("waiting for the starters to become alive")
not_started = self.starter_instances[:] # This is a explicit copy
not_running = self.get_running_starters() # This is a explicit copy
count = 0
while not_started:
logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
if not_started[-1].is_instance_up():
not_started.pop()
while not_running:
logging.debug("waiting for mananger with logfile:" + str(not_running[-1].log_file))
if not_running[-1].is_instance_up():
not_running.pop()
progress(".")
time.sleep(1)
count += 1
if count > 120:
raise Exception("Cluster installation didn't come up in two minutes!")

logging.info("waiting for the cluster instances to become alive")
for node in self.starter_instances:
for node in self.get_running_starters():
node.detect_instances()
node.detect_instance_pids()
# self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))

logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
count = 0
for node in self.starter_instances:
for node in self.get_running_starters():
node.set_passvoid("cluster", count == 0)
count += 1
for node in self.get_not_running_starters():
node.set_passvoid("cluster", False)
self.passvoid = "cluster"
self.cfg.passvoid = self.passvoid
if self.new_cfg:
self.new_cfg.passvoid = self.passvoid

def finish_setup_impl(self):
self.makedata_instances = self.starter_instances[:]
self.makedata_instances = self.get_running_starters()
self.set_frontend_instances()

def _check_for_shards_in_sync(self):
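
The hunks above and below replace direct uses of `self.starter_instances` with `get_running_starters()` / `get_not_running_starters()`, whose definitions are not visible in this diff. A minimal sketch of what such helpers could look like, assuming each starter manager exposes some "was spawned" state (names and attributes here are illustrative, not necessarily the PR's actual implementation):

```python
# Hypothetical helpers -- the real definitions live outside the hunks shown here.
def get_running_starters(self):
    """Starter managers whose processes have been spawned."""
    return [s for s in self.starter_instances if s.is_running]

def get_not_running_starters(self):
    """Starter managers held in reserve, e.g. the two hot backup spares."""
    return [s for s in self.starter_instances if not s.is_running]
```

With the two spare starters created but never spawned in `starter_prepare_env_impl`, this split lets `starter_run_impl` boot only the first `cluster_nodes` starters and lets the hot backup test attach the spares later.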
@@ -234,10 +241,10 @@ def upgrade_arangod_version_impl(self):
         if self.cfg.stress_upgrade:
             bench_instances.append(self.starter_instances[0].launch_arangobench("cluster_upgrade_scenario_1"))
             bench_instances.append(self.starter_instances[1].launch_arangobench("cluster_upgrade_scenario_2"))
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.replace_binary_for_upgrade(self.new_installer.cfg)

-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.detect_instance_pids_still_alive()

         self.starter_instances[1].command_upgrade()
@@ -491,12 +498,12 @@ def jam_attempt_impl(self):
         # After attempt of jamming, we have peer for nodeX in setup.json.
         # This peer will break further updates because this peer is unavailable.
         # It is necessary to remove this peer from json for each starter instance
-        for instance in self.starter_instances:
+        for instance in self.get_running_starters():
             remove_node_x_from_json(instance.basedir)

     def shutdown_impl(self):
         ret = False
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             ret = ret or node.terminate_instance()
         logging.info("test ended")
         return ret
@@ -536,3 +543,103 @@ def generate_keyfile(self, keyfile):
"--host=localhost",
]
)

# pylint: disable=too-many-statements
@step
def test_hotbackup_impl(self):
"""test hotbackup feature: Cluster"""
with step("step 1: create a backup"):
backup_step_1 = self.create_backup_and_upload("thy_name_is_" + self.name)

with step("step 2: add new db server"):
old_servers = self.get_running_starters()
new_starter = self.get_not_running_starters()[-1]
new_starter.run_starter_and_wait()
self.backup_instance_count += 1
self.makedata_instances = self.get_running_starters()

with step("step 3: create a backup"):
backup_step_3 = self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server")

with step("step 4: remove old db server"):
self.remove_starter_dbserver(old_servers[0])

with step("step 5: create another backup"):
self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server_minus1_server", False)

with step("step 6: create non-backup data"):
self._check_for_shards_in_sync()
self.create_non_backup_data()
self.tcp_ping_all_nodes()

with step("step 7: download and restore backup from step 1"):
self.download_backup(backup_step_1)
self.validate_local_backup(backup_step_1)
backups = self.list_backup()
if backup_step_1 not in backups:
raise Exception("downloaded backup has different name? " + str(backups))
self.restore_backup(backup_step_1)
self.tcp_ping_all_nodes()

with step("step 8: check data"):
self.check_data_impl()
if not self.check_non_backup_data():
raise Exception("data created after backup is still there??")

with step("step 9: add new db server"):
new_starter2 = self.get_not_running_starters()[-1]
new_starter2.run_starter_and_wait()
self.backup_instance_count += 1
self.makedata_instances = self.get_running_starters()

with step("step 10: create non-backup data"):
self.create_non_backup_data()
self.tcp_ping_all_nodes()

with step("step 11: download and restore backup from step 3"):
self.download_backup(backup_step_3)
self.validate_local_backup(backup_step_3)
backups = self.list_backup()
if backup_step_3 not in backups:
raise Exception("downloaded backup has different name? " + str(backups))
self.restore_backup(backup_step_3)
self.tcp_ping_all_nodes()

with step("step 12: check data"):
self.check_data_impl()

with step("step 13: remove old db server"):
self.remove_starter_dbserver(old_servers[1])

with step("step 14: create non-backup data"):
self._check_for_shards_in_sync()
self.create_non_backup_data()
self.tcp_ping_all_nodes()

@step
def remove_starter_dbserver(self, starter):
"""remove dbserver managed by given starter from cluster"""
print("removing starter " + repr(starter))
terminated_dbserver_uuid = starter.get_dbserver().get_uuid()
starter.stop_dbserver()
self.remove_server_from_agency(terminated_dbserver_uuid)
self.backup_instance_count -= 1
self.makedata_instances = self.get_running_starters()

@step
def test_hotbackup_after_upgrade_impl(self):
"""test hotbackup after upgrade: cluster"""
with step("step 1: check data"):
self.check_data_impl()
with step("step 2: download backup"):
latest_backup = self.uploaded_backups[-1]
self.download_backup(latest_backup)
backups = self.list_backup()
if latest_backup not in backups:
raise Exception("downloaded backup has different name? " + str(backups))
with step("step 3: restore backup"):
self.restore_backup(latest_backup)
self.tcp_ping_all_nodes()
# we don't run checkdata after restore in this function, because it is ran afterwards by in runner.py
with step("step 4: delete backups"):
self.delete_all_backups()
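
`remove_starter_dbserver` delegates the actual cluster-membership change to `remove_server_from_agency`, which is not defined in this diff. In ArangoDB, removing a stopped DB-Server is normally done through the coordinator's `POST /_admin/cluster/removeServer` route; a sketch of what such a helper might look like, assuming a `requests` call plus hypothetical `coordinator_url` / `jwt_headers` attributes (the PR's real helper may differ):

```python
import requests

def remove_server_from_agency(self, server_uuid):
    """Hypothetical sketch: ask a coordinator to drop a stopped DB-Server."""
    resp = requests.post(
        self.coordinator_url + "/_admin/cluster/removeServer",
        json=server_uuid,  # the route expects the server ID as a JSON string
        headers=self.jwt_headers,  # the cluster uses JWT auth, see get_jwt_header()
        timeout=120,
    )
    if resp.status_code != 200:
        # e.g. 412 if the server is still in use, 404 if it is unknown
        raise Exception("removeServer failed: " + resp.text)
```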
4 changes: 3 additions & 1 deletion release_tester/arangodb/starter/deployments/cluster_perf.py
@@ -101,7 +101,9 @@ def __init__(
             runner_type,
             abort_on_error,
             installer_set,
-            RunnerProperties(rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6),
+            RunnerProperties(
+                rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6
+            ),
             selenium,
             selenium_driver_args,
             selenium_include_suites,
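
The remaining files get the same mechanical reformatting of their `RunnerProperties(...)` call; only the arguments differ. The class itself is defined outside this diff, so the following is a reconstruction inferred purely from the call sites, with guessed field names (a hypothetical sketch, not the PR's definition):

```python
from dataclasses import dataclass

# Field names here are guesses derived from call sites such as
# RunnerProperties(rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6)
# and RunnerProperties(rp, "none", 0, 1, False, 1).
@dataclass
class RunnerProperties:
    rp: object          # shared run-properties object, passed straight through
    name: str           # deployment label: "CLUSTER", "LeaderFollower", "none", ...
    size_a: int         # first numeric field (400, 0, ...); meaning not visible in this diff
    size_b: int         # second numeric field (600, 4500, ...); meaning not visible in this diff
    hot_backup: bool    # whether the deployment exercises hot backup
    node_count: int     # number of instances (6, 12, 2, 1)
```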
4 changes: 3 additions & 1 deletion release_tester/arangodb/starter/deployments/dc2dc.py
@@ -114,7 +114,9 @@ def __init__(
             runner_type,
             abort_on_error,
             installer_set,
-            RunnerProperties(rp, name, 0, 4500, True, 12),
+            RunnerProperties(
+                rp, name, 0, 4500, True, 12
+            ),
             selenium,
             selenium_driver_args,
             selenium_include_suites,
release_tester/arangodb/starter/deployments/leader_follower.py
@@ -36,7 +36,9 @@ def __init__(
             runner_type,
             abort_on_error,
             installer_set,
-            RunnerProperties(rp, "LeaderFollower", 400, 500, False, 2),
+            RunnerProperties(
+                rp, "LeaderFollower", 400, 500, False, 2
+            ),
             selenium,
             selenium_driver_args,
             selenium_include_suites,
4 changes: 3 additions & 1 deletion release_tester/arangodb/starter/deployments/none.py
@@ -24,7 +24,9 @@ def __init__(
             runner_type,
             abort_on_error,
             installer_set,
-            RunnerProperties(rp, "none", 0, 1, False, 1),
+            RunnerProperties(
+                rp, "none", 0, 1, False, 1
+            ),
             selenium,
             selenium_driver_args,
             selenium_include_suites,