Skip to content

Commit

Permalink
Issue 6032 - Replication broken after backup restore (#6035)
Browse files Browse the repository at this point in the history
Replication is broken after doing an offline backup then later on an online or offline restore
Note: with online backup changelog is discarded at restore time (because it has no purge RUV)
In fact there are multiple causes:
[1] _cl5CICbInit wrongly builds the changelog RUVs, so the changelog is recreated
[2] Changelog is not cleared when it is "recreated" because of a wrong test in dbmdb_back_ctrl
[3] The replication keep alive entry gets created before the replica gets back in sync. This creates missing CSNs.
Solution:
[1] Fix _cl5CICbInit to get the csn from the changelog record key and properly store the min and max in the context.
[2] Replace the invalid test with a proper one.
[3] Change the keep alive update starting delay from 2 seconds to 10 minutes (i.e. twice the maximum backoff timeout)
to give the other supplier a chance to replay the missing changes.
Also added/modified some log messages emitted when replication logging is enabled
Note: this is a partial fix, as a proper "resync after db reload" is not handled, so this leaves some issues (typically because
of plugin internal operations like the memberof plugin, or if there are lots of changes to replay), but at least it is enough for the CI test ...

Issue: #6032

Reviewed by: @droideck, @tbordaz (Thanks!)

(cherry picked from commit 9e595d4)
  • Loading branch information
progier389 committed Jan 19, 2024
1 parent c4b7edf commit 109d4c9
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 30 deletions.
63 changes: 62 additions & 1 deletion dirsrvtests/tests/suites/backups/backup_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@
import logging
import pytest
import os
import shutil
from datetime import datetime
from lib389._constants import DEFAULT_SUFFIX, INSTALL_LATEST_CONFIG
from lib389.properties import BACKEND_SAMPLE_ENTRIES, TASK_WAIT
from lib389.topologies import topology_st as topo
from lib389.topologies import topology_st as topo, topology_m2 as topo_m2
from lib389.backend import Backend
from lib389.tasks import BackupTask, RestoreTask
from lib389.config import BDB_LDBMConfig
from lib389 import DSEldif
from lib389.utils import ds_is_older, get_default_db_lib
from lib389.replica import ReplicationManager
import tempfile

pytestmark = pytest.mark.tier1
Expand Down Expand Up @@ -106,6 +108,65 @@ def test_db_home_dir_online_backup(topo):
topo.standalone.tasks.db2bak(backup_dir=f'{backup_dir}', args={TASK_WAIT: True})
assert topo.standalone.ds_error_log.match(f".*Failed renaming {backup_dir}.bak back to {backup_dir}")

def test_replication(topo_m2):
    """Test that replication still works after a supplier's database is
    destroyed and then restored from an offline backup.

    :id: 9c826d36-b17d-11ee-855f-482ae39447e5
    :setup: Two suppliers
    :steps:
        1. Perform backup on S1
        2. Perform changes on both suppliers
        3. Wait until replication is in sync
        4. Stop S1
        5. Destroy S1 database
        6. Start S1
        7. Restore S1 from backup
        8. Wait until replication is in sync
    :expectedresults:
        1. Success
        2. Success
        3. Success
        4. Success
        5. Success
        6. Success
        7. Success
        8. Success
    """
    S1 = topo_m2.ms["supplier1"]
    S2 = topo_m2.ms["supplier2"]
    repl = ReplicationManager(DEFAULT_SUFFIX)

    # The backup lives in a temporary directory that is removed when the
    # block exits, so every step that needs the backup must stay inside it.
    with tempfile.TemporaryDirectory(dir=S1.ds_paths.backup_dir) as backup_dir:
        # Step 1: Perform backup on S1
        # Use the offline method to have a cleanly stopped state in changelog.
        S1.stop()
        assert S1.db2bak(backup_dir)
        S1.start()

        # Steps 2 and 3: Perform changes on both suppliers and wait until
        # replication is in sync.
        # Note: wait_for_replication performs the changes itself.
        repl.wait_for_replication(S1, S2)
        repl.wait_for_replication(S2, S1)

        # Step 4: Stop S1
        S1.stop()

        # Step 5: Destroy S1 database
        # mdb keeps the data in a single file; otherwise remove the
        # userRoot backend directory.
        if get_default_db_lib() == "mdb":
            os.remove(os.path.join(S1.ds_paths.db_dir, 'data.mdb'))
        else:
            shutil.rmtree(os.path.join(S1.ds_paths.db_dir, 'userRoot'))

        # Step 6: Start S1
        S1.start()

        # Step 7: Restore S1 from backup (online restore task)
        rc = S1.tasks.bak2db(backup_dir=backup_dir, args={TASK_WAIT: True})
        assert rc == 0

        # Step 8: Wait until replication is in sync
        # Must replicate first from S2 to S1 to resync S1
        repl.wait_for_replication(S2, S1)
        # To help to diagnose test failure, you may want to look first at:
        # grep -E 'Database RUV|replica_reload_ruv|task_restore_thread|_cl5ConstructRUVs' /var/log/dirsrv/slapd-supplier1/errors
        repl.wait_for_replication(S1, S2)


if __name__ == '__main__':
# Run isolated
Expand Down
24 changes: 12 additions & 12 deletions ldap/servers/plugins/replication/cl5_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -2589,7 +2589,7 @@ _cl5CICbInit(dbi_val_t *key, dbi_val_t *data, DBLCI_CTX *dblcictx)
return DBI_RC_SUCCESS;
}
/* Update last csn */
csn_init_by_string(&dblcictx->csn, data->data);
csn_init_by_string(&dblcictx->csn, key->data);
if (_cl5CIEventCheckTxnEnd(&dblcictx->seen) ||
_cl5CIEventCheckTxnEnd(&dblcictx->changed)) {
/*
Expand Down Expand Up @@ -3008,7 +3008,6 @@ _cl5GenRUVInfo(dbi_val_t *key, dbi_val_t *data, void *ctx)
DBLCI_CTX *dblcictx = ctx;
ReplicaId rid = 0;
RID_INFO *ridinfo = NULL;
CSN csn = {0};
int rc = _cl5CICbInit(key, data, dblcictx);
if (rc != DBI_RC_SUCCESS) {
return rc;
Expand All @@ -3030,9 +3029,9 @@ _cl5GenRUVInfo(dbi_val_t *key, dbi_val_t *data, void *ctx)
ridinfo = _cl5GetRidInfo(dblcictx, rid, PR_TRUE);
if (ridinfo->new == 1) {
ridinfo->new = 0;
ridinfo->mincsn = csn;
ridinfo->mincsn = dblcictx->csn;
}
ridinfo->maxcsn = csn;
ridinfo->maxcsn = dblcictx->csn;
return DBI_RC_SUCCESS;
}

Expand All @@ -3045,18 +3044,19 @@ _cl5ConstructRUVs (cldb_Handle *cldb)
char mincsnstr[CSN_STRSIZE] = "";
char maxcsnstr[CSN_STRSIZE] = "";
int rc = ruv_init_new(cldb->ident, 0, NULL, &cldb->purgeRUV);
const char * bename = cldb->be ? cldb->be->be_name : "?" ;

if (rc != RUV_SUCCESS) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name_cl, "_cl5ConstructRUVs - "
"Failed to initialize purges RUV for file %s; ruv error - %d\n",
cldb->ident, rc);
"Failed to initialize purges RUV for %s changelog in backend %s; ruv error - %d\n",
cldb->ident, bename, rc);
return CL5_RUV_ERROR;
}
rc = ruv_init_new(cldb->ident, 0, NULL, &cldb->maxRUV);
if (rc != RUV_SUCCESS) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name_cl, "_cl5ConstructRUVs - "
"Failed to initialize upper bound RUV for file %s; ruv error - %d\n",
cldb->ident, rc);
"Failed to initialize upper bound RUV for %s changelog in backend %s; ruv error - %d\n",
cldb->ident, bename, rc);
return CL5_RUV_ERROR;
}

Expand All @@ -3073,21 +3073,21 @@ _cl5ConstructRUVs (cldb_Handle *cldb)
/* Now that we have the min and max csn for each rids, it is time to update the RUVs */
rc = CL5_SUCCESS;
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name_cl, "_cl5ConstructRUVs - "
"Found %d replicas in %s changelog file.\n",
dblcictx.nb_rids, cldb->ident);
"Found %d replicas in %s changelog in backend %s.\n",
dblcictx.nb_rids, cldb->ident, bename);
for (size_t i=0; i<dblcictx.nb_rids; i++) {
rc = ruv_set_csns(cldb->maxRUV, &dblcictx.rids[i].maxcsn, NULL);
if (rc != RUV_SUCCESS) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name_cl, "_cl5ConstructRUVs - "
"Failed to update upper bound RUV for file %s; ruv error - %d\n",
"Failed to update upper bound RUV for %s changelog; ruv error - %d\n",
cldb->ident, rc);
rc = CL5_DB_ERROR;
break;
}
rc = ruv_set_csns(cldb->purgeRUV, &dblcictx.rids[i].mincsn, NULL);
if (rc != RUV_SUCCESS) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name_cl, "_cl5ConstructRUVs - "
"Failed to update purge RUV for file %s; ruv error - %d\n",
"Failed to update purge RUV for %s changelog; ruv error - %d\n",
cldb->ident, rc);
rc = CL5_DB_ERROR;
break;
Expand Down
13 changes: 12 additions & 1 deletion ldap/servers/plugins/replication/repl5_replica.c
Original file line number Diff line number Diff line change
Expand Up @@ -1563,8 +1563,9 @@ replica_set_enabled(Replica *r, PRBool enable)
}
/* create supplier update event */
if (r->repl_eqcxt_ka_update == NULL && replica_get_type(r) == REPLICA_TYPE_UPDATABLE) {
/* Should not create local update before the replica get a chance to resync after a restore/import */
r->repl_eqcxt_ka_update = slapi_eq_repeat_rel(replica_subentry_update, r,
slapi_current_rel_time_t() + START_UPDATE_DELAY,
slapi_current_rel_time_t() + 2*PROTOCOL_BACKOFF_MAXIMUM,
1000 * replica_get_keepalive_update_interval(r));
}
} else /* disable */
Expand Down Expand Up @@ -1655,6 +1656,16 @@ replica_reload_ruv(Replica *r)
!ruv_covers_ruv(upper_bound_ruv, new_ruv)) {

/* We can't use existing changelog - remove existing file */
ruv_dump(new_ruv, "replica_reload_ruv database RUV", NULL);
ruv_dump(upper_bound_ruv, "replica_reload_ruv changelog RUV", NULL);
if (!ruv_covers_ruv(new_ruv, upper_bound_ruv)) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "replica_reload_ruv - "
"changelog contains changes that are not in the databae.\n");
}
if (!ruv_covers_ruv(upper_bound_ruv, new_ruv)) {
slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "replica_reload_ruv - "
"database contains changes that are not in the changelog.\n");
}
slapi_log_err(SLAPI_LOG_WARNING, repl_plugin_name, "replica_reload_ruv - "
"New data for replica %s does not match the data in the changelog.\n "
"Recreating the changelog file. This could affect replication with replica's "
Expand Down
29 changes: 13 additions & 16 deletions ldap/servers/slapd/back-ldbm/db-mdb/mdb_layer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1440,23 +1440,20 @@ dbmdb_back_ctrl(Slapi_Backend *be, int cmd, void *info)
struct ldbminfo *li = (struct ldbminfo *)be->be_database->plg_private;

ldbm_instance *inst = (ldbm_instance *) be->be_instance_info;
if (li) {
dblayer_private *priv = (dblayer_private *)li->li_dblayer_private;
if (priv && priv->dblayer_env) {
char *instancedir;
dbmdb_dbi_t *dbi = NULL;

slapi_back_get_info(be, BACK_INFO_INSTANCE_DIR, (void **)&instancedir);
rc = dbmdb_open_dbi_from_filename(&dbi, be, BDB_CL_FILENAME, NULL, 0);
if (rc == MDB_NOTFOUND) {
/* Nothing to do */
rc = 0;
} else if (rc == 0) {
rc = dbmdb_dbi_remove(MDB_CONFIG(li), (dbi_db_t**)&dbi);
}
inst->inst_changelog = NULL;
slapi_ch_free_string(&instancedir);
if (li && MDB_CONFIG(li)) {
char *instancedir;
dbmdb_dbi_t *dbi = NULL;

slapi_back_get_info(be, BACK_INFO_INSTANCE_DIR, (void **)&instancedir);
rc = dbmdb_open_dbi_from_filename(&dbi, be, BDB_CL_FILENAME, NULL, 0);
if (rc == MDB_NOTFOUND) {
/* Nothing to do */
rc = 0;
} else if (rc == 0) {
rc = dbmdb_dbi_remove(MDB_CONFIG(li), (dbi_db_t**)&dbi);
}
inst->inst_changelog = NULL;
slapi_ch_free_string(&instancedir);
}
break;
}
Expand Down

0 comments on commit 109d4c9

Please sign in to comment.