Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 6372 - Deadlock while doing online backup #6475

Merged
merged 2 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion dirsrvtests/tests/suites/backups/backup_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,20 @@
from datetime import datetime
from lib389._constants import DN_DM, PASSWORD, DEFAULT_SUFFIX, INSTALL_LATEST_CONFIG
from lib389.properties import BACKEND_SAMPLE_ENTRIES, TASK_WAIT
from lib389.topologies import topology_st as topo, topology_m2 as topo_m2
from lib389.topologies import topology_st as topo, topology_m2 as topo_m2, set_timeout
from lib389.backend import Backends, Backend
from lib389.dbgen import dbgen_users
from lib389.tasks import BackupTask, RestoreTask
from lib389.config import BDB_LDBMConfig
from lib389.idm.nscontainer import nsContainers
from lib389 import DSEldif
from lib389.utils import ds_is_older, get_default_db_lib
from lib389.replica import ReplicationManager
from threading import Thread, Event
import tempfile



pytestmark = pytest.mark.tier1

DEBUGGING = os.getenv("DEBUGGING", default=False)
Expand All @@ -36,6 +40,11 @@
logging.getLogger(__name__).setLevel(logging.INFO)
log = logging.getLogger(__name__)

# test_online_backup_and_dse_write may hang if 6372 is not fixed
# so lets use a shorter timeout
set_timeout(30*60)

event = Event()

BESTRUCT = [
{ "bename" : "be1", "suffix": "dc=be1", "nbusers": 1000 },
Expand Down Expand Up @@ -348,6 +357,38 @@ def test_backup_task_after_failure(mytopo):
assert exitCode == 0, "Backup failed. Issue #6229 may not be fixed."


def load_dse(inst):
conts = nsContainers(inst, 'cn=config')
while not event.is_set():
cont = conts.create(properties={'cn': 'test_online_backup_and_dse_write'})
cont.delete()


def test_online_backup_and_dse_write(topo):
"""Test online backup while attempting to add/delete entries in dse.ldif.

:id: 4a1edd2c-be15-11ef-8bc8-482ae39447e5
:setup: One standalone instance
:steps:
1. Start a thread that loops adding then removing in the dse.ldif
2. Perform 10 online backups
3. Stop the thread
:expectedresults:
1. Success
2. Success (or timeout if issue #6372 is not fixed.)
3. Success
"""
inst = topo.standalone
t = Thread(target=load_dse, args=[inst])
t.start()
for x in range(10):
with tempfile.TemporaryDirectory() as backup_dir:
assert inst.tasks.db2bak(backup_dir=f'{backup_dir}', args={TASK_WAIT: True}) == 0
event.set()
t.join()
event.clear()


if __name__ == '__main__':
# Run isolated
# -s for DEBUG mode
Expand Down
170 changes: 103 additions & 67 deletions ldap/servers/slapd/dse.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <pwd.h>
/* Needed to access read_config_dse */
#include "proto-slap.h"
#include <stdbool.h>

#include <unistd.h> /* provides fsync/close */

Expand Down Expand Up @@ -73,9 +74,6 @@
#define DSE_USE_LOCK 1
#define DSE_NO_LOCK 0

/* Global lock used during backups */
static PRLock *backup_lock = NULL;

struct dse_callback
{
int operation;
Expand Down Expand Up @@ -104,6 +102,8 @@ struct dse
/* initialize the dse */
int dse_is_updateable; /* if non-zero, this DSE can be written to */
int dse_readonly_error_reported; /* used to ensure that read-only errors are logged only once */
pthread_mutex_t dse_backup_lock; /* used to block write when online backup is in progress */
bool dse_backup_in_progress; /* tell that online backup is in progress (protected by dse_rwlock) */
};

struct dse_node
Expand Down Expand Up @@ -155,6 +155,91 @@ static int dse_write_entry(caddr_t data, caddr_t arg);
static int ldif_record_end(char *p);
static int dse_call_callback(struct dse *pdse, Slapi_PBlock *pb, int operation, int flags, Slapi_Entry *entryBefore, Slapi_Entry *entryAfter, int *returncode, char *returntext);

/* Lock the dse in read mode */
INLINE_DIRECTIVE static void
dse_lock_read(struct dse *pdse, int use_lock)
{
if (use_lock == DSE_USE_LOCK && pdse->dse_rwlock) {
slapi_rwlock_rdlock(pdse->dse_rwlock);
}
}

/* Lock the dse in write mode and wait until the */
INLINE_DIRECTIVE static void
dse_lock_write(struct dse *pdse, int use_lock)
{
if (use_lock != DSE_USE_LOCK || !pdse->dse_rwlock) {
return;
}
slapi_rwlock_wrlock(pdse->dse_rwlock);
while (pdse->dse_backup_in_progress) {
slapi_rwlock_unlock(pdse->dse_rwlock);
/* Wait util dse_backup_lock is unlocked */
pthread_mutex_lock(&pdse->dse_backup_lock);
pthread_mutex_unlock(&pdse->dse_backup_lock);
slapi_rwlock_wrlock(pdse->dse_rwlock);
}
}

/* release the dse lock */
INLINE_DIRECTIVE static void
dse_lock_unlock(struct dse *pdse, int use_lock)
{
if (use_lock == DSE_USE_LOCK && pdse->dse_rwlock) {
slapi_rwlock_unlock(pdse->dse_rwlock);
}
}

/* Call cb(pdse) */
INLINE_DIRECTIVE static void
dse_call_cb(void (*cb)(struct dse*))
{
Slapi_Backend *be = slapi_be_select_by_instance_name("DSE");
if (be) {
struct dse *pdse = NULL;
slapi_be_Rlock(be);
pdse = be->be_database->plg_private;
if (pdse) {
cb(pdse);
}
slapi_be_Unlock(be);
}
}

/* Helper for dse_backup_lock() */
static void
dse_backup_lock_cb(struct dse *pdse)
{
pthread_mutex_lock(&pdse->dse_backup_lock);
slapi_rwlock_wrlock(pdse->dse_rwlock);
pdse->dse_backup_in_progress = true;
slapi_rwlock_unlock(pdse->dse_rwlock);
}

/* Helper for dse_backup_unlock() */
static void
dse_backup_unlock_cb(struct dse *pdse)
{
slapi_rwlock_wrlock(pdse->dse_rwlock);
pdse->dse_backup_in_progress = false;
slapi_rwlock_unlock(pdse->dse_rwlock);
pthread_mutex_unlock(&pdse->dse_backup_lock);
}

/* Tells that a backup thread is starting */
void
dse_backup_lock()
{
dse_call_cb(dse_backup_lock_cb);
}

/* Tells that a backup thread is ending */
void
dse_backup_unlock()
{
dse_call_cb(dse_backup_unlock_cb);
}

/*
* Map a DN onto a dse_node.
* Returns NULL if not found.
Expand Down Expand Up @@ -192,18 +277,12 @@ dse_get_entry_copy(struct dse *pdse, const Slapi_DN *dn, int use_lock)
Slapi_Entry *e = NULL;
struct dse_node *n;

if (use_lock == DSE_USE_LOCK && pdse->dse_rwlock) {
slapi_rwlock_rdlock(pdse->dse_rwlock);
}

dse_lock_read(pdse, use_lock);
n = dse_find_node(pdse, dn);
if (n != NULL) {
e = slapi_entry_dup(n->entry);
}

if (use_lock == DSE_USE_LOCK && pdse->dse_rwlock) {
slapi_rwlock_unlock(pdse->dse_rwlock);
}
dse_lock_unlock(pdse, use_lock);

return e;
}
Expand Down Expand Up @@ -393,6 +472,7 @@ dse_new(char *filename, char *tmpfilename, char *backfilename, char *startokfile
pdse->dse_callback = NULL;
pdse->dse_is_updateable = dse_permission_to_write(pdse,
SLAPI_LOG_TRACE);
pthread_mutex_init(&pdse->dse_backup_lock, NULL);
}
slapi_ch_free((void **)&realconfigdir);
}
Expand Down Expand Up @@ -429,8 +509,7 @@ dse_destroy(struct dse *pdse)
if (NULL == pdse) {
return 0; /* no one checks this return value */
}
if (pdse->dse_rwlock)
slapi_rwlock_wrlock(pdse->dse_rwlock);
dse_lock_write(pdse, DSE_USE_LOCK);
slapi_ch_free((void **)&(pdse->dse_filename));
slapi_ch_free((void **)&(pdse->dse_tmpfile));
slapi_ch_free((void **)&(pdse->dse_fileback));
Expand All @@ -439,8 +518,8 @@ dse_destroy(struct dse *pdse)
dse_callback_deletelist(&pdse->dse_callback);
charray_free(pdse->dse_filelist);
nentries = avl_free(pdse->dse_tree, dse_internal_delete_entry);
dse_lock_unlock(pdse, DSE_USE_LOCK);
if (pdse->dse_rwlock) {
slapi_rwlock_unlock(pdse->dse_rwlock);
slapi_destroy_rwlock(pdse->dse_rwlock);
}
slapi_ch_free((void **)&pdse);
Expand Down Expand Up @@ -928,9 +1007,7 @@ dse_check_for_readonly_error(Slapi_PBlock *pb, struct dse *pdse)
{
int rc = 0; /* default: no error */

if (pdse->dse_rwlock)
slapi_rwlock_rdlock(pdse->dse_rwlock);

dse_lock_read(pdse, DSE_USE_LOCK);
if (!pdse->dse_is_updateable) {
if (!pdse->dse_readonly_error_reported) {
if (NULL != pdse->dse_filename) {
Expand All @@ -944,9 +1021,7 @@ dse_check_for_readonly_error(Slapi_PBlock *pb, struct dse *pdse)
}
rc = 1; /* return an error to the client */
}

if (pdse->dse_rwlock)
slapi_rwlock_unlock(pdse->dse_rwlock);
dse_lock_unlock(pdse, DSE_USE_LOCK);

if (rc != 0) {
slapi_send_ldap_result(pb, LDAP_UNWILLING_TO_PERFORM, NULL,
Expand All @@ -973,8 +1048,6 @@ dse_write_file_nolock(struct dse *pdse)
fpw.fpw_rc = 0;
fpw.fpw_prfd = NULL;

dse_backup_lock();

if (NULL != pdse->dse_filename) {
if ((fpw.fpw_prfd = PR_Open(pdse->dse_tmpfile, PR_RDWR | PR_CREATE_FILE | PR_TRUNCATE, SLAPD_DEFAULT_DSE_FILE_MODE)) == NULL) {
rc = PR_GetOSError();
Expand Down Expand Up @@ -1107,8 +1180,7 @@ dse_add_entry_pb(struct dse *pdse, Slapi_Entry *e, Slapi_PBlock *pb)
slapi_pblock_get(pb, SLAPI_DSE_MERGE_WHEN_ADDING, &merge);

/* keep write lock during both tree update and file write operations */
if (pdse->dse_rwlock)
slapi_rwlock_wrlock(pdse->dse_rwlock);
dse_lock_write(pdse, DSE_USE_LOCK);
if (merge) {
rc = avl_insert(&(pdse->dse_tree), n, entry_dn_cmp, dupentry_merge);
} else {
Expand All @@ -1131,8 +1203,7 @@ dse_add_entry_pb(struct dse *pdse, Slapi_Entry *e, Slapi_PBlock *pb)
} else { /* duplicate entry ignored */
dse_node_delete(&n); /* This also deletes the contained entry */
}
if (pdse->dse_rwlock)
slapi_rwlock_unlock(pdse->dse_rwlock);
dse_lock_unlock(pdse, DSE_USE_LOCK);

if (rc == -1) {
/* duplicate entry ignored */
Expand Down Expand Up @@ -1299,8 +1370,7 @@ dse_replace_entry(struct dse *pdse, Slapi_Entry *e, int write_file, int use_lock
int rc = -1;
if (NULL != e) {
struct dse_node *n = dse_node_new(e);
if (use_lock && pdse->dse_rwlock)
slapi_rwlock_wrlock(pdse->dse_rwlock);
dse_lock_write(pdse, use_lock);
rc = avl_insert(&(pdse->dse_tree), n, entry_dn_cmp, dupentry_replace);
if (write_file)
dse_write_file_nolock(pdse);
Expand All @@ -1310,8 +1380,7 @@ dse_replace_entry(struct dse *pdse, Slapi_Entry *e, int write_file, int use_lock
dse_node_delete(&n);
rc = 0; /* for return to caller */
}
if (use_lock && pdse->dse_rwlock)
slapi_rwlock_unlock(pdse->dse_rwlock);
dse_lock_unlock(pdse, use_lock);
}
return rc;
}
Expand Down Expand Up @@ -1398,8 +1467,7 @@ dse_delete_entry(struct dse *pdse, Slapi_PBlock *pb, const Slapi_Entry *e)
slapi_pblock_get(pb, SLAPI_DSE_DONT_WRITE_WHEN_ADDING, &dont_write_file);

/* keep write lock for both tree deleting and file writing */
if (pdse->dse_rwlock)
slapi_rwlock_wrlock(pdse->dse_rwlock);
dse_lock_write(pdse, DSE_USE_LOCK);
if ((deleted_node = (struct dse_node *)avl_delete(&pdse->dse_tree, n, entry_dn_cmp))) {
dse_node_delete(&deleted_node);
}
Expand All @@ -1411,8 +1479,7 @@ dse_delete_entry(struct dse *pdse, Slapi_PBlock *pb, const Slapi_Entry *e)
SLAPI_OPERATION_DELETE);
dse_write_file_nolock(pdse);
}
if (pdse->dse_rwlock)
slapi_rwlock_unlock(pdse->dse_rwlock);
dse_lock_unlock(pdse, DSE_USE_LOCK);

return 1;
}
Expand Down Expand Up @@ -1574,11 +1641,9 @@ do_dse_search(struct dse *pdse, Slapi_PBlock *pb, int scope, const Slapi_DN *bas
* entries that change, we skip looking through the DSE entries.
*/
if (pb_op == NULL || !operation_is_flag_set(pb_op, OP_FLAG_PS_CHANGESONLY)) {
if (pdse->dse_rwlock)
slapi_rwlock_rdlock(pdse->dse_rwlock);
dse_lock_read(pdse, DSE_USE_LOCK);
dse_apply_nolock(pdse, dse_search_filter_entry, (caddr_t)&stuff);
if (pdse->dse_rwlock)
slapi_rwlock_unlock(pdse->dse_rwlock);
dse_lock_unlock(pdse, DSE_USE_LOCK);
}

if (stuff.ss) /* something was found which matched our criteria */
Expand Down Expand Up @@ -2925,32 +2990,3 @@ dse_next_search_entry(Slapi_PBlock *pb)
return 0;
}

/* When a backup is occurring we can not allow the writing the dse.ldif file */
void
dse_init_backup_lock()
{
backup_lock = PR_NewLock();
}

void
dse_destroy_backup_lock()
{
PR_DestroyLock(backup_lock);
backup_lock = NULL;
}

void
dse_backup_lock()
{
if (backup_lock) {
PR_Lock(backup_lock);
}
}

void
dse_backup_unlock()
{
if (backup_lock) {
PR_Unlock(backup_lock);
}
}
3 changes: 0 additions & 3 deletions ldap/servers/slapd/libglobs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2033,9 +2033,6 @@ FrontendConfig_init(void)
/* Done, unlock! */
CFG_UNLOCK_WRITE(cfg);

/* init the dse file backup lock */
dse_init_backup_lock();

init_config_get_and_set();
}

Expand Down
1 change: 0 additions & 1 deletion ldap/servers/slapd/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,6 @@ main(int argc, char **argv)
slapd_ssl_destroy();
ndn_cache_destroy();
NSS_Shutdown();
dse_destroy_backup_lock();

/*
* Server has stopped, lets force everything to disk: logs
Expand Down
Loading
Loading