From 500e70717f48df591ca790d4919750d2a7e79d14 Mon Sep 17 00:00:00 2001
From: tbordaz <tbordaz@redhat.com>
Date: Tue, 12 Dec 2023 12:57:31 +0100
Subject: [PATCH] Issue 5939 - During an update, if the target entry is
 reverted in the entry cache, the server should not retry to lock it (#6007)

Bug description:
	During an update if an BETXN plugin fails the full TXN is aborted and the DB
	returns to the previous state.
	However potential internal updates, done by BETXN plugins, are also applied
	on the entry cache.
	Even if the TXN is aborted some entries in the entry cache are left in a state
	that does not reflect the DB state.
	The fix https://pagure.io/389-ds-base/issue/50260 "reverts" those
	entries, setting their state to INVALID.

	A problem is that reverted entries stay in the entry cache, until refcnt is 0.
	During that period, an update targeting that entry fails to retrieve the
	entry from the entry cache and fails to add it again as it already exist
	the entry.
	The update iterates 1000 times, trying to read the entry and to fetch it
	from DB.
	This is a pure waste as the reverted entry stays too long.

	The signature of this issue is a message in the error log: "Retry count exceeded"

Fix description:
	The fix consiste in the loops (fetch on DN or NSUNIQUEID) to test if the
        entry state is INVALID.
	In such case it aborts the loop and return a failure.

relates: #5939

Reviewed by: Pierre Rogier, Simon Pichugin (Thanks !!)
---
 dirsrvtests/tests/suites/betxns/betxn_test.py | 86 +++++++++++++++++++
 ldap/servers/slapd/back-ldbm/cache.c          | 19 ++++
 ldap/servers/slapd/back-ldbm/findentry.c      | 25 +++++-
 .../servers/slapd/back-ldbm/proto-back-ldbm.h |  1 +
 4 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/dirsrvtests/tests/suites/betxns/betxn_test.py b/dirsrvtests/tests/suites/betxns/betxn_test.py
index d40fbafd73..76850a992c 100644
--- a/dirsrvtests/tests/suites/betxns/betxn_test.py
+++ b/dirsrvtests/tests/suites/betxns/betxn_test.py
@@ -445,6 +445,92 @@ def fin():
 
     request.addfinalizer(fin)
 
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_revert_cache_noloop(topology_st, request):
+    """Tests that when an entry is reverted, if an update
+    hit the reverted entry then the retry loop is aborted
+    and the update gets err=51
+    NOTE: this test requires a dynamic so that the two updates
+    occurs about the same time. If the test becomes fragile it is
+    okay to make it flaky
+
+    :id: 88ef0ba5-8c66-49e6-99c9-9e3f6183917f
+
+    :setup: Standalone instance
+
+    :steps:
+         1. Create a user account (with a homeDirectory)
+         2. Remove the error log file
+         3. Configure memberof to trigger a failure
+         4. Do in a loop 3 parallel updates (they will fail in
+            memberof plugin) and an updated on the reverted entry
+         5. Check that error log does contain entry cache reversion
+
+    :expectedresults:
+         1. Succeeds
+         2. Success
+         3. Succeeds
+         4. Succeeds
+         5. Succeeds
+    """
+    # Create an test user entry
+    log.info('Create a user without nsMemberOF objectclass')
+    try:
+        users = UserAccounts(topology_st.standalone, DEFAULT_SUFFIX, rdn='ou=people')
+        user = users.create_test_user()
+        user.replace('objectclass', ['top', 'account', 'person', 'inetorgperson', 'posixAccount', 'organizationalPerson', 'nsAccount'])
+    except ldap.LDAPError as e:
+        log.fatal('Failed to create test user: error ' + e.args[0]['desc'])
+        assert False
+
+    # Remove the current error log file
+    topology_st.standalone.stop()
+    lpath = topology_st.standalone.ds_error_log._get_log_path()
+    os.unlink(lpath)
+    topology_st.standalone.start()
+
+    # Prepare memberof so that it will fail during a next update
+    # If memberof plugin can not add 'memberof' to the
+    # member entry, it retries after adding
+    # 'memberOfAutoAddOC' objectclass to the member.
+    # If it fails again the plugin fails with 'object
+    # violation'
+    # To trigger this failure, set 'memberOfAutoAddOC'
+    # to a value that does *not* allow 'memberof' attribute
+    memberof = MemberOfPlugin(topology_st.standalone)
+    memberof.enable()
+    memberof.replace('memberOfAutoAddOC', 'account')
+    memberof.replace('memberofentryscope', DEFAULT_SUFFIX)
+    topology_st.standalone.restart()
+
+    for i in range(50):
+        # Try to add the user to demo_group
+        # It should fail because user entry has not 'nsmemberof' objectclass
+        # As this is a BETXN plugin that fails it should revert the entry cache
+        try:
+            GROUP_DN = "cn=demo_group,ou=groups,"  + DEFAULT_SUFFIX
+            topology_st.standalone.modify(GROUP_DN,
+                [(ldap.MOD_REPLACE, 'member', ensure_bytes(user.dn))])
+            topology_st.standalone.modify(GROUP_DN,
+                [(ldap.MOD_REPLACE, 'member', ensure_bytes(user.dn))])
+            topology_st.standalone.modify(GROUP_DN,
+                [(ldap.MOD_REPLACE, 'member', ensure_bytes(user.dn))])
+        except ldap.OBJECT_CLASS_VIOLATION:
+            pass
+
+        user.replace('cn', ['new_value'])
+
+    # Check that both a betxn failed and a reverted entry was
+    # detected during an update
+    assert topology_st.standalone.ds_error_log.match('.*WARN - flush_hash - Upon BETXN callback failure, entry cache is flushed during.*')
+    assert topology_st.standalone.ds_error_log.match('.*cache_is_reverted_entry - Entry reverted.*')
+
+    def fin():
+        user.delete()
+        memberof = MemberOfPlugin(topology_st.standalone)
+        memberof.replace('memberOfAutoAddOC', 'nsmemberof')
+
+    request.addfinalizer(fin)
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/slapd/back-ldbm/cache.c b/ldap/servers/slapd/back-ldbm/cache.c
index b62bab0af3..47c03e6539 100644
--- a/ldap/servers/slapd/back-ldbm/cache.c
+++ b/ldap/servers/slapd/back-ldbm/cache.c
@@ -1715,6 +1715,25 @@ cache_lock_entry(struct cache *cache, struct backentry *e)
     return 0;
 }
 
+int
+cache_is_reverted_entry(struct cache *cache, struct backentry *e)
+{
+    struct backentry *dummy_e;
+
+    cache_lock(cache);
+    if (find_hash(cache->c_idtable, &e->ep_id, sizeof(ID), (void **)&dummy_e)) {
+        if (dummy_e->ep_state & ENTRY_STATE_INVALID) {
+            slapi_log_err(SLAPI_LOG_WARNING, "cache_is_reverted_entry", "Entry reverted = %d (0x%lX)  [entry: 0x%lX] refcnt=%d\n", 
+                          dummy_e->ep_state,
+                          pthread_self(),
+                          dummy_e, dummy_e->ep_refcnt);
+            cache_unlock(cache);
+            return 1;
+        }
+    }
+    cache_unlock(cache);
+    return 0;
+}
 /* the opposite of above */
 void
 cache_unlock_entry(struct cache *cache __attribute__((unused)), struct backentry *e)
diff --git a/ldap/servers/slapd/back-ldbm/findentry.c b/ldap/servers/slapd/back-ldbm/findentry.c
index 58f484303d..7bb56ef2c4 100644
--- a/ldap/servers/slapd/back-ldbm/findentry.c
+++ b/ldap/servers/slapd/back-ldbm/findentry.c
@@ -98,6 +98,7 @@ find_entry_internal_dn(
     size_t tries = 0;
     int isroot = 0;
     int op_type;
+    int reverted_entry = 0;
 
     /* get the managedsait ldap message control */
     slapi_pblock_get(pb, SLAPI_MANAGEDSAIT, &managedsait);
@@ -141,12 +142,20 @@ find_entry_internal_dn(
          */
         slapi_log_err(SLAPI_LOG_ARGS, "find_entry_internal_dn",
                       "   Retrying (%s)\n", slapi_sdn_get_dn(sdn));
+        if (cache_is_reverted_entry(&inst->inst_cache, e)) {
+            reverted_entry = 1;
+            break;
+        }
         CACHE_RETURN(&inst->inst_cache, &e);
         tries++;
     }
     if (tries >= LDBM_CACHE_RETRY_COUNT) {
         slapi_log_err(SLAPI_LOG_ERR, "find_entry_internal_dn", "Retry count exceeded (%s)\n", slapi_sdn_get_dn(sdn));
     }
+    if (reverted_entry) {
+        slapi_send_ldap_result(pb, LDAP_BUSY, NULL, "target entry busy because of a canceled operation", 0, NULL);
+        return (NULL);
+    }
     /*
      * there is no such entry in this server. see how far we
      * can match, and check if that entry contains a referral.
@@ -262,6 +271,7 @@ find_entry_internal_uniqueid(
     struct backentry *e;
     int err;
     size_t tries = 0;
+    int reverted_entry = 0;
 
     while ((tries < LDBM_CACHE_RETRY_COUNT) &&
            (e = uniqueid2entry(be, uniqueid, txn, &err)) != NULL) {
@@ -283,6 +293,10 @@ find_entry_internal_uniqueid(
          */
         slapi_log_err(SLAPI_LOG_ARGS,
                       "   find_entry_internal_uniqueid", "Retrying; uniqueid = (%s)\n", uniqueid);
+        if (cache_is_reverted_entry(&inst->inst_cache, e)) {
+            reverted_entry = 1;
+            break;
+        }
         CACHE_RETURN(&inst->inst_cache, &e);
         tries++;
     }
@@ -292,9 +306,14 @@ find_entry_internal_uniqueid(
                       uniqueid);
     }
 
-    /* entry not found */
-    slapi_send_ldap_result(pb, (0 == err || DBI_RC_NOTFOUND == err) ? LDAP_NO_SUCH_OBJECT : LDAP_OPERATIONS_ERROR, NULL /* matched */, NULL,
-                           0, NULL);
+    if (reverted_entry) {
+        slapi_send_ldap_result(pb, LDAP_BUSY, NULL, "target entry busy because of a canceled operation", 0, NULL);
+        return (NULL);
+    } else {
+        /* entry not found */
+        slapi_send_ldap_result(pb, (0 == err || DBI_RC_NOTFOUND == err) ? LDAP_NO_SUCH_OBJECT : LDAP_OPERATIONS_ERROR, NULL /* matched */, NULL,
+                               0, NULL);
+    }
     slapi_log_err(SLAPI_LOG_TRACE,
                   "find_entry_internal_uniqueid", "<= not found; uniqueid = (%s)\n",
                   uniqueid);
diff --git a/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h b/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h
index 4d32a3f3b9..65603eccde 100644
--- a/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h
+++ b/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h
@@ -61,6 +61,7 @@ int cache_replace(struct cache *cache, void *oldptr, void *newptr);
 int cache_has_otherref(struct cache *cache, void *bep);
 int cache_is_in_cache(struct cache *cache, void *ptr);
 void revert_cache(ldbm_instance *inst, struct timespec *start_time);
+int cache_is_reverted_entry(struct cache *cache, struct backentry *e);
 
 #ifdef CACHE_DEBUG
 void check_entry_cache(struct cache *cache, struct backentry *e);