Skip to content

Commit

Permalink
Issue 5954 - Disable Transparent Huge Pages
Browse files Browse the repository at this point in the history
Bug Description:
THP can have negative effects on DS performance when large caches are
used.

Fix Description:
* Add a new environment variable, THP_DISABLE, for `ns-slapd`.
  When THP_DISABLE is set to 1, THP is disabled for the `ns-slapd` process
  via `prctl(2)`. With any other value, THP settings are left untouched.

Before:
```
$ grep THP /proc/$(pidof ns-slapd)/status
THP_enabled:    1
```

After
```
$ grep THP /proc/$(pidof ns-slapd)/status
THP_enabled:    0
```

* Add a new healthcheck linter that checks whether THP is disabled system-wide
  or per instance. If THP is enabled for both the system and the
  process, it prints recommendations on how to disable THP.

Fixes: #5954

Reviewed-by: @tbordaz, @Firstyear, @droideck (Thank you all!)
  • Loading branch information
vashirov committed Dec 11, 2023
1 parent 382003b commit dd03a50
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 1 deletion.
148 changes: 148 additions & 0 deletions dirsrvtests/tests/suites/healthcheck/health_tunables_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# --- BEGIN COPYRIGHT BLOCK ---
# Copyright (C) 2023 Red Hat, Inc.
# All rights reserved.
#
# License: GPL (version 3 or any later version).
# See LICENSE for details.
# --- END COPYRIGHT BLOCK ---
#

import subprocess
import pytest
import re
import os
from lib389.utils import *
from lib389.cli_base import FakeArgs
from lib389.topologies import topology_st
from lib389.cli_ctl.health import health_check_run
from lib389.paths import Paths

CMD_OUTPUT = 'No issues found.'
JSON_OUTPUT = '[]'
RET_CODE = 'DSTHPLE0001'

log = logging.getLogger(__name__)
p = Paths()


def run_healthcheck_and_flush_log(topology, instance, searched_code=None, json=False, searched_code2=None,
                                  list_checks=False, list_errors=False, check=None, searched_list=None):
    """Run the healthcheck for ``instance`` and assert on the captured output.

    The healthcheck is executed with a ``FakeArgs`` namespace built from the
    keyword arguments. Afterwards the log capture of ``topology`` must contain
    every entry of ``searched_list`` when one is given, otherwise
    ``searched_code``; ``searched_code2`` is additionally required when it is
    not None. The log capture is flushed before returning.
    """
    args = FakeArgs()
    # Same attributes, in the same order, as a real CLI invocation would carry.
    for attr_name, attr_value in (
        ('instance', instance.serverid),
        ('verbose', instance.verbose),
        ('list_errors', list_errors),
        ('list_checks', list_checks),
        ('check', check),
        ('dry_run', False),
        ('json', json),
    ):
        setattr(args, attr_name, attr_value)

    log.info('Use healthcheck with --json == {} option'.format(json))
    health_check_run(instance, topology.logcap.log, args)

    if searched_list is None:
        assert topology.logcap.contains(searched_code)
        log.info('Healthcheck returned searched code: %s' % searched_code)
    else:
        for expected_item in searched_list:
            assert topology.logcap.contains(expected_item)
            log.info('Healthcheck returned searched item: %s' % expected_item)

    if searched_code2 is not None:
        assert topology.logcap.contains(searched_code2)
        log.info('Healthcheck returned searched code: %s' % searched_code2)

    log.info('Clear the log')
    topology.logcap.flush()


def _set_thp_system_mode(mode):
    """Write ``mode`` into the system-wide THP ``enabled`` sysfs file."""
    with open('/sys/kernel/mm/transparent_hugepage/enabled', 'w') as sysfs_enabled:
        log.info(f"Setting THP mode to {mode}")
        sysfs_enabled.write(mode)


def _set_thp_instance_mode(inst, disable: bool):
    """Set THP_DISABLE for the instance through a systemd drop-in and restart it.

    A ``thp.conf`` drop-in carrying ``THP_DISABLE=<0|1>`` is written for the
    ``dirsrv@<serverid>`` unit, systemd is reloaded so the drop-in is picked up,
    and the instance is restarted.

    :param inst: instance whose ``serverid`` names the unit; it is restarted
    :param disable: True writes THP_DISABLE=1, False writes THP_DISABLE=0
    """
    unit_override = "[Service]\n" + f"Environment=THP_DISABLE={int(disable)}"
    override_dir = f"/etc/systemd/system/dirsrv@{inst.serverid}.service.d/"
    os.makedirs(override_dir, exist_ok=True)
    override_file = os.path.join(override_dir, "thp.conf")
    with open(override_file, 'w') as conf:
        conf.write(unit_override)
    subprocess.run(['systemctl', 'daemon-reload'], check=True)
    inst.restart()


def _get_thp_system_mode():
    """Return the active system-wide THP mode.

    The sysfs ``enabled`` file is read and the value enclosed in square
    brackets is extracted, logged and returned.
    """
    with open('/sys/kernel/mm/transparent_hugepage/enabled', 'r') as sysfs_enabled:
        contents = sysfs_enabled.read().strip()
    bracketed = re.search(r'\[([^\]]+)\]', contents)
    mode = bracketed[1]
    log.info(f"Current THP mode is {mode}")
    return mode


@pytest.fixture(scope="function")
def thp_reset(request):
    """Remember the system-wide THP mode and restore it when the test finishes."""
    saved_mode = _get_thp_system_mode()
    # Registered as a finalizer so the mode is put back even if the test fails.
    request.addfinalizer(lambda: _set_thp_system_mode(saved_mode))


@pytest.mark.skipif(get_user_is_root() is False,
                    reason="This test requires root permissions to change kernel tunables")
@pytest.mark.skipif(p.with_systemd is False, reason='Needs systemd to run')
@pytest.mark.skipif(ds_is_older("2.3.7"), reason="Not implemented")
@pytest.mark.parametrize("system_thp_mode,instance_thp_mode,expected_output",
                         [("always", False, (RET_CODE, RET_CODE)),
                          ("always", True, (CMD_OUTPUT, JSON_OUTPUT)),
                          ("never", False, (CMD_OUTPUT, JSON_OUTPUT)),
                          ("never", True, (CMD_OUTPUT, JSON_OUTPUT))],
                         ids=["System and Instance THP ON",
                              "System THP ON, Instance THP OFF",
                              "System THP OFF, Instance THP ON",
                              "System THP OFF, Instance THP OFF"])
@pytest.mark.usefixtures("thp_reset")
def test_healthcheck_transparent_huge_pages(topology_st, system_thp_mode, instance_thp_mode, expected_output):
    """Check if HealthCheck returns DSTHPLE0001 code

    :id: 1f195e10-6403-4c92-8ac9-724b669e8cf2
    :setup: Standalone instance
    :parametrized: yes
    :steps:
        1. Enable THP system wide and for the instance
        2. Use HealthCheck without --json option
        3. Use HealthCheck with --json option
        4. Enable THP system wide, disable THP for the instance
        5. Use HealthCheck without --json option
        6. Use HealthCheck with --json option
        7. Disable THP system wide, enable THP for the instance
        8. Use HealthCheck without --json option
        9. Use HealthCheck with --json option
        10. Disable THP system wide, disable THP for the instance
        11. Use HealthCheck without --json option
        12. Use HealthCheck with --json option
    :expectedresults:
        1. Success
        2. HealthCheck should return code DSTHPLE0001
        3. HealthCheck should return code DSTHPLE0001
        4. Success
        5. HealthCheck reports no issue found
        6. HealthCheck reports no issue found
        7. Success
        8. HealthCheck reports no issue found
        9. HealthCheck reports no issue found
        10. Success
        11. HealthCheck reports no issue found
        12. HealthCheck reports no issue found
    """
    inst = topology_st.standalone
    inst.config.set("nsslapd-accesslog-logbuffering", "on")

    _set_thp_system_mode(system_thp_mode)
    # instance_thp_mode is forwarded as the helper's `disable` flag:
    # True means THP is turned OFF for the instance.
    _set_thp_instance_mode(inst, instance_thp_mode)

    # expected_output holds (plain-text expectation, JSON expectation).
    expected_plain, expected_json = expected_output[0], expected_output[1]
    run_healthcheck_and_flush_log(topology_st, inst, expected_plain, json=False)
    run_healthcheck_and_flush_log(topology_st, inst, expected_json, json=True)
5 changes: 4 additions & 1 deletion dirsrvtests/tests/suites/healthcheck/healthcheck_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ def test_healthcheck_list_checks(topology_st):
'replication:conflicts',
'dseldif:nsstate',
'tls:certificate_expiration',
'logs:notes']
'logs:notes',
'tunables:thp',
]

standalone = topology_st.standalone

Expand Down Expand Up @@ -205,6 +207,7 @@ def test_healthcheck_list_errors(topology_st):
'DSSKEWLE0001 :: Medium time skew',
'DSSKEWLE0002 :: Major time skew',
'DSSKEWLE0003 :: Extensive time skew',
'DSTHPLE0001 :: Transparent Huge Pages',
'DSVIRTLE0001 :: Virtual attribute indexed']

standalone = topology_st.standalone
Expand Down
12 changes: 12 additions & 0 deletions ldap/servers/slapd/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ union semun
#include <malloc.h>
#endif

#ifdef LINUX
#include <sys/prctl.h>
#endif

/* Forward Declarations */

struct main_config
Expand Down Expand Up @@ -523,6 +527,14 @@ main(int argc, char **argv)
{
int return_value = 0;
struct main_config mcfg = {0};
#ifdef LINUX
#if defined(PR_SET_THP_DISABLE)
char *thp_disable = getenv("THP_DISABLE");
if (thp_disable != NULL && strcmp(thp_disable, "1") == 0) {
prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
}
#endif
#endif

/* Set a number of defaults */
mcfg.slapd_exemode = SLAPD_EXEMODE_UNKNOWN;
Expand Down
2 changes: 2 additions & 0 deletions src/lib389/lib389/cli_ctl/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from lib389.nss_ssl import NssSsl
from lib389.dseldif import FSChecks, DSEldif
from lib389.dirsrv_log import DirsrvAccessLog
from lib389.tunables import Tunables
from lib389 import lint
from lib389 import plugins
from lib389._constants import DSRC_HOME
Expand All @@ -39,6 +40,7 @@
DSEldif,
NssSsl,
DirsrvAccessLog,
Tunables,
]


Expand Down
26 changes: 26 additions & 0 deletions src/lib389/lib389/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,3 +511,29 @@
'fix': """Stop using this these unknown attributes in the filter, or add the schema
to the server and make sure it's properly indexed."""
}

# Transparent Huge Pages
# Healthcheck lint report for THP. The 'dsle' code and 'description' are what
# the error listing shows ("DSTHPLE0001 :: Transparent Huge Pages").
# The strings below are user-facing output; change them only deliberately.
# NOTE(review): the 'fix' text says to "uncomment" the [Service] lines after
# `systemctl edit`; presumably an override created that way starts empty and
# the lines have to be added - confirm the wording.
DSTHPLE0001 = {
'dsle': 'DSTHPLE0001',
'severity': 'Medium',
'description': 'Transparent Huge Pages',
'items': ['Possible Performance Impact'],
'detail': """Transparent Huge Pages are enabled. This can lead to an unexpected memory
consumption, especially when using large caches.\n""",
'fix': """Disable Transparent Huge Pages.
System-wide at boot:
Add "transparent_hugepage=never" to the list of kernel boot parameters.
System-wide at runtime:
# echo "never" > /sys/kernel/mm/transparent_hugepage/enabled
# echo "never" > /sys/kernel/mm/transparent_hugepage/defrag
Per instance (for the versions of 389 Directory Server that support it):
Edit dirsrv unit file:
# systemctl edit dirsrv@instance_name
And uncomment the following lines:
[Service]
Environment=THP_DISABLE=1
"""
}
65 changes: 65 additions & 0 deletions src/lib389/lib389/tunables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# --- BEGIN COPYRIGHT BLOCK ---
# Copyright (C) 2023 Red Hat, Inc.
# All rights reserved.
#
# License: GPL (version 3 or any later version).
# See LICENSE for details.
# --- END COPYRIGHT BLOCK ---
#

import os
import re
import copy
from lib389._mapped_object_lint import DSLint
from lib389 import pid_from_file
from lib389.lint import DSTHPLE0001

class Tunables(DSLint):
    """A class for working with system tunables

    :param instance: An instance
    :type instance: lib389.DirSrv
    """

    def __init__(self, instance):
        self._instance = instance
        # The PID is stored as a string because it is only used to build a
        # /proc path. NOTE(review): presumably pid_from_file() yields no PID
        # when the instance is stopped; _lint_thp() tolerates a /proc entry
        # that does not exist.
        self.pid = str(pid_from_file(instance.ds_paths.pid_file))

    @classmethod
    def lint_uid(cls):
        """Return the identifier under which the checks of this class are listed."""
        return 'tunables'

    def _lint_thp(self):
        """Check if THP is enabled

        Yields a copy of the DSTHPLE0001 report when Transparent Huge Pages
        are enabled both system-wide ("always" or "madvise" selected) and for
        the server process (THP_enabled is non-zero in /proc/<pid>/status).
        Yields nothing when either state is disabled or cannot be read.
        """
        def systemwide_thp_enabled() -> bool:
            thp_enabled_path = '/sys/kernel/mm/transparent_hugepage/enabled'
            try:
                with open(thp_enabled_path, 'r') as f:
                    thp_status = f.read().strip()
            except FileNotFoundError:
                # No such sysfs file: nothing to report as enabled.
                return False
            # The active mode is the one enclosed in square brackets.
            return re.search(r"\[(always|madvise)\]", thp_status) is not None

        def instance_thp_enabled() -> bool:
            pid_status_path = f"/proc/{self.pid}/status"
            try:
                with open(pid_status_path, 'r') as pid_status:
                    for line in pid_status:
                        if 'THP_enabled' in line:
                            # Line looks like "THP_enabled:    1"
                            return bool(int(line.split()[1]))
            except FileNotFoundError:
                # No such process (e.g. the instance is not running): the
                # per-process state is unknown, so do not raise a finding.
                return False
            # No THP_enabled line in the status file.
            return False

        if instance_thp_enabled() and systemwide_thp_enabled():
            report = copy.deepcopy(DSTHPLE0001)
            # Use the same "<lint_uid>:<check>" name under which this check
            # is listed ('tunables:thp'), so the reported name can be matched
            # against the check list.
            report['check'] = 'tunables:thp'
            yield report

3 changes: 3 additions & 0 deletions wrappers/systemd.template.service.custom.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ TimeoutStopSec=600
# Preload jemalloc
Environment=LD_PRELOAD=@libdir@/@package_name@/lib/libjemalloc.so.2

# Disable Transparent Huge Pages
Environment=THP_DISABLE=1

##################################################
# Heap profiling with jemalloc #
##################################################
Expand Down

0 comments on commit dd03a50

Please sign in to comment.