561 lines
11 KiB
C
561 lines
11 KiB
C
/*
 * uksmd - userspace KSM helper daemon
 * Copyright (C) 2019 Oleksandr Natalenko <oleksandr@natalenko.name>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
#include <cap-ng.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <libproc2/pids.h>
|
|
#include <limits.h>
|
|
#include <signal.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/pidfd.h>
|
|
#include <sys/resource.h>
|
|
#if defined HAVE_SYSTEMD
|
|
#include <systemd/sd-daemon.h>
|
|
#endif /* HAVE_SYSTEMD */
|
|
#include <time.h>
|
|
#include <unistd.h>
|
|
|
|
/* Element count of a real array; must not be applied to a pointer. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*(arr)))
|
|
|
|
/* sysfs file written by ksm_ctl() to start/stop KSM */
#define KSM_RUN "/sys/kernel/mm/ksm/run"
/* sysfs gauges polled by the main loop via get_ksm_gauge() */
#define KSM_FULL_SCANS "/sys/kernel/mm/ksm/full_scans"
#define KSM_PAGES_VOLATILE "/sys/kernel/mm/ksm/pages_volatile"
#ifdef HAVE_SYSTEMD
/* only read when reporting status to systemd */
#define KSM_PROFIT "/sys/kernel/mm/ksm/general_profit"
#endif /* HAVE_SYSTEMD */
/* command name of the KSM kernel thread */
#define KSMD_CMD "ksmd"
/* tasks younger than this many seconds are skipped by the main loop */
#define OBSERVE_WINDOW_SECS 30
/* timeout, in seconds, of the signal wait between two main loop passes */
#define IDLE_SLEEP_SECS 15
|
|
|
|
/*
 * sysfs files from which setup_nr_process_ksm() reads the numbers of the
 * process_ksm_{enable,disable,status} syscalls at startup.
 */
#define __SYSFS_process_ksm_enable	"/sys/kernel/process_ksm/process_ksm_enable"
#define __SYSFS_process_ksm_disable	"/sys/kernel/process_ksm/process_ksm_disable"
#define __SYSFS_process_ksm_status	"/sys/kernel/process_ksm/process_ksm_status"
|
|
|
|
/* Selects which process_ksm_*() syscall wrapper process_ksm() invokes. */
enum pksm_action
{
	PKSM_ENABLE  = 0,	/* dispatch to process_ksm_enable() */
	PKSM_DISABLE = 1,	/* dispatch to process_ksm_disable() */
	PKSM_STATUS  = 2,	/* dispatch to process_ksm_status() */
};
|
|
|
|
/*
 * Syscall numbers of the process_ksm_*() syscalls. They start out as -1 and
 * are filled in by setup_nr_process_ksm() from sysfs.
 * NOTE(review): identifiers beginning with "__" are reserved for the
 * implementation; kept as-is because other parts of the file use these names.
 */
static long __NR_process_ksm_enable  = -1L;
static long __NR_process_ksm_disable = -1L;
static long __NR_process_ksm_status  = -1L;
|
|
|
|
/*
 * Command names for which the main loop explicitly disables KSM instead of
 * enabling it; matched by exact string comparison in is_incompatible().
 */
static const char* incompatible_tasks[] = { "mariadbd", "p2pool" };
|
|
|
|
static int ksm_ctl(bool _enable)
|
|
{
|
|
int ret = 0;
|
|
|
|
int fd = open(KSM_RUN, O_WRONLY);
|
|
if (fd == -1)
|
|
{
|
|
ret = errno;
|
|
goto out;
|
|
}
|
|
|
|
if (write(fd, _enable ? "1" : "2", 1) == -1)
|
|
{
|
|
ret = errno;
|
|
goto close_fd;
|
|
}
|
|
|
|
close_fd:
|
|
close(fd);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static long process_ksm_enable(int pidfd, unsigned int flags)
|
|
{
|
|
return syscall(__NR_process_ksm_enable, pidfd, flags);
|
|
}
|
|
|
|
static long process_ksm_disable(int pidfd, unsigned int flags)
|
|
{
|
|
return syscall(__NR_process_ksm_disable, pidfd, flags);
|
|
}
|
|
|
|
static long process_ksm_status(int pidfd, unsigned int flags)
|
|
{
|
|
return syscall(__NR_process_ksm_status, pidfd, flags);
|
|
}
|
|
|
|
/*
 * Run one process_ksm_*() syscall against the task identified by pid.
 *
 * pid:     task to operate on; a pidfd is opened for it and closed again
 *          before returning.
 * _action: which syscall wrapper to call (see enum pksm_action).
 *
 * Returns 0 on success, or a positive errno value if pidfd_open(), the
 * selected syscall or close() failed. An _action outside of enum pksm_action
 * yields EINVAL (previously the result was read uninitialised).
 *
 * NOTE(review): on the success path the value returned by the syscall itself
 * is replaced by close()'s result, so a positive value from
 * process_ksm_status() would never reach the caller - confirm whether that is
 * intended before relying on the status result.
 */
static long process_ksm(pid_t pid, enum pksm_action _action)
{
	long ret;
	int pidfd;

	pidfd = pidfd_open(pid, 0);
	if (pidfd == -1)
	{
		ret = errno;
		goto out;
	}

	switch (_action)
	{
		case PKSM_ENABLE:
			ret = process_ksm_enable(pidfd, 0);
			break;
		case PKSM_DISABLE:
			ret = process_ksm_disable(pidfd, 0);
			break;
		case PKSM_STATUS:
			ret = process_ksm_status(pidfd, 0);
			break;
		default:
			/* unknown action: take the common failure path below */
			errno = EINVAL;
			ret = -1;
			break;
	}

	if (ret == -1)
	{
		ret = errno;
		/* ignore close() ret value to preserve the one from process_ksm_*() */
		close(pidfd);
		goto out;
	}

	ret = close(pidfd);
	if (ret == -1)
		ret = errno;

out:
	return ret;
}
|
|
|
|
/*
 * Find the position of _item inside the first _items_len entries of _items.
 *
 * Returns the index of the first match. If the item is not present the
 * process is aborted, since that means the caller asked for an item it never
 * declared in its items array.
 */
static size_t pids_index(const enum pids_item _items[], size_t _items_len, int _item)
{
	size_t pos = 0;

	while (pos < _items_len)
	{
		if (_items[pos] == _item)
			return pos;
		pos++;
	}

	/* coding error: the requested item is missing from the array */
	abort();
}
|
|
|
|
/*
 * Fetch the value of item `name` as `type` through PIDS_VAL, looking up the
 * item's index with pids_index(). Expands to references to the identifiers
 * `items`, `stack` and `info`, which must exist in the scope where the macro
 * is used.
 */
#define PKSM_PIDS_VAL(name, type) \
	(PIDS_VAL(pids_index(items, ARRAY_SIZE(items), name), type, stack, info))
|
|
|
|
static int kthread_niceness(const char* _name, int *_niceness)
|
|
{
|
|
int ret;
|
|
bool found = false;
|
|
struct pids_info *info = NULL;
|
|
struct pids_stack *stack;
|
|
enum pids_item items[] =
|
|
{
|
|
PIDS_CMD,
|
|
PIDS_NICE,
|
|
PIDS_VM_SIZE,
|
|
};
|
|
|
|
ret = procps_pids_new(&info, items, ARRAY_SIZE(items));
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
while ((stack = procps_pids_get(info, PIDS_FETCH_TASKS_ONLY)))
|
|
{
|
|
/* skip uthreads */
|
|
if (PKSM_PIDS_VAL(PIDS_VM_SIZE, ul_int))
|
|
continue;
|
|
|
|
if (!strcmp(_name, PKSM_PIDS_VAL(PIDS_CMD, str)))
|
|
{
|
|
*_niceness = PKSM_PIDS_VAL(PIDS_NICE, s_int);
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!found)
|
|
return -ESRCH;
|
|
|
|
ret = procps_pids_unref(&info);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Read a decimal number from the file at _path and store it in *_nr.
 *
 * _path: file to read.
 * _nr:   receives the parsed number; left untouched on any failure.
 *
 * Returns 0 on success or a positive errno value: the errno of a failing
 * open()/read(), the errno set by strtol() (e.g. ERANGE), or EINVAL when the
 * file does not start with a number or the number is followed by anything
 * other than a newline / end of data.
 */
static int do_setup_process_ksm(const char* _path, long* _nr)
{
	int ret = 0;
	char buf[32] = { 0, };
	char *end = NULL;
	ssize_t read_len;
	long nr;

	int fd = open(_path, O_RDONLY);
	if (fd == -1)
	{
		ret = errno;
		goto out;
	}

	/* keep one byte spare so the buffer is always NUL-terminated */
	read_len = read(fd, buf, sizeof buf - 1);
	if (read_len == -1)
	{
		ret = errno;
		goto close_fd;
	}
	buf[read_len] = '\0';

	/* strtol() reports failures via errno only, so start from a clean slate */
	errno = 0;
	nr = strtol(buf, &end, 10);
	if (errno)
	{
		ret = errno;
		goto close_fd;
	}

	/* no digits at all, or trailing junk: do not hand out a bogus number */
	if (end == buf || (*end != '\0' && *end != '\n'))
	{
		ret = EINVAL;
		goto close_fd;
	}

	*_nr = nr;

close_fd:
	close(fd);

out:
	return ret;
}
|
|
|
|
static int setup_nr_process_ksm(void)
|
|
{
|
|
int ret = 0;
|
|
|
|
ret = do_setup_process_ksm(__SYSFS_process_ksm_enable, &__NR_process_ksm_enable);
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = do_setup_process_ksm(__SYSFS_process_ksm_disable, &__NR_process_ksm_disable);
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = do_setup_process_ksm(__SYSFS_process_ksm_status, &__NR_process_ksm_status);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Read a signed decimal gauge from the file _name.
 *
 * _name:  path of the file to read.
 * _value: always written. Receives the parsed number on success, 0 when the
 *         file cannot be opened, read or parsed, and strtol()'s saturated
 *         result when the number is out of range.
 *
 * Returns 0 on success or a positive errno value: the errno of a failing
 * open()/read(), the errno set by strtol() (e.g. ERANGE), or EINVAL when the
 * file does not start with a number.
 */
static int get_ksm_gauge(const char *_name, long *_value)
{
	int ret = 0;
	char buf[32] = { 0, };
	char *end = NULL;
	ssize_t read_len;
	/* signed on purpose: gauges may legitimately be negative */
	long value = 0;

	int fd = open(_name, O_RDONLY);
	if (fd == -1)
	{
		ret = errno;
		goto out;
	}

	/* keep one byte spare so the buffer is always NUL-terminated */
	read_len = read(fd, buf, sizeof buf - 1);
	if (read_len == -1)
	{
		ret = errno;
		goto close_fd;
	}
	buf[read_len] = '\0';

	/* strtol() reports failures via errno only, so start from a clean slate */
	errno = 0;
	value = strtol(buf, &end, 10);
	if (errno)
		ret = errno;
	else if (end == buf)
		ret = EINVAL;

close_fd:
	close(fd);

out:
	*_value = value;

	return ret;
}
|
|
|
|
static bool is_incompatible(const char* _comm)
|
|
{
|
|
for (size_t i = 0; i < ARRAY_SIZE(incompatible_tasks); i++)
|
|
if (!strcmp(incompatible_tasks[i], _comm))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
int main(int _argc, char** _argv)
|
|
{
|
|
(void)_argc;
|
|
(void)_argv;
|
|
int ret;
|
|
int ksmd_niceness;
|
|
pid_t self;
|
|
sigset_t sigmask;
|
|
sigset_t sigorigmask;
|
|
struct pids_info *info = NULL;
|
|
struct pids_stack *stack;
|
|
enum pids_item items[] =
|
|
{
|
|
PIDS_CMD,
|
|
PIDS_ID_PID,
|
|
PIDS_TIME_START,
|
|
PIDS_VM_SIZE,
|
|
};
|
|
struct timespec now;
|
|
struct timespec time_to_sleep;
|
|
siginfo_t siginfo;
|
|
long full_scans;
|
|
long prev_full_scans;
|
|
bool first_run;
|
|
long pages_volatile;
|
|
#if defined HAVE_SYSTEMD
|
|
long profit;
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
if (capng_get_caps_process() == -1)
|
|
{
|
|
ret = ENODATA;
|
|
fprintf(stderr, "Unable to get capabilities\n");
|
|
goto out;
|
|
}
|
|
|
|
if (!capng_have_capability(CAPNG_EFFECTIVE, CAP_SYS_PTRACE))
|
|
{
|
|
ret = EACCES;
|
|
fprintf(stderr, "capabilities: CAP_SYS_PTRACE required\n");
|
|
goto out;
|
|
}
|
|
|
|
if (!capng_have_capability(CAPNG_EFFECTIVE, CAP_DAC_OVERRIDE))
|
|
{
|
|
ret = EACCES;
|
|
fprintf(stderr, "capabilities: CAP_DAC_OVERRIDE required\n");
|
|
goto out;
|
|
}
|
|
|
|
if (!capng_have_capability(CAPNG_EFFECTIVE, CAP_SYS_NICE))
|
|
{
|
|
ret = EACCES;
|
|
fprintf(stderr, "capabilities: CAP_SYS_NICE required\n");
|
|
goto out;
|
|
}
|
|
|
|
if (setup_nr_process_ksm())
|
|
{
|
|
ret = ENODATA;
|
|
fprintf(stderr, "Unable to get process_ksm syscall numbers\n");
|
|
goto out;
|
|
}
|
|
|
|
ret = kthread_niceness("ksmd", &ksmd_niceness);
|
|
if (ret < 0)
|
|
{
|
|
fprintf(stderr, "kthread_niceness: %s\n", strerror(-ret));
|
|
goto out;
|
|
}
|
|
|
|
ret = setpriority(PRIO_PROCESS, 0, ksmd_niceness);
|
|
if (ret == -1 && errno)
|
|
{
|
|
ret = errno;
|
|
fprintf(stderr, "setpriority: %s\n", strerror(ret));
|
|
goto out;
|
|
}
|
|
|
|
#if !defined HAVE_SYSTEMD
|
|
ret = daemon(0, 0);
|
|
if (ret == -1)
|
|
{
|
|
ret = errno;
|
|
fprintf(stderr, "daemon: %s\n", strerror(ret));
|
|
goto out;
|
|
}
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
self = getpid();
|
|
|
|
ret = ksm_ctl(true);
|
|
if (ret)
|
|
{
|
|
fprintf(stderr, "ksm_ctl: %s\n", strerror(ret));
|
|
goto out;
|
|
}
|
|
|
|
sigemptyset(&sigmask);
|
|
sigaddset(&sigmask, SIGINT);
|
|
sigaddset(&sigmask, SIGTERM);
|
|
ret = sigprocmask(SIG_BLOCK, &sigmask, &sigorigmask);
|
|
if (ret == -1)
|
|
{
|
|
ret = errno;
|
|
fprintf(stderr, "sigprocmask: %s\n", strerror(ret));
|
|
goto ksm_ctl_false;
|
|
}
|
|
|
|
#if defined HAVE_SYSTEMD
|
|
sd_notify(0, "READY=1");
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
first_run = true;
|
|
full_scans = prev_full_scans = 0;
|
|
while (true)
|
|
{
|
|
#if defined HAVE_SYSTEMD
|
|
sd_notify(0, "WATCHDOG=1");
|
|
ret = get_ksm_gauge(KSM_PROFIT, &profit);
|
|
if (ret)
|
|
{
|
|
fprintf(stderr, "get KSM_PROFIT: %s\n", strerror(ret));
|
|
goto unblock_signals;
|
|
}
|
|
sd_notifyf(0, "STATUS=Profit: %ld MiB", profit / (1L << 20));
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
ret = get_ksm_gauge(KSM_FULL_SCANS, &full_scans);
|
|
if (ret)
|
|
{
|
|
fprintf(stderr, "get KSM_FULL_SCANS: %s\n", strerror(ret));
|
|
goto unblock_signals;
|
|
}
|
|
|
|
ret = get_ksm_gauge(KSM_PAGES_VOLATILE, &pages_volatile);
|
|
if (ret)
|
|
{
|
|
fprintf(stderr, "get KSM_PAGES_VOLATILE: %s\n", strerror(ret));
|
|
goto unblock_signals;
|
|
}
|
|
|
|
if (first_run || full_scans != prev_full_scans || !pages_volatile)
|
|
{
|
|
clock_gettime(CLOCK_BOOTTIME, &now);
|
|
|
|
ret = procps_pids_new(&info, items, ARRAY_SIZE(items));
|
|
if (ret < 0)
|
|
{
|
|
fprintf(stderr, "procps_pids_new: %s\n", strerror(-ret));
|
|
goto unblock_signals;
|
|
}
|
|
|
|
while ((stack = procps_pids_get(info, PIDS_FETCH_TASKS_ONLY)))
|
|
{
|
|
/* skip kthreads */
|
|
if (!PKSM_PIDS_VAL(PIDS_VM_SIZE, ul_int))
|
|
continue;
|
|
|
|
pid_t current_pid = PKSM_PIDS_VAL(PIDS_ID_PID, s_int);
|
|
|
|
/* skip ourselves */
|
|
if (current_pid == self)
|
|
continue;
|
|
|
|
/* skip short-living tasks */
|
|
if (now.tv_sec - PKSM_PIDS_VAL(PIDS_TIME_START, real) < OBSERVE_WINDOW_SECS)
|
|
continue;
|
|
|
|
/* skip already processed tasks */
|
|
if (process_ksm(current_pid, PKSM_STATUS))
|
|
continue;
|
|
|
|
/* explicitly disable incompatible tasks */
|
|
if (is_incompatible(PKSM_PIDS_VAL(PIDS_CMD, str)))
|
|
{
|
|
process_ksm(current_pid, PKSM_DISABLE);
|
|
continue;
|
|
}
|
|
|
|
if (process_ksm(current_pid, PKSM_ENABLE))
|
|
continue;
|
|
}
|
|
|
|
ret = procps_pids_unref(&info);
|
|
if (ret < 0)
|
|
{
|
|
fprintf(stderr, "procps_pids_unref: %s\n", strerror(-ret));
|
|
goto unblock_signals;
|
|
}
|
|
|
|
if (first_run)
|
|
first_run = false;
|
|
|
|
prev_full_scans = full_scans;
|
|
}
|
|
|
|
time_to_sleep.tv_sec = IDLE_SLEEP_SECS;
|
|
time_to_sleep.tv_nsec = 0;
|
|
ret = sigtimedwait(&sigmask, &siginfo, &time_to_sleep);
|
|
if (ret == SIGINT || ret == SIGTERM)
|
|
{
|
|
printf("Caught signal %d, shutting down gracefully...\n", ret);
|
|
goto unblock_signals;
|
|
} else if (ret == -1)
|
|
{
|
|
switch (errno)
|
|
{
|
|
case EINVAL:
|
|
ret = errno;
|
|
fprintf(stderr, "sigtimedwait: %s\n", strerror(ret));
|
|
goto unblock_signals;
|
|
case EINTR:
|
|
if (!siginfo.si_signo || siginfo.si_signo == SIGQUIT)
|
|
{
|
|
printf("Are we being traced?\n");
|
|
continue;
|
|
} else
|
|
{
|
|
fprintf(stderr, "sigtimedwait: EINTR came with si_signo = %d\n", siginfo.si_signo);
|
|
continue;
|
|
}
|
|
break;
|
|
case EAGAIN:
|
|
/* timeout, just continuing */
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
unblock_signals:
|
|
#if defined HAVE_SYSTEMD
|
|
sd_notify(0, "STOPPING=1");
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
ret = sigprocmask(SIG_SETMASK, &sigorigmask, NULL);
|
|
if (ret == -1)
|
|
{
|
|
ret = errno;
|
|
fprintf(stderr, "sigprocmask: %s\n", strerror(ret));
|
|
goto ksm_ctl_false;
|
|
}
|
|
|
|
ksm_ctl_false:
|
|
ret = ksm_ctl(false);
|
|
if (ret)
|
|
{
|
|
fprintf(stderr, "ksm_ctl: %s\n", strerror(ret));
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
#if defined HAVE_SYSTEMD
|
|
sd_notifyf(0, "ERRNO=%d", ret);
|
|
#endif /* HAVE_SYSTEMD */
|
|
|
|
exit(ret);
|
|
}
|
|
|