/* $NetBSD: linux_futex.c,v 1.37 2017/04/10 15:04:32 dholland Exp $ */
/*-
* Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Emmanuel Dreyfus
* 4. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_futex.c,v 1.37 2017/04/10 15:04:32 dholland Exp $");
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/lwp.h>
#include <sys/queue.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/kmem.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_futex.h>
#include <compat/linux/common/linux_sched.h>
#include <compat/linux/common/linux_machdep.h>
#include <compat/linux/linux_syscallargs.h>
struct futex;
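/*
* One waiting_proc per sleeping LWP: wp_futex points at the futex the
* LWP is currently queued on (it can change if the LWP is requeued),
* and wp_futex_cv is the condition variable it sleeps on.
*/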
struct waiting_proc {
struct futex *wp_futex;
kcondvar_t wp_futex_cv;
TAILQ_ENTRY(waiting_proc) wp_list;
bool wp_onlist;
};
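/*
* One futex per user address in use: reference-counted, kept on the
* global futex_list, and carrying the queue of LWPs waiting on it.
*/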
struct futex {
void *f_uaddr;
int f_refcount;
uint32_t f_bitset;
LIST_ENTRY(futex) f_list;
TAILQ_HEAD(, waiting_proc) f_waiting_proc;
};
static LIST_HEAD(futex_list, futex) futex_list;
static kmutex_t futex_lock;
#define FUTEX_LOCK mutex_enter(&futex_lock)
#define FUTEX_UNLOCK mutex_exit(&futex_lock)
#define FUTEX_LOCKASSERT KASSERT(mutex_owned(&futex_lock))
#define FUTEX_SYSTEM_LOCK KERNEL_LOCK(1, NULL)
#define FUTEX_SYSTEM_UNLOCK KERNEL_UNLOCK_ONE(0)
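/*
* futex_lock protects the global futex list and the per-futex wait
* queues. FUTEX_SYSTEM_LOCK takes the big kernel lock around a whole
* futex operation, serializing the userland value check and the
* subsequent sleep or wakeup against other LWPs' futex calls.
*/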
#ifdef DEBUG_LINUX_FUTEX
int debug_futex = 1;
#define FUTEXPRINTF(a) do { if (debug_futex) printf a; } while (0)
#else
#define FUTEXPRINTF(a)
#endif
void
linux_futex_init(void)
{
FUTEXPRINTF(("%s: initializing futex\n", __func__));
mutex_init(&futex_lock, MUTEX_DEFAULT, IPL_NONE);
}
void
linux_futex_fini(void)
{
FUTEXPRINTF(("%s: destroying futex\n", __func__));
mutex_destroy(&futex_lock);
}
static struct waiting_proc *futex_wp_alloc(void);
static void futex_wp_free(struct waiting_proc *);
static struct futex *futex_get(void *, uint32_t);
static void futex_ref(struct futex *);
static void futex_put(struct futex *);
static int futex_sleep(struct futex **, lwp_t *, int, struct waiting_proc *);
static int futex_wake(struct futex *, int, struct futex *, int);
static int futex_atomic_op(lwp_t *, int, void *);
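/*
* futex(2) entry point: copy in the user timespec for FUTEX_WAIT,
* convert it, and hand off to linux_do_futex().
*/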
int
linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap,
register_t *retval)
{
/* {
syscallarg(int *) uaddr;
syscallarg(int) op;
syscallarg(int) val;
syscallarg(const struct linux_timespec *) timeout;
syscallarg(int *) uaddr2;
syscallarg(int) val3;
} */
struct linux_timespec lts;
struct timespec ts = { 0, 0 };
int error;
if ((SCARG(uap, op) & LINUX_FUTEX_CMD_MASK) == LINUX_FUTEX_WAIT &&
SCARG(uap, timeout) != NULL) {
if ((error = copyin(SCARG(uap, timeout),
&lts, sizeof(lts))) != 0) {
return error;
}
linux_to_native_timespec(&ts, &lts);
}
return linux_do_futex(l, uap, &ts, retval);
}
/*
* Note: TS can't be const because ts2timo destroys it.
*/
int
linux_do_futex(struct lwp *l, const struct linux_sys_futex_args *uap,
struct timespec *ts, register_t *retval)
{
/* {
syscallarg(int *) uaddr;
syscallarg(int) op;
syscallarg(int) val;
syscallarg(const struct linux_timespec *) timeout;
syscallarg(int *) uaddr2;
syscallarg(int) val3;
} */
int val, val3;
int ret;
int error = 0;
struct futex *f;
struct futex *newf;
int tout;
struct futex *f2;
struct waiting_proc *wp;
int op_ret, cmd;
clockid_t clk;
cmd = SCARG(uap, op) & LINUX_FUTEX_CMD_MASK;
val3 = SCARG(uap, val3);
if (SCARG(uap, op) & LINUX_FUTEX_CLOCK_REALTIME) {
switch (cmd) {
case LINUX_FUTEX_WAIT_BITSET:
case LINUX_FUTEX_WAIT:
clk = CLOCK_REALTIME;
break;
default:
return ENOSYS;
}
} else
clk = CLOCK_MONOTONIC;
/*
* Our implementation provides only private futexes. Most applications
* use their futexes privately but do not set FUTEX_PRIVATE_FLAG, so we
* treat all futexes as private by clearing that flag. This works in
* most cases (i.e. when futexes are not shared via a file descriptor
* or between different processes).
*
* Note that we don't handle bitsets at all at the moment. We would need
* to move from refcounting uaddrs to handling multiple futex entries
* pointing to the same uaddr, but possibly having different bitmasks.
* Perhaps move to an implementation where each uaddr has a list of
* futexes.
*/
switch (cmd) {
case LINUX_FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
/*FALLTHROUGH*/
case LINUX_FUTEX_WAIT_BITSET:
if ((error = ts2timo(clk, 0, ts, &tout, NULL)) != 0) {
if (error != ETIMEDOUT)
return error;
/*
* If the user process requested a non-null
* timeout, make sure we do not turn it into
* an infinite timeout just because ts2timo()
* rounded tout down to 0.
*
* We use a minimal timeout of 1/hz. Maybe it
* would make more sense to just return
* ETIMEDOUT without sleeping.
*/
if (SCARG(uap, timeout) != NULL)
tout = 1;
else
tout = 0;
}
FUTEX_SYSTEM_LOCK;
if ((error = copyin(SCARG(uap, uaddr),
&val, sizeof(val))) != 0) {
FUTEX_SYSTEM_UNLOCK;
return error;
}
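/*
* Only go to sleep if the word still holds the expected value;
* otherwise we raced with an update and userland must retry.
*/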
if (val != SCARG(uap, val)) {
FUTEX_SYSTEM_UNLOCK;
return EWOULDBLOCK;
}
FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, "
"*uaddr = %d, timeout = %lld.%09ld\n",
l->l_proc->p_pid, l->l_lid, SCARG(uap, val),
SCARG(uap, uaddr), val, (long long)ts->tv_sec,
ts->tv_nsec));
wp = futex_wp_alloc();
FUTEX_LOCK;
f = futex_get(SCARG(uap, uaddr), val3);
ret = futex_sleep(&f, l, tout, wp);
futex_put(f);
FUTEX_UNLOCK;
futex_wp_free(wp);
FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, "
"ret = %d\n", l->l_proc->p_pid, l->l_lid,
SCARG(uap, uaddr), ret));
FUTEX_SYSTEM_UNLOCK;
switch (ret) {
case EWOULDBLOCK: /* timeout */
return ETIMEDOUT;
break;
case EINTR: /* signal */
return EINTR;
break;
case 0: /* FUTEX_WAKE received */
FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n",
l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr)));
return 0;
break;
default:
FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret));
break;
}
/* NOTREACHED */
break;
case LINUX_FUTEX_WAKE:
val = FUTEX_BITSET_MATCH_ANY;
/*FALLTHROUGH*/
case LINUX_FUTEX_WAKE_BITSET:
/*
* XXX: Linux is able to cope with different addresses
* corresponding to the same mapped memory in the sleeping
* and the waker process(es).
*/
FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n",
l->l_proc->p_pid, l->l_lid,
SCARG(uap, uaddr), SCARG(uap, val)));
FUTEX_SYSTEM_LOCK;
FUTEX_LOCK;
f = futex_get(SCARG(uap, uaddr), val3);
*retval = futex_wake(f, SCARG(uap, val), NULL, 0);
futex_put(f);
FUTEX_UNLOCK;
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_CMP_REQUEUE:
FUTEX_SYSTEM_LOCK;
if ((error = copyin(SCARG(uap, uaddr),
&val, sizeof(val))) != 0) {
FUTEX_SYSTEM_UNLOCK;
return error;
}
if (val != val3) {
FUTEX_SYSTEM_UNLOCK;
return EAGAIN;
}
FUTEXPRINTF(("FUTEX_CMP_REQUEUE %d.%d: uaddr = %p, val = %d, "
"uaddr2 = %p, val2 = %d\n",
l->l_proc->p_pid, l->l_lid,
SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
(int)(unsigned long)SCARG(uap, timeout)));
FUTEX_LOCK;
f = futex_get(SCARG(uap, uaddr), val3);
newf = futex_get(SCARG(uap, uaddr2), val3);
*retval = futex_wake(f, SCARG(uap, val), newf,
(int)(unsigned long)SCARG(uap, timeout));
futex_put(f);
futex_put(newf);
FUTEX_UNLOCK;
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_REQUEUE:
FUTEX_SYSTEM_LOCK;
FUTEXPRINTF(("FUTEX_REQUEUE %d.%d: uaddr = %p, val = %d, "
"uaddr2 = %p, val2 = %d\n",
l->l_proc->p_pid, l->l_lid,
SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
(int)(unsigned long)SCARG(uap, timeout)));
FUTEX_LOCK;
f = futex_get(SCARG(uap, uaddr), val3);
newf = futex_get(SCARG(uap, uaddr2), val3);
*retval = futex_wake(f, SCARG(uap, val), newf,
(int)(unsigned long)SCARG(uap, timeout));
futex_put(f);
futex_put(newf);
FUTEX_UNLOCK;
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_FD:
FUTEXPRINTF(("%s: unimplemented op %d\n", __func__, cmd));
return ENOSYS;
case LINUX_FUTEX_WAKE_OP:
FUTEX_SYSTEM_LOCK;
FUTEXPRINTF(("FUTEX_WAKE_OP %d.%d: uaddr = %p, op = %d, "
"val = %d, uaddr2 = %p, val2 = %d\n",
l->l_proc->p_pid, l->l_lid,
SCARG(uap, uaddr), cmd, SCARG(uap, val),
SCARG(uap, uaddr2),
(int)(unsigned long)SCARG(uap, timeout)));
FUTEX_LOCK;
f = futex_get(SCARG(uap, uaddr), val3);
f2 = futex_get(SCARG(uap, uaddr2), val3);
FUTEX_UNLOCK;
/*
* This function returns a positive number on success and
* a negative errno on error.
*/
op_ret = futex_atomic_op(l, val3, SCARG(uap, uaddr2));
FUTEX_LOCK;
if (op_ret < 0) {
futex_put(f);
futex_put(f2);
FUTEX_UNLOCK;
FUTEX_SYSTEM_UNLOCK;
return -op_ret;
}
ret = futex_wake(f, SCARG(uap, val), NULL, 0);
futex_put(f);
if (op_ret > 0) {
op_ret = 0;
/*
* Linux abuses the address of the timespec parameter
* as the number of retries
*/
op_ret += futex_wake(f2,
(int)(unsigned long)SCARG(uap, timeout), NULL, 0);
ret += op_ret;
}
futex_put(f2);
FUTEX_UNLOCK;
FUTEX_SYSTEM_UNLOCK;
*retval = ret;
break;
default:
FUTEXPRINTF(("%s: unknown op %d\n", __func__, cmd));
return ENOSYS;
}
return 0;
}
static struct waiting_proc *
futex_wp_alloc(void)
{
struct waiting_proc *wp;
wp = kmem_zalloc(sizeof(*wp), KM_SLEEP);
cv_init(&wp->wp_futex_cv, "futex");
return wp;
}
static void
futex_wp_free(struct waiting_proc *wp)
{
cv_destroy(&wp->wp_futex_cv);
kmem_free(wp, sizeof(*wp));
}
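/*
* Look up the futex for uaddr on the global list, creating it if it
* does not exist yet. Returns with an additional reference held;
* futex_lock must be held by the caller.
*/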
static struct futex *
futex_get(void *uaddr, uint32_t bitset)
{
struct futex *f;
FUTEX_LOCKASSERT;
LIST_FOREACH(f, &futex_list, f_list) {
if (f->f_uaddr == uaddr) {
f->f_refcount++;
return f;
}
}
/* Not found, create it */
f = kmem_zalloc(sizeof(*f), KM_SLEEP);
f->f_uaddr = uaddr;
f->f_bitset = bitset;
f->f_refcount = 1;
TAILQ_INIT(&f->f_waiting_proc);
LIST_INSERT_HEAD(&futex_list, f, f_list);
return f;
}
static void
futex_ref(struct futex *f)
{
FUTEX_LOCKASSERT;
f->f_refcount++;
}
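/*
* Drop one reference; the futex is removed from the global list and
* freed when the last reference goes away.
*/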
static void
futex_put(struct futex *f)
{
FUTEX_LOCKASSERT;
f->f_refcount--;
if (f->f_refcount == 0) {
KASSERT(TAILQ_EMPTY(&f->f_waiting_proc));
LIST_REMOVE(f, f_list);
kmem_free(f, sizeof(*f));
}
}
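/*
* Queue wp on the futex's wait list and sleep until woken, interrupted
* by a signal, or timed out. *fp is updated to the futex we ended up
* on (we may have been requeued), so the caller puts the right one.
*/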
static int
futex_sleep(struct futex **fp, lwp_t *l, int timeout, struct waiting_proc *wp)
{
struct futex *f;
int ret;
FUTEX_LOCKASSERT;
f = *fp;
wp->wp_futex = f;
TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);
wp->wp_onlist = true;
ret = cv_timedwait_sig(&wp->wp_futex_cv, &futex_lock, timeout);
/*
* we may have been requeued to a different futex before we were
* woken up, so let the caller know which futex to put. if we were
* woken by futex_wake() then it took us off the waiting list,
* but if our sleep was interrupted or timed out then we might
* need to take ourselves off the waiting list.
*/
f = wp->wp_futex;
if (wp->wp_onlist) {
TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
}
*fp = f;
return ret;
}
static int
futex_wake(struct futex *f, int n, struct futex *newf, int n2)
{
struct waiting_proc *wp;
int count = 0;
FUTEX_LOCKASSERT;
/*
* Wake up at most n threads waiting on this futex.
*/
while (n--) {
wp = TAILQ_FIRST(&f->f_waiting_proc);
if (wp == NULL)
return count;
KASSERT(f == wp->wp_futex);
TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
wp->wp_onlist = false;
cv_signal(&wp->wp_futex_cv);
count++;
}
if (newf == NULL)
return count;
/*
* then requeue up to n2 additional threads to newf
* (without waking them up).
*/
while (n2--) {
wp = TAILQ_FIRST(&f->f_waiting_proc);
if (wp == NULL)
return count;
KASSERT(f == wp->wp_futex);
TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
futex_put(f);
wp->wp_futex = newf;
futex_ref(newf);
TAILQ_INSERT_TAIL(&newf->f_waiting_proc, wp, wp_list);
count++;
}
return count;
}
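/*
* Perform the operation encoded in the FUTEX_WAKE_OP argument on the
* word at uaddr, then evaluate the encoded comparison against the old
* value. The encoding packs the operation in bits 28-30 (bit 31,
* FUTEX_OP_OPARG_SHIFT, turns oparg into a shift count), the comparison
* in bits 24-27, oparg in bits 12-23 and cmparg in bits 0-11, the last
* two sign-extended below. Returns the (non-negative) comparison result
* or a negative errno.
*/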
static int
futex_atomic_op(lwp_t *l, int encoded_op, void *uaddr)
{
const int op = (encoded_op >> 28) & 7;
const int cmp = (encoded_op >> 24) & 15;
const int cmparg = (encoded_op << 20) >> 20;
int oparg = (encoded_op << 8) >> 20;
int error, oldval, cval;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
oparg = 1 << oparg;
/* XXX: linux verifies access here and returns EFAULT */
if (copyin(uaddr, &cval, sizeof(int)) != 0)
return -EFAULT;
for (;;) {
int nval;
switch (op) {
case FUTEX_OP_SET:
nval = oparg;
break;
case FUTEX_OP_ADD:
nval = cval + oparg;
break;
case FUTEX_OP_OR:
nval = cval | oparg;
break;
case FUTEX_OP_ANDN:
nval = cval & ~oparg;
break;
case FUTEX_OP_XOR:
nval = cval ^ oparg;
break;
default:
return -ENOSYS;
}
error = ucas_int(uaddr, cval, nval, &oldval);
if (error || oldval == cval) {
break;
}
cval = oldval;
}
if (error)
return -EFAULT;
switch (cmp) {
case FUTEX_OP_CMP_EQ:
return (oldval == cmparg);
case FUTEX_OP_CMP_NE:
return (oldval != cmparg);
case FUTEX_OP_CMP_LT:
return (oldval < cmparg);
case FUTEX_OP_CMP_GE:
return (oldval >= cmparg);
case FUTEX_OP_CMP_LE:
return (oldval <= cmparg);
case FUTEX_OP_CMP_GT:
return (oldval > cmparg);
default:
return -ENOSYS;
}
}
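/*
* set_robust_list(2): just record the userland robust list head in the
* LWP's emulation data; it is read back by get_robust_list(2) and
* walked by release_futexes().
*/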
int
linux_sys_set_robust_list(struct lwp *l,
const struct linux_sys_set_robust_list_args *uap, register_t *retval)
{
/* {
syscallarg(struct linux_robust_list_head *) head;
syscallarg(size_t) len;
} */
struct linux_emuldata *led;
if (SCARG(uap, len) != sizeof(struct linux_robust_list_head))
return EINVAL;
led = l->l_emuldata;
led->led_robust_head = SCARG(uap, head);
*retval = 0;
return 0;
}
int
linux_sys_get_robust_list(struct lwp *l,
const struct linux_sys_get_robust_list_args *uap, register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(struct linux_robust_list_head **) head;
syscallarg(size_t *) len;
} */
struct proc *p;
struct linux_emuldata *led;
struct linux_robust_list_head *head;
size_t len;
int error = 0;
p = l->l_proc;
if (!SCARG(uap, pid)) {
led = l->l_emuldata;
head = led->led_robust_head;
} else {
mutex_enter(p->p_lock);
l = lwp_find(p, SCARG(uap, pid));
if (l != NULL) {
led = l->l_emuldata;
head = led->led_robust_head;
}
mutex_exit(p->p_lock);
if (l == NULL) {
return ESRCH;
}
}
#ifdef __arch64__
if (p->p_flag & PK_32) {
uint32_t u32;
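/* Size of the 32-bit compat robust_list_head: three 32-bit fields. */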
u32 = 12;
error = copyout(&u32, SCARG(uap, len), sizeof(u32));
if (error)
return error;
u32 = (uint32_t)(uintptr_t)head;
return copyout(&u32, SCARG(uap, head), sizeof(u32));
}
#endif
len = sizeof(*head);
error = copyout(&len, SCARG(uap, len), sizeof(len));
if (error)
return error;
return copyout(&head, SCARG(uap, head), sizeof(head));
}
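/*
* Handle one entry of a dead LWP's robust futex list: if the futex word
* still names the dead LWP as owner, atomically mark it FUTEX_OWNER_DIED
* (preserving FUTEX_WAITERS) and, for a non-PI futex with waiters, wake
* one waiter so it can observe the dead owner.
*/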
static int
handle_futex_death(void *uaddr, pid_t pid, int pi)
{
int uval, nval, mval;
struct futex *f;
retry:
if (copyin(uaddr, &uval, sizeof(uval)))
return EFAULT;
if ((uval & FUTEX_TID_MASK) == pid) {
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
nval = atomic_cas_32(uaddr, uval, mval);
if (nval == -1)
return EFAULT;
if (nval != uval)
goto retry;
if (!pi && (uval & FUTEX_WAITERS)) {
FUTEX_LOCK;
f = futex_get(uaddr, FUTEX_BITSET_MATCH_ANY);
futex_wake(f, 1, NULL, 0);
FUTEX_UNLOCK;
}
}
return 0;
}
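/*
* Fetch one robust-list entry pointer from the given userland location.
* Bit 0 of the stored value flags a PI futex and is returned separately
* in *pi; the remaining bits form the entry address.
*/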
static int
fetch_robust_entry(struct lwp *l, struct linux_robust_list **entry,
struct linux_robust_list **head, int *pi)
{
unsigned long uentry;
#ifdef __arch64__
if (l->l_proc->p_flag & PK_32) {
uint32_t u32;
if (copyin(head, &u32, sizeof(u32)))
return EFAULT;
uentry = (unsigned long)u32;
} else
#endif
if (copyin(head, &uentry, sizeof(uentry)))
return EFAULT;
*entry = (void *)(uentry & ~1UL);
*pi = uentry & 1;
return 0;
}
/* This walks the list of robust futexes, releasing them. */
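/* The walk is capped at 2048 entries so a corrupt or circular user list cannot keep us looping. */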
void
release_futexes(struct lwp *l)
{
struct linux_robust_list_head head;
struct linux_robust_list *entry, *next_entry = NULL, *pending;
unsigned int limit = 2048, pi, next_pi, pip;
struct linux_emuldata *led;
unsigned long futex_offset;
int rc;
led = l->l_emuldata;
if (led->led_robust_head == NULL)
return;
#ifdef __arch64__
if (l->l_proc->p_flag & PK_32) {
uint32_t u32s[3];
if (copyin(led->led_robust_head, u32s, sizeof(u32s)))
return;
head.list.next = (void *)(uintptr_t)u32s[0];
head.futex_offset = (unsigned long)u32s[1];
head.pending_list = (void *)(uintptr_t)u32s[2];
} else
#endif
if (copyin(led->led_robust_head, &head, sizeof(head)))
return;
if (fetch_robust_entry(l, &entry, &head.list.next, &pi))
return;
#ifdef __arch64__
if (l->l_proc->p_flag & PK_32) {
uint32_t u32;
if (copyin(led->led_robust_head, &u32, sizeof(u32)))
return;
head.futex_offset = (unsigned long)u32;
futex_offset = head.futex_offset;
} else
#endif
if (copyin(&head.futex_offset, &futex_offset, sizeof(unsigned long)))
return;
if (fetch_robust_entry(l, &pending, &head.pending_list, &pip))
return;
while (entry != &head.list) {
rc = fetch_robust_entry(l, &next_entry, &entry->next, &next_pi);
if (entry != pending)
if (handle_futex_death((char *)entry + futex_offset,
l->l_lid, pi))
return;
if (rc)
return;
entry = next_entry;
pi = next_pi;
if (!--limit)
break;
yield(); /* XXX why? */
}
if (pending)
handle_futex_death((char *)pending + futex_offset,
l->l_lid, pip);
}