#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/lockf.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socket.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <kern/thread_act.h>
#include <machine/limits.h>
#include <net/if.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>
/* Largest file offset accepted by the range checks in nfs_dolock(). */
#define OFF_MAX QUAD_MAX
/*
 * Advisory lock statistics.  None of these three are referenced in the
 * code visible in this file section -- presumably maintained or read
 * elsewhere; TODO confirm.
 */
uint64_t nfsadvlocks = 0;
struct timeval nfsadvlock_longest = {0, 0};
struct timeval nfsadvlocks_time = {0, 0};
/* pid of the process that registered the lock daemon file (see nfslockdfd()); 0 if none */
pid_t nfslockdpid = 0;
/* file registered by nfslockdfd(); nfs_dolock() writes requests to its vnode */
struct file *nfslockdfp = 0;
/* nonzero while a process is asleep in nfslockdwait() */
int nfslockdwaiting = 0;
/* set by nfs_dolock() after writing a request; consumed by nfslockdwait() */
int nfslockdfifowritten = 0;
/* flag word serializing writers of the daemon file in nfs_dolock() */
int nfslockdfifolock = 0;
#define NFSLOCKDFIFOLOCK_LOCKED 1	/* a writer currently holds the lock */
#define NFSLOCKDFIFOLOCK_WANT 2		/* at least one writer is sleeping on the lock */
/* last transaction ID handed out by nfs_lockxid_get(); 0 means "not yet seeded" */
uint64_t nfs_lockxid = 0;
/* queue of lock requests that have been sent and are awaiting an answer */
LOCKD_MSG_QUEUE nfs_pendlockq;
/*
 * One record per process tracked by nfs_lock_pid_check().
 * Each record sits on the LRU list and, while lp_valid is set, on one
 * hash chain keyed by pid.
 */
struct nfs_lock_pid {
TAILQ_ENTRY(nfs_lock_pid) lp_lru;	/* linkage on nfs_lock_pid_lru */
LIST_ENTRY(nfs_lock_pid) lp_hash;	/* linkage on a nfs_lock_pid_hash_tbl chain */
int lp_valid;				/* cleared when the entry is found stale and unhashed */
int lp_time;				/* uptime seconds (microuptime) of last use/check */
pid_t lp_pid;				/* process ID this entry describes */
struct timeval lp_pid_start;		/* process start time, compared to detect pid reuse */
};
/* Size argument passed to hashinit() for the pid hash table. */
#define NFS_LOCK_PID_HASH_SIZE 64 // XXX tune me
/* Map a pid to its hash chain head; nfs_lock_pid_hash is used as a bit mask. */
#define NFS_LOCK_PID_HASH(pid) \
(&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
/* hash chains of nfs_lock_pid entries, allocated in nfs_lockinit() */
LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
/* LRU list of nfs_lock_pid entries; entries are re-queued at the tail when used */
TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
/* hash mask filled in by hashinit() */
u_long nfs_lock_pid_hash;
/*
 * Sleep lock guarding the pid table in nfs_lock_pid_check():
 * 0 = free, 1 = held, -1 = held and at least one thread is waiting.
 */
int nfs_lock_pid_lock;
/*
 * nfs_lockinit --
 *	Set up the NFS advisory locking state: the pending request queue,
 *	the pid table lock, and the pid hash table with its LRU list.
 */
void
nfs_lockinit(void)
{
	/* the pid table starts out unlocked */
	nfs_lock_pid_lock = 0;

	/* both lists start out empty */
	TAILQ_INIT(&nfs_pendlockq);
	TAILQ_INIT(&nfs_lock_pid_lru);

	/* hashinit() also stores the hash mask in nfs_lock_pid_hash */
	nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE, M_TEMP,
	    &nfs_lock_pid_hash);
}
/*
 * nfs_lockdmsg_enqueue --
 *	Insert a lock request into the pending queue, keeping the queue
 *	sorted by ascending transaction ID (lm_xid).
 *
 *	The ordering matters: nfs_lockdmsg_find_by_xid() gives up as soon as
 *	it meets an entry whose xid is larger than the one it is looking for.
 *
 *	msgreq	request to queue; its lmr_msg.lm_xid must already be set.
 */
static inline void
nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
{
	LOCKD_MSG_REQUEST *mr;

	/* fast path: empty queue, or the new xid is larger than the last one */
	mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
	if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
		TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
		return;
	}

	/*
	 * Slow path: walk backwards past every entry whose xid is larger
	 * than the new one.  (The comparison must be '<' here; repeating the
	 * fast-path '>' test would never advance and would always append at
	 * the tail, breaking the sort order.)
	 */
	while (mr && (msgreq->lmr_msg.lm_xid < mr->lmr_msg.lm_xid)) {
		mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
	}

	if (mr) {
		/* mr is the last entry whose xid is <= the new xid */
		TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
	} else {
		/* every queued xid is larger: the new request becomes the head */
		TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
	}
}
/*
 * nfs_lockdmsg_dequeue --
 *	Unlink a lock request from the pending lock request queue.
 */
static inline void
nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
{
	LOCKD_MSG_QUEUE *pendq = &nfs_pendlockq;

	TAILQ_REMOVE(pendq, msgreq, lmr_next);
}
/*
 * nfs_lockdmsg_find_by_xid --
 *	Look up a pending lock request by transaction ID.
 *
 *	The scan runs from the head of the queue and stops at the first
 *	entry whose xid exceeds the one wanted, so it relies on the queue
 *	being kept in ascending xid order.
 *
 *	Returns the matching request, or NULL if there is none.
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_xid(uint64_t lockxid)
{
	LOCKD_MSG_REQUEST *req;

	for (req = TAILQ_FIRST(&nfs_pendlockq); req != NULL;
	    req = TAILQ_NEXT(req, lmr_next)) {
		if (req->lmr_msg.lm_xid == lockxid) {
			return req;
		}
		if (req->lmr_msg.lm_xid > lockxid) {
			/* passed the spot where it would have been */
			break;
		}
	}
	return NULL;
}
/*
 * nfs_lockdmsg_compare_to_answer --
 *	Check whether the lock described in an answer is the lock asked for
 *	by a pending request (same pid, start, length and file handle).
 *
 *	Returns 0 when they match, 1 when they differ or when the answer
 *	carries no lock information (LOCKD_ANS_LOCK_INFO not set).
 */
static inline int
nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
{
	LOCKD_MSG *reqmsg = &msgreq->lmr_msg;

	/* nothing to compare against without lock info */
	if ((ansp->la_flags & LOCKD_ANS_LOCK_INFO) == 0) {
		return 1;
	}

	/* cheap scalar comparisons first */
	if (reqmsg->lm_fl.l_pid != ansp->la_pid ||
	    reqmsg->lm_fl.l_start != ansp->la_start ||
	    reqmsg->lm_fl.l_len != ansp->la_len ||
	    reqmsg->lm_fh_len != ansp->la_fh_len) {
		return 1;
	}

	/* lengths are equal here, so compare the file handle bytes */
	return (bcmp(reqmsg->lm_fh, ansp->la_fh, ansp->la_fh_len) != 0);
}
/*
 * nfs_lockdmsg_find_by_answer --
 *	Find the first pending request whose lock matches the lock info in
 *	the given answer (see nfs_lockdmsg_compare_to_answer()).
 *
 *	Returns the request, or NULL if the answer has no lock info or no
 *	pending request matches.
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
{
	LOCKD_MSG_REQUEST *req;

	if ((ansp->la_flags & LOCKD_ANS_LOCK_INFO) == 0) {
		return NULL;
	}

	for (req = TAILQ_FIRST(&nfs_pendlockq); req != NULL;
	    req = TAILQ_NEXT(req, lmr_next)) {
		if (nfs_lockdmsg_compare_to_answer(req, ansp) == 0) {
			return req;
		}
	}
	return NULL;
}
static inline uint64_t
nfs_lockxid_get(void)
{
LOCKD_MSG_REQUEST *mr;
if (!nfs_lockxid) {
struct timeval tv;
microtime(&tv);
nfs_lockxid = (uint64_t)tv.tv_sec << 12;
}
do {
if (++nfs_lockxid == 0)
nfs_lockxid++;
if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
(mr->lmr_msg.lm_xid < nfs_lockxid)) {
break;
}
} while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
return nfs_lockxid;
}
/*
 * nfs_lock_pid_check --
 *	Look up (and optionally add) the calling process in the table of
 *	processes tracked in nfs_lock_pid_hash_tbl.
 *
 *	p	process to look up
 *	addflag	if nonzero, add an entry for p when none is found
 *	vp	vnode whose mount is handed to nfs_sigintr() while waiting
 *		for the table lock
 *
 *	Returns 0 if an entry for this exact process (same pid and same
 *	start time) exists or was added, ENOENT if there is none and
 *	addflag is zero, or the error from nfs_sigintr() if the wait for the
 *	table lock was interrupted.
 */
static int
nfs_lock_pid_check(struct proc *p, int addflag, struct vnode *vp)
{
	struct nfs_lock_pid *lp, *lplru, *lplru_next;
	struct proc *plru;
	int error = 0;
	struct timeval now;

loop:
	/* wait for the table lock; -1 tells the holder there is a waiter */
	if (nfs_lock_pid_lock) {
		while (nfs_lock_pid_lock) {
			nfs_lock_pid_lock = -1;
			tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
			if ((error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)))
				return (error);
		}
		goto loop;
	}
	nfs_lock_pid_lock = 1;

	/* search the hash chain for an entry with our pid */
	error = ENOENT;
	lp = NFS_LOCK_PID_HASH(p->p_pid)->lh_first;
	for (; lp != NULL; lp = lp->lp_hash.le_next)
		if (lp->lp_pid == p->p_pid) {
			if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
				/* same process: refresh and move to the LRU tail */
				TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
				microuptime(&now);
				lp->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
				error = 0;
				break;
			}
			/*
			 * Same pid but a different process (start times differ):
			 * unhash the stale entry, mark it invalid and park it at
			 * the LRU head so it is the first one to be recycled.
			 */
			LIST_REMOVE(lp, lp_hash);
			lp->lp_valid = 0;
			TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
			TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
			lp = NULL;
			break;
		}

	/* at this point lp is NULL unless a live match was found */
	if ((error == ENOENT) && addflag) {
		/* scan the head of the LRU list for entries that can be reclaimed */
		int lrucnt = 0;
		microuptime(&now);
		for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
			lplru_next = TAILQ_NEXT(lplru, lp_lru);
			if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
				/*
				 * Valid and used/checked within the last two
				 * seconds; stop scanning here.
				 */
				break;
			}
			/* take it off the LRU and see whether it is still in use */
			TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
			if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
			    timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
				/*
				 * No longer in use.  An invalid entry was already
				 * taken off its hash chain when it was marked
				 * invalid, so only unhash entries that are still
				 * valid; unlinking twice would write through
				 * stale list pointers.
				 */
				if (lplru->lp_valid)
					LIST_REMOVE(lplru, lp_hash);
				if (!lp) {
					/* reuse the first reclaimed entry */
					lp = lplru;
				} else {
					FREE(lplru, M_TEMP);
				}
			} else {
				/* process still exists: refresh and requeue at the tail */
				lplru->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
			}
			/* bound the amount of work done per call */
			if (++lrucnt > 8)
				break;
		}
		if (!lp) {
			/* nothing to recycle: allocate a fresh entry */
			MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
			    M_TEMP, M_WAITOK | M_ZERO);
		}
		/* (re)initialize the entry and insert it into hash and LRU */
		lp->lp_pid = p->p_pid;
		lp->lp_pid_start = p->p_stats->p_start;
		LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
		lp->lp_valid = 1;
		lp->lp_time = now.tv_sec;
		TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
		error = 0;
	}

	/* release the table lock, waking any waiters */
	if (nfs_lock_pid_lock < 0) {
		nfs_lock_pid_lock = 0;
		wakeup(&nfs_lock_pid_lock);
	} else
		nfs_lock_pid_lock = 0;

	return (error);
}
/*
 * nfs_dolock --
 *	Carry out an advisory lock operation on an NFS vnode by writing a
 *	request message to the lock daemon's file (registered through
 *	nfslockdfd()) and sleeping until nfslockdans() marks the request
 *	answered.
 *
 *	ap	advlock arguments; a_vp, a_fl, a_op and a_flags are used.
 *		For a successful F_GETLK the result is written back to *a_fl.
 *
 *	Returns 0 on success, otherwise an errno value: ENXIO if the mount
 *	is gone, EOPNOTSUPP if locking is disabled or no lock daemon is
 *	available, EINVAL for a bad lock range, EIO when a P_NOREMOTEHANG
 *	process would hang on a timed-out mount, an error from the sleep or
 *	the write to the daemon, or the errno reported by the daemon.
 */
int
nfs_dolock(struct vop_advlock_args *ap)
{
	LOCKD_MSG_REQUEST msgreq;
	LOCKD_MSG *msg;
	struct vnode *vp, *wvp;
	struct nfsnode *np;
	int error, error1;
	struct flock *fl;
	int fmode, ioflg;
	struct proc *p;
	struct nfsmount *nmp;
	struct vattr vattr;
	off_t start, end;
	struct timeval now;
	int timeo, endtime, lastmsg, wentdown = 0;
	int lockpidcheck;

	p = current_proc();

	vp = ap->a_vp;
	fl = ap->a_fl;
	np = VTONFS(vp);

	nmp = VFSTONFS(vp->v_mount);
	if (!nmp)
		return (ENXIO);
	if (nmp->nm_flag & NFSMNT_NOLOCKS)
		return (EOPNOTSUPP);

	/*
	 * Sanity check the range for the whence values that do not depend on
	 * the file size: reject unknown whence, negative start, and ranges
	 * that overflow OFF_MAX or reach below offset 0.
	 */
	if (fl->l_whence != SEEK_END) {
		if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
		    fl->l_start < 0 ||
		    (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
		    (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
			return (EINVAL);
	}

	/*
	 * If no daemon file is registered yet, poke a process sleeping in
	 * nfslockdwait() (if any) and give it up to 60 seconds to register
	 * one through nfslockdfd().
	 */
	if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
		if (!nfslockdwaiting)
			return (EOPNOTSUPP);
		if (!nfslockdfp && (fl->l_type == F_UNLCK))
			return (EINVAL);
		(void)wakeup((void *)&nfslockdwaiting);
		tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
		if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
			return (EOPNOTSUPP);
	}

	/* hold a reference on the daemon's vnode; every exit below must drop it */
	VREF(wvp);

	/*
	 * Check whether this process is already in the pid table.  If it is
	 * not (ENOENT) and this is an unlock request, there is nothing to
	 * send and we simply report success.
	 */
	lockpidcheck = nfs_lock_pid_check(p, 0, vp);
	if (lockpidcheck) {
		if (lockpidcheck != ENOENT) {
			/* drop the reference taken above before bailing out */
			vrele(wvp);
			return (lockpidcheck);
		}
		if (ap->a_op == F_UNLCK) {
			vrele(wvp);
			return (0);
		}
	}

	/* compute the absolute start offset of the lock */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/* SEEK_CUR is handled the same way as SEEK_SET here */
		start = fl->l_start;
		break;
	case SEEK_END:
		/*
		 * Need the current file size: flush modified data and fetch
		 * attributes.  NOTE(review): clearing n_xid presumably forces
		 * the attributes to be refreshed -- confirm.
		 */
		if (np->n_flag & NMODIFIED) {
			np->n_xid = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
			if (error) {
				vrele(wvp);
				return (error);
			}
		}
		np->n_xid = 0;
		error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
		if (error) {
			vrele(wvp);
			return (error);
		}
		start = np->n_size + fl->l_start;
		break;
	default:
		vrele(wvp);
		return (EINVAL);
	}

	/* compute the end offset; -1 stands for "to end of file" */
	if (fl->l_len == 0)
		end = -1;
	else if (fl->l_len > 0)
		end = start + fl->l_len - 1;
	else {
		/* negative length: the range lies before start */
		end = start - 1;
		start += fl->l_len;
	}
	if (start < 0) {
		vrele(wvp);
		return (EINVAL);
	}
	/* non-v3 mounts are limited to offsets below 0x80000000 */
	if (!NFS_ISV3(vp) &&
	    ((start >= 0x80000000) || (end >= 0x80000000))) {
		vrele(wvp);
		return (EINVAL);
	}

	/*
	 * Fill in the request message for the daemon.
	 */
	msgreq.lmr_answered = 0;
	msgreq.lmr_errno = 0;
	msgreq.lmr_saved_errno = 0;
	msg = &msgreq.lmr_msg;
	msg->lm_version = LOCKD_MSG_VERSION;
	msg->lm_flags = 0;

	/* start from the caller's flock, then substitute the absolute range */
	msg->lm_fl = *fl;
	msg->lm_fl.l_start = start;
	if (end != -1)
		msg->lm_fl.l_len = end - start + 1;
	msg->lm_fl.l_pid = p->p_pid;

	if (ap->a_flags & F_WAIT)
		msg->lm_flags |= LOCKD_MSG_BLOCK;
	if (ap->a_op == F_GETLK)
		msg->lm_flags |= LOCKD_MSG_TEST;

	/* re-fetch the mount: calls above may have slept */
	nmp = VFSTONFS(vp->v_mount);
	if (!nmp) {
		vrele(wvp);
		return (ENXIO);
	}

	/* server address, file handle and credentials */
	bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg->lm_addr,
	    min(sizeof msg->lm_addr,
	    mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
	msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
	bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
	if (NFS_ISV3(vp))
		msg->lm_flags |= LOCKD_MSG_NFSV3;
	cru2x(p->p_ucred, &msg->lm_cred);

	/* back-date lastmsg so the first "not responding" report honors the initial delay */
	microuptime(&now);
	lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));

	/* open the daemon's file for writing */
	fmode = FFLAGS(O_WRONLY);
	if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
		vrele(wvp);
		return (error);
	}
	++wvp->v_writecount;

	/* assign a transaction ID and put the request on the pending queue */
	msg->lm_xid = nfs_lockxid_get();
	nfs_lockdmsg_enqueue(&msgreq);

	timeo = 2*hz;
#define IO_NOMACCHECK 0
	ioflg = IO_UNIT | IO_NOMACCHECK;
	for (;;) {
		VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);

		/* serialize writers of the daemon file */
		error = 0;
		while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
			nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
			error = tsleep((void *)&nfslockdfifolock,
			    PCATCH | PUSER, "lockdfifo", 20*hz);
			if (error)
				break;
		}
		if (error)
			break;
		nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;

		/* send the request */
		error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
		    UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);

		nfslockdfifowritten = 1;

		/* release the writer lock and wake anyone waiting for it */
		nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
		if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
			nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
			wakeup((void *)&nfslockdfifolock);
		}
		/* wake a process sleeping in nfslockdwait(), if any */
		if (nfslockdwaiting)
			(void)wakeup((void *)&nfslockdwaiting);

		/* EAGAIN is tolerated only once we have switched to non-blocking writes */
		if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
			break;
		}

		/*
		 * Wait for the answer, in slices of at most two seconds, until
		 * the current timeout (timeo) has elapsed.
		 */
wait_for_granted:
		error = EWOULDBLOCK;
		microuptime(&now);
		if ((timeo/hz) > 0)
			endtime = now.tv_sec + timeo/hz;
		else
			endtime = now.tv_sec + 1;
		while (now.tv_sec < endtime) {
			error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
			if (msgreq.lmr_answered) {
				/* nfslockdans() posted an answer */
				error = 0;
				break;
			}
			if (error != EWOULDBLOCK)
				break;
			/* make sure the mount is still there and still does locks */
			nmp = VFSTONFS(vp->v_mount);
			if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
				break;
			/* P_NOREMOTEHANG processes give up on a timed-out mount */
			if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) &&
			    ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
				if (fl->l_type == F_UNLCK)
					printf("nfs_dolock: aborting unlock request "
					    "due to timeout (noremotehang)\n");
				error = EIO;
				break;
			}
			microuptime(&now);
		}

		if (error) {
			/* no answer: either timed out or interrupted */
			nmp = VFSTONFS(vp->v_mount);
			if (!nmp) {
				if (error == EWOULDBLOCK)
					error = ENXIO;
				break;
			}
			if (nmp->nm_flag & NFSMNT_NOLOCKS) {
				if (error == EWOULDBLOCK)
					error = EOPNOTSUPP;
				break;
			}
			/* locks have worked on this mount before: treat as a timeout */
			if ((error == EOPNOTSUPP) &&
			    (nmp->nm_state & NFSSTA_LOCKSWORK)) {
				error = EWOULDBLOCK;
			}
			if (error != EWOULDBLOCK) {
				/*
				 * Interrupted or failed.  If the daemon reported
				 * the request as in progress, send a cancel under
				 * a new xid and remember the real error.
				 */
				if ((msgreq.lmr_errno == EINPROGRESS) &&
				    !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
					msg->lm_flags |= LOCKD_MSG_CANCEL;
					nfs_lockdmsg_dequeue(&msgreq);
					msg->lm_xid = nfs_lockxid_get();
					nfs_lockdmsg_enqueue(&msgreq);
					msgreq.lmr_saved_errno = error;
					msgreq.lmr_errno = 0;
					msgreq.lmr_answered = 0;
					timeo = 2*hz;
					continue;
				}
				break;
			}

			/* timed out */
			if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) &&
			    ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
				if (fl->l_type == F_UNLCK)
					printf("nfs_dolock: aborting unlock request "
					    "due to timeout (noremotehang)\n");
				error = EIO;
				break;
			}
			/* report an unresponsive daemon, rate limited by nm_tprintf_delay */
			microuptime(&now);
			if ((msgreq.lmr_errno != EINPROGRESS) &&
			    (nmp->nm_tprintf_initial_delay != 0) &&
			    ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
				lastmsg = now.tv_sec;
				nfs_down(NULL, nmp, p, "lockd not responding",
				    0, NFSSTA_LOCKTIMEO);
				wentdown = 1;
			}
			if (msgreq.lmr_errno == EINPROGRESS) {
				/*
				 * The in-progress request timed out: cancel it
				 * under a new xid.  saved_errno == EINPROGRESS
				 * makes the request get resent once the cancel
				 * is answered (see below).
				 */
				msg->lm_flags |= LOCKD_MSG_CANCEL;
				nfs_lockdmsg_dequeue(&msgreq);
				msg->lm_xid = nfs_lockxid_get();
				nfs_lockdmsg_enqueue(&msgreq);
				msgreq.lmr_saved_errno = msgreq.lmr_errno;
				msgreq.lmr_errno = 0;
				msgreq.lmr_answered = 0;
				timeo = 2*hz;
				continue;
			}
			/* resend without blocking on the write, backing off up to 60 seconds */
			ioflg |= IO_NDELAY;
			timeo *= 2;
			if (timeo > 60*hz)
				timeo = 60*hz;
			continue;
		}

		/* we have an answer */
		if (wentdown) {
			nfs_up(NULL, VFSTONFS(vp->v_mount), p, "lockd alive again",
			    NFSSTA_LOCKTIMEO);
			wentdown = 0;
		}

		if (msgreq.lmr_errno == EINPROGRESS) {
			/* request is in progress: wait (longer) for the final answer */
			timeo = 60*hz;
			msgreq.lmr_answered = 0;
			goto wait_for_granted;
		}

		if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
		    (msgreq.lmr_saved_errno == EINPROGRESS)) {
			/* the cancel of a timed-out request was answered: resend the request */
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			nfs_lockdmsg_dequeue(&msgreq);
			msg->lm_xid = nfs_lockxid_get();
			nfs_lockdmsg_enqueue(&msgreq);
			msgreq.lmr_saved_errno = 0;
			msgreq.lmr_errno = 0;
			msgreq.lmr_answered = 0;
			timeo = 2*hz;
			continue;
		}

		/* successful test request: copy the reported lock back to the caller */
		if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
			if (msg->lm_fl.l_type != F_UNLCK) {
				fl->l_type = msg->lm_fl.l_type;
				fl->l_pid = msg->lm_fl.l_pid;
				fl->l_start = msg->lm_fl.l_start;
				fl->l_len = msg->lm_fl.l_len;
				fl->l_whence = SEEK_SET;
			} else {
				fl->l_type = F_UNLCK;
			}
		}

		/* for an answered cancel, report the error that triggered it */
		if (msg->lm_flags & LOCKD_MSG_CANCEL) {
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			error = msgreq.lmr_saved_errno;
		} else
			error = msgreq.lmr_errno;

		if (!error) {
			/* remember that locking works on this mount */
			nmp = VFSTONFS(vp->v_mount);
			if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
				nmp->nm_state |= NFSSTA_LOCKSWORK;
			/* first successful lock by this process: add it to the pid table */
			if ((lockpidcheck == ENOENT) &&
			    ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
				nfs_lock_pid_check(p, 1, vp);
		}
		break;
	}

	/* done with the request: unqueue it and close the daemon's file */
	nfs_lockdmsg_dequeue(&msgreq);

	error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
	/* a lock error takes precedence over a close error */
	return (error != 0 ? error : error1);
}
int
nfslockdans(struct proc *p, struct lockd_ans *ansp)
{
LOCKD_MSG_REQUEST *msgreq;
int error;
if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
p->p_cred->p_svuid != 0)
return (error);
if (ansp->la_version != LOCKD_ANS_VERSION)
return (EINVAL);
msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
if (ansp->la_flags & LOCKD_ANS_GRANTED) {
if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
msgreq = nfs_lockdmsg_find_by_answer(ansp);
if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
msgreq = NULL;
}
if (!msgreq)
return (EPIPE);
msgreq->lmr_errno = ansp->la_errno;
if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
else
msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
} else {
msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
}
}
msgreq->lmr_answered = 1;
(void)wakeup((void *)msgreq);
return (0);
}
int
nfslockdfd(struct proc *p, int fd)
{
int error;
struct file *fp, *ofp;
error = suser(p->p_ucred, &p->p_acflag);
if (error)
return (error);
if (fd < 0) {
fp = 0;
} else {
error = getvnode(p, fd, &fp);
if (error)
return (error);
(void)fref(fp);
}
ofp = nfslockdfp;
nfslockdfp = fp;
if (ofp)
(void)frele(ofp);
nfslockdpid = nfslockdfp ? p->p_pid : 0;
(void)wakeup((void *)&nfslockdfp);
return (0);
}
/*
 * nfslockdwait --
 *	Put the caller to sleep until there is lock work to do: either a
 *	request has been written to the daemon's file, or nfs_dolock() wants
 *	a daemon file to be registered.
 *
 *	p	calling process; must be the registered daemon (nfslockdpid)
 *		or pass suser()
 *
 *	Returns 0 once woken (or immediately if a request was written since
 *	the last call), EBUSY if another process is already waiting, or the
 *	error from suser().  The result of the sleep itself is not reported.
 */
int
nfslockdwait(struct proc *p)
{
	int error;

	/* anyone other than the registered daemon needs privilege */
	if (p->p_pid != nfslockdpid) {
		error = suser(p->p_ucred, &p->p_acflag);
		if (error)
			return (error);
	}

	/* only one waiter at a time */
	if (nfslockdwaiting)
		return (EBUSY);

	/* a request was written since the last call: no need to sleep */
	if (nfslockdfifowritten) {
		nfslockdfifowritten = 0;
		return (0);
	}

	nfslockdwaiting = 1;
	tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
	nfslockdwaiting = 0;

	return (0);
}