diff --git a/lib/libc/sys/kqueue.2 b/lib/libc/sys/kqueue.2 index 1ad5ce1..6327eb0 100644 --- a/lib/libc/sys/kqueue.2 +++ b/lib/libc/sys/kqueue.2 @@ -278,6 +278,13 @@ For sockets, the low water mark and socket error handling is identical to the .Dv EVFILT_READ case. +.It Dv EVFILT_EXCEPT +Takes a descriptor as the identifier, and returns whenever one of the +specified exceptional conditions has occurred on the descriptor. Conditions +are specified in +.Va fflags . +Currently, a filter can monitor the reception of out-of-band data with +.Dv NOTE_OOB . .It Dv EVFILT_AIO The sigevent portion of the AIO request is filled in, with .Va sigev_notify_kqueue diff --git a/sys/emulation/linux/linux_epoll.c b/sys/emulation/linux/linux_epoll.c index 9e8cd74..92516bb 100644 --- a/sys/emulation/linux/linux_epoll.c +++ b/sys/emulation/linux/linux_epoll.c @@ -122,7 +122,7 @@ linux_kevent_to_epoll(struct kevent *kevent, struct linux_epoll_event *event) * of the filter. */ static int -linux_kev_copyout(void *arg, struct kevent *kevp, int count) +linux_kev_copyout(void *arg, struct kevent *kevp, int count, int *res) { struct kevent_args *uap; struct linux_epoll_event *eep; @@ -137,11 +137,13 @@ linux_kev_copyout(void *arg, struct kevent *kevp, int count) } error = copyout(eep, uap->eventlist, count * sizeof(*eep)); - if (error) + if (error == 0) { uap->eventlist = (struct kevent *)((char *)uap->eventlist + count * sizeof(*eep)); + *res += count; + } kfree(eep, M_TEMP); - return (0); + return (error); } /* @@ -149,15 +151,16 @@ linux_kev_copyout(void *arg, struct kevent *kevp, int count) * converted filters to the kevent internal memory. */ static int -linux_kev_copyin(void *arg, struct kevent *kevp, int count) +linux_kev_copyin(void *arg, struct kevent *kevp, int maxevents, int *events) { struct kevent_args *uap; uap = (struct kevent_args*) arg; - memcpy(kevp, uap->changelist, count * sizeof(*kevp)); + memcpy(kevp, uap->changelist, maxevents * sizeof(*kevp)); - uap->changelist += count; + uap->changelist += maxevents; + *events = maxevents; return (0); } @@ -204,8 +207,8 @@ sys_linux_epoll_ctl(struct linux_epoll_ctl_args *args) } linux_epoll_to_kevent(args->fd, &le, &kev); - error = kern_kevent(args->epfd, 1, 0, &k_args, linux_kev_copyin, - linux_kev_copyout, NULL); + error = kern_kevent(args->epfd, 0, &k_args.sysmsg_result, &k_args, + linux_kev_copyin, linux_kev_copyout, NULL); /* Check if there was an error during registration. */ if (error == 0 && k_args.sysmsg_result != 0) { /* The copyout callback stored the error there. */ @@ -224,9 +227,9 @@ sys_linux_epoll_wait(struct linux_epoll_wait_args *args) struct kevent_args k_args; int error; - /* Convert from miliseconds to timespec. */ - ts.tv_sec = args->timeout / 1000000; - ts.tv_nsec = (args->timeout % 1000000) * 1000; + /* Convert from milliseconds to timespec. */ + ts.tv_sec = args->timeout / 1000; + ts.tv_nsec = (args->timeout % 1000) * 1000 * 1000; k_args.fd = args->epfd; k_args.changelist = NULL; @@ -240,8 +243,8 @@ sys_linux_epoll_wait(struct linux_epoll_wait_args *args) k_args.nevents = args->maxevents; k_args.timeout = &ts; - error = kern_kevent(args->epfd, 0, args->maxevents, &k_args, - linux_kev_copyin, linux_kev_copyout, &ts); + error = kern_kevent(args->epfd, args->maxevents, &args->sysmsg_result, + &k_args, linux_kev_copyin, linux_kev_copyout, &ts); /* translation? */ return (error); diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 72eaecf..90d3b14 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -59,8 +60,14 @@ MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); +struct kevent_copyin_args { + struct kevent_args *ka; + int pchanges; +}; + +static int kqueue_sleep(struct kqueue *kq, struct timespec *tsp); static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, - struct timespec *tsp, int *errorp); + struct knote *marker); static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags); static int kqueue_write(struct file *fp, struct uio *uio, @@ -72,7 +79,6 @@ static int kqueue_kqfilter(struct file *fp, struct knote *kn); static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred); static int kqueue_close(struct file *fp); -static void kqueue_wakeup(struct kqueue *kq); /* * MPSAFE @@ -145,6 +151,7 @@ static struct filterops *sysfilt_ops[] = { &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ + &file_filtops, /* EVFILT_EXCEPT */ }; static int @@ -462,33 +469,44 @@ sys_kqueue(struct kqueue_args *uap) * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int -kevent_copyout(void *arg, struct kevent *kevp, int count) +kevent_copyout(void *arg, struct kevent *kevp, int count, int *res) { - struct kevent_args *uap; + struct kevent_copyin_args *kap; int error; - uap = (struct kevent_args *)arg; + kap = (struct kevent_copyin_args *)arg; + + error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp)); + if (error == 0) { + kap->ka->eventlist += count; + *res += count; + } else { + *res = -1; + } - error = copyout(kevp, uap->eventlist, count * sizeof *kevp); - if (error == 0) - uap->eventlist += count; return (error); } /* - * Copy 'count' items from the list pointed to by uap->changelist. + * Copy at most 'max' items from the list pointed to by kap->changelist, + * return number of items in 'events'. */ static int -kevent_copyin(void *arg, struct kevent *kevp, int count) +kevent_copyin(void *arg, struct kevent *kevp, int max, int *events) { - struct kevent_args *uap; - int error; + struct kevent_copyin_args *kap; + int error, count; - uap = (struct kevent_args *)arg; + kap = (struct kevent_copyin_args *)arg; + + count = min(kap->ka->nchanges - kap->pchanges, max); + error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp); + if (error == 0) { + kap->ka->changelist += count; + kap->pchanges += count; + *events = count; + } - error = copyin(uap->changelist, kevp, count * sizeof *kevp); - if (error == 0) - uap->changelist += count; return (error); } @@ -496,39 +514,27 @@ kevent_copyin(void *arg, struct kevent *kevp, int count) * MPALMOSTSAFE */ int -kern_kevent(int fd, int nchanges, int nevents, struct kevent_args *uap, - k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn, - struct timespec *tsp_in) +kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, + k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn, + struct timespec *tsp_in) { - struct thread *td = curthread; - struct proc *p = td->td_proc; struct kevent *kevp; - struct kqueue *kq; - struct file *fp = NULL; - struct timespec ts; struct timespec *tsp; - int i, n, total, nerrors, error; + int i, n, total, error, nerrors = 0; struct kevent kev[KQ_NEVENTS]; + struct knote marker; tsp = tsp_in; - - fp = holdfp(p->p_fd, fd, -1); - if (fp == NULL) - return (EBADF); - if (fp->f_type != DTYPE_KQUEUE) { - fdrop(fp); - return (EBADF); - } - - kq = (struct kqueue *)fp->f_data; - nerrors = 0; + *res = 0; get_mplock(); - while (nchanges > 0) { - n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; - error = kevent_copyinfn(uap, kev, n); + for ( ;; ) { + n = 0; + error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n); if (error) goto done; + if (n == 0) + break; for (i = 0; i < n; i++) { kevp = &kev[i]; kevp->flags &= ~EV_SYSFLAGS; @@ -537,7 +543,7 @@ kern_kevent(int fd, int nchanges, int nevents, struct kevent_args *uap, if (nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; - kevent_copyoutfn(uap, kevp, 1); + kevent_copyoutfn(uap, kevp, 1, res); nevents--; nerrors++; } else { @@ -545,10 +551,8 @@ kern_kevent(int fd, int nchanges, int nevents, struct kevent_args *uap, } } } - nchanges -= n; } if (nerrors) { - uap->sysmsg_result = nerrors; error = 0; goto done; } @@ -568,30 +572,71 @@ kern_kevent(int fd, int nchanges, int nevents, struct kevent_args *uap, /* * Loop as required. * - * Collect as many events as we can. The timeout on successive - * loops is disabled (kqueue_scan() becomes non-blocking). + * Collect as many events as we can. Sleeping on successive + * loops is disabled if copyoutfn has incremented (*res). + * + * The loop stops if an error occurs, all events have been + * scanned (the marker has been reached), or fewer than the + * maximum number of events is found. + * + * The copyoutfn function does not have to increment (*res) in + * order for the loop to continue. + * + * NOTE: doselect() usually passes 0x7FFFFFFF for nevents. */ total = 0; error = 0; + marker.kn_filter = EVFILT_MARKER; + crit_enter(); + TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); + crit_exit(); while ((n = nevents - total) > 0) { if (n > KQ_NEVENTS) n = KQ_NEVENTS; - i = kqueue_scan(kq, kev, n, tsp, &error); - if (i == 0) - break; - error = kevent_copyoutfn(uap, kev, i); - total += i; - if (error || i != n) + + if (kq->kq_count == 0 && *res == 0) { + error = kqueue_sleep(kq, tsp); + + if (error) + break; + + /* + * Move the marker to the end of the list + * after a sleep. + */ + crit_enter(); + TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); + TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); + crit_exit(); + } + + i = kqueue_scan(kq, kev, n, &marker); + if (i) { + error = kevent_copyoutfn(uap, kev, i, res); + total += i; + if (error) + break; + } + + /* + * Normally when fewer events are returned than requested + * we can stop. However, if only spurious events were + * collected the copyout will not bump (*res) and we have + * to continue. + */ + if (i < n && *res) break; - tsp = &ts; /* successive loops non-blocking */ - tsp->tv_sec = 0; - tsp->tv_nsec = 0; } - uap->sysmsg_result = total; + crit_enter(); + TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); + crit_exit(); + + /* Timeouts do not return EWOULDBLOCK. */ + if (error == EWOULDBLOCK) + error = 0; + done: rel_mplock(); - if (fp != NULL) - fdrop(fp); return (error); } @@ -601,7 +646,12 @@ done: int sys_kevent(struct kevent_args *uap) { + struct thread *td = curthread; + struct proc *p = td->td_proc; struct timespec ts, *tsp; + struct kqueue *kq; + struct file *fp = NULL; + struct kevent_copyin_args *kap, ka; int error; if (uap->timeout) { @@ -613,8 +663,24 @@ sys_kevent(struct kevent_args *uap) tsp = NULL; } - error = kern_kevent(uap->fd, uap->nchanges, uap->nevents, - uap, kevent_copyin, kevent_copyout, tsp); + fp = holdfp(p->p_fd, uap->fd, -1); + if (fp == NULL) + return (EBADF); + if (fp->f_type != DTYPE_KQUEUE) { + fdrop(fp); + return (EBADF); + } + + kq = (struct kqueue *)fp->f_data; + + kap = &ka; + kap->ka = uap; + kap->pchanges = 0; + + error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap, + kevent_copyin, kevent_copyout, tsp); + + fdrop(fp); return (error); } @@ -749,63 +815,71 @@ done: } /* - * Scan the kqueue, blocking if necessary until the target time is reached. + * Block as necessary until the target time is reached. * If tsp is NULL we block indefinitely. If tsp->ts_secs/nsecs are both * 0 we do not block at all. */ static int -kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, - struct timespec *tsp, int *errorp) +kqueue_sleep(struct kqueue *kq, struct timespec *tsp) { - struct knote *kn, marker; - int total; + int error = 0; - total = 0; -again: crit_enter(); - if (kq->kq_count == 0) { - if (tsp == NULL) { - kq->kq_state |= KQ_SLEEP; - *errorp = tsleep(kq, PCATCH, "kqread", 0); - } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { - *errorp = EWOULDBLOCK; - } else { - struct timespec ats; - struct timespec atx = *tsp; - int timeout; + if (tsp == NULL) { + kq->kq_state |= KQ_SLEEP; + error = tsleep(kq, PCATCH, "kqread", 0); + } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { + error = EWOULDBLOCK; + } else { + struct timespec ats; + struct timespec atx = *tsp; + int timeout; - nanouptime(&ats); - timespecsub(&atx, &ats); - if (ats.tv_sec < 0) { - *errorp = EWOULDBLOCK; - } else { - timeout = atx.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tstohz_high(&atx); - kq->kq_state |= KQ_SLEEP; - *errorp = tsleep(kq, PCATCH, "kqread", timeout); - } + nanouptime(&ats); + timespecsub(&atx, &ats); + if (ats.tv_sec < 0) { + error = EWOULDBLOCK; + } else { + timeout = atx.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tstohz_high(&atx); + kq->kq_state |= KQ_SLEEP; + error = tsleep(kq, PCATCH, "kqread", timeout); } - crit_exit(); - if (*errorp == 0) - goto again; - /* don't restart after signals... */ - if (*errorp == ERESTART) - *errorp = EINTR; - else if (*errorp == EWOULDBLOCK) - *errorp = 0; - goto done; } + crit_exit(); + + /* don't restart after signals... */ + if (error == ERESTART) + return (EINTR); + + return (error); +} + +/* + * Scan the kqueue, return the number of active events placed in kevp up + * to count. + * + * Continuous mode events may get recycled, do not continue scanning past + * marker unless no events have been collected. + */ +static int +kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, + struct knote *marker) +{ + struct knote *kn; + int total; + + total = 0; + crit_enter(); /* - * Collect events. Continuous mode events may get recycled - * past the marker so we stop when we hit it unless no events - * have been collected. + * Collect events. */ - TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); while (count) { kn = TAILQ_FIRST(&kq->kq_knpend); - if (kn == &marker) - break; + if (kn->kn_filter == EVFILT_MARKER) + goto done; + TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe); if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; @@ -841,11 +915,9 @@ again: TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe); } } - TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); - crit_exit(); - if (total == 0) - goto again; + done: + crit_exit(); return (total); } @@ -961,7 +1033,7 @@ kqueue_close(struct file *fp) return (0); } -static void +void kqueue_wakeup(struct kqueue *kq) { if (kq->kq_state & KQ_SLEEP) { diff --git a/sys/kern/kern_memio.c b/sys/kern/kern_memio.c index 47e12fc..f50a7a7 100644 --- a/sys/kern/kern_memio.c +++ b/sys/kern/kern_memio.c @@ -75,16 +75,18 @@ static d_write_t mmwrite; static d_ioctl_t mmioctl; static d_mmap_t memmmap; static d_poll_t mmpoll; +static d_kqfilter_t mmkqfilter; #define CDEV_MAJOR 2 static struct dev_ops mem_ops = { - { "mem", CDEV_MAJOR, D_MEM | D_MPSAFE_READ | D_MPSAFE_WRITE }, + { "mem", CDEV_MAJOR, D_MEM | D_MPSAFE_READ | D_MPSAFE_WRITE | D_KQFILTER }, .d_open = mmopen, .d_close = mmclose, .d_read = mmread, .d_write = mmwrite, .d_ioctl = mmioctl, .d_poll = mmpoll, + .d_kqfilter = mmkqfilter, .d_mmap = memmmap, }; @@ -546,6 +548,47 @@ mmpoll(struct dev_poll_args *ap) return (0); } +static int +mm_filter_read(struct knote *kn, long hint) +{ + return (1); +} + +static void +dummy_filter_detach(struct knote *kn) {} + +static struct filterops random_read_filtops = + { 1, NULL, dummy_filter_detach, random_filter_read }; + +static struct filterops mm_read_filtops = + { 1, NULL, dummy_filter_detach, mm_filter_read }; + +int +mmkqfilter(struct dev_kqfilter_args *ap) +{ + struct knote *kn = ap->a_kn; + cdev_t dev = ap->a_head.a_dev; + + ap->a_result = 0; + switch (kn->kn_filter) { + case EVFILT_READ: + switch (minor(dev)) { + case 3: + kn->kn_fop = &random_read_filtops; + break; + default: + kn->kn_fop = &mm_read_filtops; + break; + } + break; + default: + ap->a_result = 1; + return (0); + } + + return (0); +} + int iszerodev(cdev_t dev) { diff --git a/sys/kern/kern_nrandom.c b/sys/kern/kern_nrandom.c index 60205a4..5ad67d6 100644 --- a/sys/kern/kern_nrandom.c +++ b/sys/kern/kern_nrandom.c @@ -128,6 +128,7 @@ #include #include #include +#include #include #include #include @@ -522,6 +523,15 @@ random_poll(cdev_t dev, int events) } /* + * Kqueue filter (always succeeds) + */ +int +random_filter_read(struct knote *kn, long hint) +{ + return (1); +} + +/* * Heavy weight random number generator. May return less then the * requested number of bytes. */ diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c index e6b4fa5..d5ab5b5 100644 --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -63,17 +64,21 @@ static d_close_t logclose; static d_read_t logread; static d_ioctl_t logioctl; static d_poll_t logpoll; +static d_kqfilter_t logkqfilter; static void logtimeout(void *arg); +static void logfiltdetach(struct knote *kn); +static int logfiltread(struct knote *kn, long hint); #define CDEV_MAJOR 7 static struct dev_ops log_ops = { - { "log", CDEV_MAJOR, 0 }, + { "log", CDEV_MAJOR, D_KQFILTER }, .d_open = logopen, .d_close = logclose, .d_read = logread, .d_ioctl = logioctl, .d_poll = logpoll, + .d_kqfilter = logkqfilter }; static struct logsoftc { @@ -178,6 +183,55 @@ logpoll(struct dev_poll_args *ap) return (0); } +static struct filterops logread_filtops = + { 1, NULL, logfiltdetach, logfiltread }; + +static int +logkqfilter(struct dev_kqfilter_args *ap) +{ + struct knote *kn = ap->a_kn; + struct klist *klist = &logsoftc.sc_selp.si_note; + + ap->a_result = 0; + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &logread_filtops; + break; + default: + ap->a_result = 1; + return (0); + } + + crit_enter(); + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + crit_exit(); + + return (0); +} + +static void +logfiltdetach(struct knote *kn) +{ + struct klist *klist = &logsoftc.sc_selp.si_note; + + crit_enter(); + SLIST_REMOVE(klist, kn, knote, kn_selnext); + crit_exit(); +} + +static int +logfiltread(struct knote *kn, long hint) +{ + int ret = 0; + + crit_enter(); + if (msgbufp->msg_bufr != msgbufp->msg_bufx) + ret = 1; + crit_exit(); + + return (ret); +} + static void logtimeout(void *arg) { diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 5403dec..a764f26 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -79,11 +80,36 @@ static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); +typedef struct kfd_set { + fd_mask fds_bits[2]; +} kfd_set; + +enum select_copyin_states { + COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE }; + +struct select_kevent_copyin_args { + kfd_set *read_set; + kfd_set *write_set; + kfd_set *except_set; + int active_set; /* One of select_copyin_states */ + struct lwp *lwp; /* Pointer to our lwp */ + int num_fds; /* Number of file descriptors (syscall arg) */ + int proc_fds; /* Processed fd's (wraps) */ + int error; /* Returned to userland */ +}; + +struct poll_kevent_copyin_args { + struct lwp *lwp; + struct pollfd *fds; + int nfds; + int pfds; + int error; +}; + static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, - struct timeval *tv, int *res); -static int pollscan (struct proc *, struct pollfd *, u_int, int *); -static int selscan (struct proc *, fd_mask **, fd_mask **, - int, int *); + struct timespec *ts, int *res); +static int dopoll(int nfds, struct pollfd *fds, struct timespec *ts, + int *res); static int dofileread(int, struct file *, struct uio *, int, size_t *); static int dofilewrite(int, struct file *, struct uio *, int, size_t *); @@ -760,13 +786,13 @@ SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); /* * Select system call. * - * MPALMOSTSAFE + * MPSAFE */ int sys_select(struct select_args *uap) { struct timeval ktv; - struct timeval *ktvp; + struct timespec *ktsp, kts; int error; /* @@ -776,21 +802,17 @@ sys_select(struct select_args *uap) error = copyin(uap->tv, &ktv, sizeof (ktv)); if (error) return (error); - error = itimerfix(&ktv); - if (error) - return (error); - ktvp = &ktv; + TIMEVAL_TO_TIMESPEC(&ktv, &kts); + ktsp = &kts; } else { - ktvp = NULL; + ktsp = NULL; } /* * Do real work. */ - get_mplock(); - error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, - &uap->sysmsg_result); - rel_mplock(); + error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp, + &uap->sysmsg_result); return (error); } @@ -806,28 +828,20 @@ sys_pselect(struct pselect_args *uap) { struct thread *td = curthread; struct lwp *lp = td->td_lwp; - struct timespec kts; - struct timeval ktv; - struct timeval *ktvp; + struct timespec *ktsp, kts; sigset_t sigmask; int error; /* - * Get timeout if any and convert it. - * Round up during conversion to avoid timeout going off early. + * Get timeout if any. */ if (uap->ts != NULL) { error = copyin(uap->ts, &kts, sizeof (kts)); if (error) return (error); - ktv.tv_sec = kts.tv_sec; - ktv.tv_usec = (kts.tv_nsec + 999) / 1000; - error = itimerfix(&ktv); - if (error) - return (error); - ktvp = &ktv; + ktsp = &kts; } else { - ktvp = NULL; + ktsp = NULL; } /* @@ -848,8 +862,8 @@ sys_pselect(struct pselect_args *uap) /* * Do real job. */ - error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, - &uap->sysmsg_result); + error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp, + &uap->sysmsg_result); if (uap->sigmask != NULL) { /* doselect() responsible for turning ERESTART into EINTR */ @@ -874,307 +888,414 @@ sys_pselect(struct pselect_args *uap) return (error); } +static int +select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events) +{ + struct select_kevent_copyin_args *skap = NULL; + struct kevent *kev; + int fd; + kfd_set *fdp = NULL; + short filter = 0; + u_int fflags = 0; + + skap = (struct select_kevent_copyin_args *)arg; + + if (*events == maxevents) + return (0); + + while (skap->active_set < COPYIN_DONE) { + switch (skap->active_set) { + case COPYIN_READ: + /* + * Register descriptors for the read filter + */ + fdp = skap->read_set; + filter = EVFILT_READ; + fflags = 0; + if (fdp) + break; + ++skap->active_set; + skap->proc_fds = 0; + /* fall through */ + case COPYIN_WRITE: + /* + * Register descriptors for the write filter + */ + fdp = skap->write_set; + filter = EVFILT_WRITE; + fflags = 0; + if (fdp) + break; + ++skap->active_set; + skap->proc_fds = 0; + /* fall through */ + case COPYIN_EXCEPT: + /* + * Register descriptors for the exception filter + */ + fdp = skap->except_set; + filter = EVFILT_EXCEPT; + fflags = NOTE_OOB; + if (fdp) + break; + ++skap->active_set; + skap->proc_fds = 0; + /* fall through */ + case COPYIN_DONE: + /* + * Nothing left to register + */ + return(0); + /* NOT REACHED */ + } + + while (skap->proc_fds < skap->num_fds) { + fd = skap->proc_fds; + if (FD_ISSET(fd, fdp)) { + kev = &kevp[*events]; + EV_SET(kev, fd, filter, + EV_ADD|EV_ENABLE, + fflags, 0, + (void *)skap->lwp->lwp_kqueue_serial); + FD_CLR(fd, fdp); + ++*events; + } + ++skap->proc_fds; + if (*events == maxevents) + return (0); + } + skap->active_set++; + skap->proc_fds = 0; + } + + return (0); +} + +static int +select_copyout(void *arg, struct kevent *kevp, int count, int *res) +{ + struct select_kevent_copyin_args *skap; + struct kevent kev; + int i = 0; + + skap = (struct select_kevent_copyin_args *)arg; + + if (kevp[0].flags & EV_ERROR) { + skap->error = kevp[0].data; + return (0); + } + + for (i = 0; i < count; ++i) { + if ((u_int)kevp[i].udata != skap->lwp->lwp_kqueue_serial) { + kev = kevp[i]; + kev.flags = EV_DISABLE|EV_DELETE; + kqueue_register(&skap->lwp->lwp_kqueue, &kev); + continue; + } + + switch (kevp[i].filter) { + case EVFILT_READ: + FD_SET(kevp[i].ident, skap->read_set); + break; + case EVFILT_WRITE: + FD_SET(kevp[i].ident, skap->write_set); + break; + case EVFILT_EXCEPT: + FD_SET(kevp[i].ident, skap->except_set); + break; + } + + ++*res; + } + + return (0); +} + +/* + * Copy select bits in from userland. Allocate kernel memory if the + * set is large. + */ +static int +getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set) +{ + int error; + + if (in_set) { + if (bytes < sizeof(*tmp_set)) + *out_set = tmp_set; + else + *out_set = kmalloc(bytes, M_SELECT, M_WAITOK); + error = copyin(in_set, *out_set, bytes); + } else { + *out_set = NULL; + error = 0; + } + return (error); +} + +/* + * Copy returned select bits back out to userland. + */ +static int +putbits(int bytes, kfd_set *in_set, fd_set *out_set) +{ + int error; + + if (in_set) { + error = copyout(in_set, out_set, bytes); + } else { + error = 0; + } + return (error); +} + /* * Common code for sys_select() and sys_pselect(). * - * in, out and ex are userland pointers. tv must point to validated + * in, out and ex are userland pointers. ts must point to validated * kernel-side timeout value or NULL for infinite timeout. res must * point to syscall return value. */ static int -doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv, - int *res) +doselect(int nd, fd_set *read, fd_set *write, fd_set *except, + struct timespec *ts, int *res) { - struct lwp *lp = curthread->td_lwp; struct proc *p = curproc; + struct select_kevent_copyin_args *kap, ka; + int bytes, error; + kfd_set read_tmp; + kfd_set write_tmp; + kfd_set except_tmp; + + *res = 0; + if (nd < 0) + return (EINVAL); + if (nd > p->p_fd->fd_nfiles) /* limit kmalloc */ + nd = p->p_fd->fd_nfiles; + + kap = &ka; + kap->lwp = curthread->td_lwp; + kap->num_fds = nd; + kap->proc_fds = 0; + kap->error = 0; + kap->active_set = COPYIN_READ; /* - * The magic 2048 here is chosen to be just enough for FD_SETSIZE - * infds with the new FD_SETSIZE of 1024, and more than enough for - * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE - * of 256. + * Calculate bytes based on the number of __fd_mask[] array entries + * multiplied by the size of __fd_mask. */ - fd_mask s_selbits[howmany(2048, NFDBITS)]; - fd_mask *ibits[3], *obits[3], *selbits, *sbp; - struct timeval atv, rtv, ttv; - int ncoll, error, timo; - u_int nbufbytes, ncpbytes, nfdbits; + bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask); - if (nd < 0) - return (EINVAL); - if (nd > p->p_fd->fd_nfiles) - nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + error = getbits(bytes, read, &kap->read_set, &read_tmp); + if (error == 0) + error = getbits(bytes, write, &kap->write_set, &write_tmp); + if (error == 0) + error = getbits(bytes, except, &kap->except_set, &except_tmp); + if (error) + goto done; /* - * Allocate just enough bits for the non-null fd_sets. Use the - * preallocated auto buffer if possible. + * NOTE: Make sure the max events passed to kern_kevent() is + * effectively unlimited. (nd * 3) accomplishes this. + * + * (*res) continues to increment as returned events are + * loaded in. */ - nfdbits = roundup(nd, NFDBITS); - ncpbytes = nfdbits / NBBY; - nbufbytes = 0; - if (in != NULL) - nbufbytes += 2 * ncpbytes; - if (ou != NULL) - nbufbytes += 2 * ncpbytes; - if (ex != NULL) - nbufbytes += 2 * ncpbytes; - if (nbufbytes <= sizeof s_selbits) - selbits = &s_selbits[0]; - else - selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK); + error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap, + select_copyin, select_copyout, ts); + if (error == 0) + error = putbits(bytes, kap->read_set, read); + if (error == 0) + error = putbits(bytes, kap->write_set, write); + if (error == 0) + error = putbits(bytes, kap->except_set, except); /* - * Assign pointers into the bit buffers and fetch the input bits. - * Put the output buffers together so that they can be bzeroed - * together. + * Cumulative error from individual events (EBADFD?) */ - sbp = selbits; -#define getbits(name, x) \ - do { \ - if (name == NULL) \ - ibits[x] = NULL; \ - else { \ - ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ - obits[x] = sbp; \ - sbp += ncpbytes / sizeof *sbp; \ - error = copyin(name, ibits[x], ncpbytes); \ - if (error != 0) \ - goto done; \ - } \ - } while (0) - getbits(in, 0); - getbits(ou, 1); - getbits(ex, 2); -#undef getbits - if (nbufbytes != 0) - bzero(selbits, nbufbytes / 2); - - if (tv != NULL) { - atv = *tv; - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); - } else { - atv.tv_sec = 0; - atv.tv_usec = 0; - } - timo = 0; -retry: - ncoll = nselcoll; - lp->lwp_flag |= LWP_SELECT; - error = selscan(p, ibits, obits, nd, res); - if (error || *res) - goto done; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz_high(&ttv); - } - crit_enter(); - tsleep_interlock(&selwait, PCATCH); - if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { - crit_exit(); - goto retry; - } - lp->lwp_flag &= ~LWP_SELECT; - error = tsleep(&selwait, PCATCH | PINTERLOCKED, "select", timo); - crit_exit(); + if (kap->error) + error = kap->error; - if (error == 0) - goto retry; + /* + * Clean up. + */ done: - lp->lwp_flag &= ~LWP_SELECT; - /* select is not restarted after signals... */ - if (error == ERESTART) - error = EINTR; - if (error == EWOULDBLOCK) - error = 0; -#define putbits(name, x) \ - if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ - error = error2; - if (error == 0) { - int error2; - - putbits(in, 0); - putbits(ou, 1); - putbits(ex, 2); -#undef putbits - } - if (selbits != &s_selbits[0]) - kfree(selbits, M_SELECT); - return (error); -} + if (kap->read_set && kap->read_set != &read_tmp) + kfree(kap->read_set, M_SELECT); + if (kap->write_set && kap->write_set != &write_tmp) + kfree(kap->write_set, M_SELECT); + if (kap->except_set && kap->except_set != &except_tmp) + kfree(kap->except_set, M_SELECT); -static int -selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res) -{ - int msk, i, fd; - fd_mask bits; - struct file *fp; - int n = 0; - /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ - static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + kap->lwp->lwp_kqueue_serial++; - for (msk = 0; msk < 3; msk++) { - if (ibits[msk] == NULL) - continue; - for (i = 0; i < nfd; i += NFDBITS) { - bits = ibits[msk][i/NFDBITS]; - /* ffs(int mask) not portable, fd_mask is long */ - for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { - if (!(bits & 1)) - continue; - fp = holdfp(p->p_fd, fd, -1); - if (fp == NULL) - return (EBADF); - if (fo_poll(fp, flag[msk], fp->f_cred)) { - obits[msk][(fd)/NFDBITS] |= - ((fd_mask)1 << ((fd) % NFDBITS)); - n++; - } - fdrop(fp); - } - } - } - *res = n; - return (0); + return (error); } /* * Poll system call. * - * MPALMOSTSAFE + * MPSAFE */ int sys_poll(struct poll_args *uap) { - struct pollfd *bits; - struct pollfd smallbits[32]; - struct timeval atv, rtv, ttv; - int ncoll, error = 0, timo; - u_int nfds; - size_t ni; - struct lwp *lp = curthread->td_lwp; - struct proc *p = curproc; + struct timespec ts, *tsp; + int error; - nfds = uap->nfds; - /* - * This is kinda bogus. We have fd limits, but that is not - * really related to the size of the pollfd array. Make sure - * we let the process use at least FD_SETSIZE entries and at - * least enough for the current limits. We want to be reasonably - * safe, but not overly restrictive. - */ - if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE) - return (EINVAL); - ni = nfds * sizeof(struct pollfd); - if (ni > sizeof(smallbits)) - bits = kmalloc(ni, M_TEMP, M_WAITOK); - else - bits = smallbits; - error = copyin(uap->fds, bits, ni); - if (error) - goto done2; if (uap->timeout != INFTIM) { - atv.tv_sec = uap->timeout / 1000; - atv.tv_usec = (uap->timeout % 1000) * 1000; - if (itimerfix(&atv)) { - error = EINVAL; - goto done2; - } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); + ts.tv_sec = uap->timeout / 1000; + ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000; + tsp = &ts; } else { - atv.tv_sec = 0; - atv.tv_usec = 0; - } - timo = 0; - get_mplock(); -retry: - ncoll = nselcoll; - lp->lwp_flag |= LWP_SELECT; - error = pollscan(p, bits, nfds, &uap->sysmsg_result); - if (error || uap->sysmsg_result) - goto done1; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done1; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz_high(&ttv); - } - crit_enter(); - tsleep_interlock(&selwait, PCATCH); - if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { - crit_exit(); - goto retry; + tsp = NULL; } - lp->lwp_flag &= ~LWP_SELECT; - error = tsleep(&selwait, PCATCH | PINTERLOCKED, "poll", timo); - crit_exit(); - if (error == 0) - goto retry; -done1: - rel_mplock(); -done2: - lp->lwp_flag &= ~LWP_SELECT; - /* poll is not restarted after signals... */ - if (error == ERESTART) - error = EINTR; - if (error == EWOULDBLOCK) - error = 0; - if (error == 0) { - error = copyout(bits, uap->fds, ni); - if (error) - goto out; - } -out: - if (ni > sizeof(smallbits)) - kfree(bits, M_TEMP); + error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result); + return (error); } static int -pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res) +poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events) +{ + struct poll_kevent_copyin_args *pkap; + struct pollfd *pfd; + struct kevent *kev; + int kev_count; + + pkap = (struct poll_kevent_copyin_args *)arg; + + while (pkap->pfds < pkap->nfds) { + pfd = &pkap->fds[pkap->pfds]; + + /* Clear return events */ + pfd->revents = 0; + + kev_count = 0; + if (pfd->events & (POLLIN | POLLRDNORM)) + kev_count++; + if (pfd->events & (POLLOUT | POLLWRNORM)) + kev_count++; + if (pfd->events & (POLLPRI | POLLRDBAND)) + kev_count++; + + if (*events + kev_count > maxevents) + return (0); + + kev = &kevp[*events]; + if (pfd->events & (POLLIN | POLLRDNORM)) + EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE, + 0, 0, (void *)pkap->pfds); + if (pfd->events & (POLLOUT | POLLWRNORM)) + EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE, + 0, 0, (void *)pkap->pfds); + if (pfd->events & (POLLPRI | POLLRDBAND)) + EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE, + NOTE_OOB, 0, (void *)pkap->pfds); + + ++pkap->pfds; + (*events) += kev_count; + } + + return (0); +} + +static int +poll_copyout(void *arg, struct kevent *kevp, int count, int *res) { + struct poll_kevent_copyin_args *pkap; + struct pollfd *pfd; + struct kevent kev; int i; - struct file *fp; - int n = 0; - - for (i = 0; i < nfd; i++, fds++) { - if (fds->fd >= p->p_fd->fd_nfiles) { - fds->revents = POLLNVAL; - n++; - } else if (fds->fd < 0) { - fds->revents = 0; - } else { - fp = holdfp(p->p_fd, fds->fd, -1); - if (fp == NULL) { - fds->revents = POLLNVAL; - n++; - } else { - /* - * Note: backend also returns POLLHUP and - * POLLERR if appropriate. - */ - fds->revents = fo_poll(fp, fds->events, - fp->f_cred); - if (fds->revents != 0) - n++; - fdrop(fp); + + pkap = (struct poll_kevent_copyin_args *)arg; + + for (i = 0; i < count; ++i) { + if ((int)kevp[i].udata < pkap->nfds) { + pfd = &pkap->fds[(int)kevp[i].udata]; + if (kevp[i].ident == pfd->fd) { + switch (kevp[i].filter) { + case EVFILT_READ: + pfd->revents |= (POLLIN | POLLRDNORM); + break; + case EVFILT_WRITE: + pfd->revents |= (POLLOUT | POLLWRNORM); + break; + case EVFILT_EXCEPT: + pfd->revents |= (POLLPRI | POLLRDBAND); + break; + } + + if (kevp[i].flags & EV_ERROR) { + /* Bad file descriptor */ + if (kevp[i].data == EBADF) + pfd->revents |= POLLNVAL; + else + pfd->revents |= POLLERR; + } + + if (kevp[i].flags & EV_EOF) + pfd->revents |= POLLHUP; + + ++*res; + + continue; } } + + /* Remove descriptor not in pollfd set from kq */ + kev = kevp[i]; + kev.flags = EV_DISABLE|EV_DELETE; + kqueue_register(&pkap->lwp->lwp_kqueue, &kev); } - *res = n; + return (0); } +static int +dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res) +{ + struct poll_kevent_copyin_args ka; + struct pollfd sfds[64]; + int bytes; + int error; + + *res = 0; + if (nfds < 0) + return (EINVAL); + + /* + * This is a bit arbitrary but we need to limit internal kmallocs. + */ + if (nfds > maxfilesperproc * 2) + nfds = maxfilesperproc * 2; + bytes = sizeof(struct pollfd) * nfds; + + ka.lwp = curthread->td_lwp; + ka.nfds = nfds; + ka.pfds = 0; + ka.error = 0; + + if (ka.nfds < 64) + ka.fds = sfds; + else + ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK); + + error = copyin(fds, ka.fds, bytes); + if (error == 0) + error = kern_kevent(&ka.lwp->lwp_kqueue, ka.nfds, res, &ka, + poll_copyin, poll_copyout, ts); + + if (error == 0) + error = copyout(ka.fds, fds, bytes); + + if (ka.fds != sfds) + kfree(ka.fds, M_SELECT); + + return (error); +} + /* * OpenBSD poll system call. * XXX this isn't quite a true representation.. OpenBSD uses select ops. @@ -1264,5 +1385,7 @@ selwakeup(struct selinfo *sip) setrunnable(lp); } crit_exit(); + + kqueue_wakeup(&lp->lwp_kqueue); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 189634b..fadff46 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1396,6 +1396,7 @@ pipe_kqfilter(struct file *fp, struct knote *kn) } break; default: + rel_mplock(); return (1); } kn->kn_hook = (caddr_t)cpipe; diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c index 2273304..224519a 100644 --- a/sys/kern/tty_pty.c +++ b/sys/kern/tty_pty.c @@ -71,6 +71,10 @@ static void ptsstart (struct tty *tp); static void ptsstop (struct tty *tp, int rw); static void ptcwakeup (struct tty *tp, int flag); static void ptyinit (int n); +static int filt_ptcread (struct knote *kn, long hint); +static void filt_ptcrdetach (struct knote *kn); +static int filt_ptcwrite (struct knote *kn, long hint); +static void filt_ptcwdetach (struct knote *kn); static d_open_t ptsopen; static d_close_t ptsclose; @@ -82,6 +86,7 @@ static d_close_t ptcclose; static d_read_t ptcread; static d_write_t ptcwrite; static d_poll_t ptcpoll; +static d_kqfilter_t ptckqfilter; #ifdef UNIX98_PTYS DEVFS_DECLARE_CLONE_BITMAP(pty); @@ -110,7 +115,7 @@ static struct dev_ops ptc98_ops = { .d_write = ptcwrite, .d_ioctl = ptyioctl, .d_poll = ptcpoll, - .d_kqfilter = ttykqfilter, + .d_kqfilter = ptckqfilter, .d_revoke = ttyrevoke }; #endif @@ -137,7 +142,7 @@ static struct dev_ops ptc_ops = { .d_write = ptcwrite, .d_ioctl = ptyioctl, .d_poll = ptcpoll, - .d_kqfilter = ttykqfilter, + .d_kqfilter = ptckqfilter, .d_revoke = ttyrevoke }; @@ -441,10 +446,12 @@ ptcwakeup(struct tty *tp, int flag) if (flag & FREAD) { selwakeup(&pti->pt_selr); wakeup(TSA_PTC_READ(tp)); + KNOTE(&tp->t_rsel.si_note, 0); } if (flag & FWRITE) { selwakeup(&pti->pt_selw); wakeup(TSA_PTC_WRITE(tp)); + KNOTE(&tp->t_wsel.si_note, 0); } } @@ -698,6 +705,104 @@ ptcpoll(struct dev_poll_args *ap) return (0); } +/* + * kqueue ops for pseudo-terminals. + */ +static struct filterops ptcread_filtops = + { 1, NULL, filt_ptcrdetach, filt_ptcread }; +static struct filterops ptcwrite_filtops = + { 1, NULL, filt_ptcwdetach, filt_ptcwrite }; + +static int +ptckqfilter(struct dev_kqfilter_args *ap) +{ + cdev_t dev = ap->a_head.a_dev; + struct knote *kn = ap->a_kn; + struct tty *tp = dev->si_tty; + struct klist *klist; + + ap->a_result = 0; + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &tp->t_rsel.si_note; + kn->kn_fop = &ptcread_filtops; + break; + case EVFILT_WRITE: + klist = &tp->t_wsel.si_note; + kn->kn_fop = &ptcwrite_filtops; + break; + default: + ap->a_result = 1; + return (0); + } + + kn->kn_hook = (caddr_t)dev; + + crit_enter(); + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + crit_exit(); + + return (0); +} + +static int +filt_ptcread (struct knote *kn, long hint) +{ + struct tty *tp = ((cdev_t)kn->kn_hook)->si_tty; + struct pt_ioctl *pti = ((cdev_t)kn->kn_hook)->si_drv1; + + if ((tp->t_state & TS_ISOPEN) && + ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) || + ((pti->pt_flags & PF_PKT) && pti->pt_send) || + ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) { + kn->kn_data = tp->t_outq.c_cc; + return(1); + } else { + return(0); + } +} + +static int +filt_ptcwrite (struct knote *kn, long hint) +{ + struct tty *tp = ((cdev_t)kn->kn_hook)->si_tty; + struct pt_ioctl *pti = ((cdev_t)kn->kn_hook)->si_drv1; + + if (tp->t_state & TS_ISOPEN && + ((pti->pt_flags & PF_REMOTE) ? + (tp->t_canq.c_cc == 0) : + ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) || + (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON))))) { + kn->kn_data = tp->t_canq.c_cc + tp->t_rawq.c_cc; + return(1); + } else { + return(0); + } +} + +static void +filt_ptcrdetach (struct knote *kn) +{ + struct tty *tp = ((cdev_t)kn->kn_hook)->si_tty; + + crit_enter(); + SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); + crit_exit(); +} + +static void +filt_ptcwdetach (struct knote *kn) +{ + struct tty *tp = ((cdev_t)kn->kn_hook)->si_tty; + + crit_enter(); + SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); + crit_exit(); +} + +/* + * I/O ops + */ static int ptcwrite(struct dev_write_args *ap) { diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 97ae97e..fd83603 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -116,6 +116,8 @@ static struct filterops soread_filtops = { 1, NULL, filt_sordetach, filt_soread }; static struct filterops sowrite_filtops = { 1, NULL, filt_sowdetach, filt_sowrite }; +static struct filterops soexcept_filtops = + { 1, NULL, filt_sordetach, filt_soread }; MALLOC_DEFINE(M_SOCKET, "socket", "socket struct"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); @@ -1698,6 +1700,7 @@ sohasoutofband(struct socket *so) if (so->so_sigio != NULL) pgsigio(so->so_sigio, SIGURG, 0); selwakeup(&so->so_rcv.ssb_sel); + KNOTE(&so->so_rcv.ssb_sel.si_note, NOTE_OOB); } int @@ -1760,6 +1763,10 @@ sokqfilter(struct file *fp, struct knote *kn) kn->kn_fop = &sowrite_filtops; ssb = &so->so_snd; break; + case EVFILT_EXCEPT: + kn->kn_fop = &soexcept_filtops; + ssb = &so->so_rcv; + break; default: return (1); } @@ -1789,6 +1796,13 @@ filt_soread(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_fp->f_data; + if (kn->kn_sfflags & NOTE_OOB) { + if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) { + kn->kn_fflags |= NOTE_OOB; + return (1); + } + return (0); + } kn->kn_data = so->so_rcv.ssb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; @@ -1799,7 +1813,8 @@ filt_soread(struct knote *kn, long hint) return (1); if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_rcv.ssb_lowat); + return ((kn->kn_data >= so->so_rcv.ssb_lowat) || + !TAILQ_EMPTY(&so->so_comp)); } static void diff --git a/sys/net/bpf.c b/sys/net/bpf.c index b7b2f1d..9780c99 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -58,6 +58,7 @@ #include #include +#include #include #include @@ -128,6 +129,8 @@ static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void bpf_drvinit(void *unused); +static void bpf_filter_detach(struct knote *kn); +static int bpf_filter_read(struct knote *kn, long hint); static d_open_t bpfopen; static d_clone_t bpfclone; @@ -136,16 +139,18 @@ static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; +static d_kqfilter_t bpfkqfilter; #define CDEV_MAJOR 23 static struct dev_ops bpf_ops = { - { "bpf", CDEV_MAJOR, 0 }, + { "bpf", CDEV_MAJOR, D_KQFILTER }, .d_open = bpfopen, .d_close = bpfclose, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, + .d_kqfilter = bpfkqfilter }; @@ -1111,6 +1116,81 @@ bpfpoll(struct dev_poll_args *ap) return(0); } +static struct filterops bpf_read_filtops = + { 1, NULL, bpf_filter_detach, bpf_filter_read }; + +static int +bpfkqfilter(struct dev_kqfilter_args *ap) +{ + cdev_t dev = ap->a_head.a_dev; + struct knote *kn = ap->a_kn; + struct klist *klist; + struct bpf_d *d; + + d = dev->si_drv1; + if (d->bd_bif == NULL) { + ap->a_result = 1; + return (0); + } + + ap->a_result = 0; + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &bpf_read_filtops; + kn->kn_hook = (caddr_t)d; + break; + default: + ap->a_result = 1; + return (0); + } + + crit_enter(); + klist = &d->bd_sel.si_note; + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + crit_exit(); + + return (0); +} + +static void +bpf_filter_detach(struct knote *kn) +{ + struct klist *klist; + struct bpf_d *d; + + crit_enter(); + d = (struct bpf_d *)kn->kn_hook; + klist = &d->bd_sel.si_note; + SLIST_REMOVE(klist, kn, knote, kn_selnext); + crit_exit(); +} + +static int +bpf_filter_read(struct knote *kn, long hint) +{ + struct bpf_d *d; + int ready = 0; + + crit_enter(); + d = (struct bpf_d *)kn->kn_hook; + if (d->bd_hlen != 0 || + ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0)) { + ready = 1; + } else { + /* Start the read timeout if necessary. */ + if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { + callout_reset(&d->bd_callout, d->bd_rtout, + bpf_timed_out, d); + d->bd_state = BPF_WAITING; + } + } + crit_exit(); + + return (ready); +} + + /* * Process the packet pkt of length pktlen. The packet is parsed * by each listener's filter, and if accepted, stashed into the diff --git a/sys/net/tun/if_tun.c b/sys/net/tun/if_tun.c index 7a06e71..eacbdd3 100644 --- a/sys/net/tun/if_tun.c +++ b/sys/net/tun/if_tun.c @@ -78,6 +78,8 @@ static int tunoutput (struct ifnet *, struct mbuf *, struct sockaddr *, static int tunifioctl (struct ifnet *, u_long, caddr_t, struct ucred *); static int tuninit (struct ifnet *); static void tunstart(struct ifnet *); +static void tun_filter_detach(struct knote *); +static int tun_filter_read(struct knote *, long); static d_open_t tunopen; static d_close_t tunclose; @@ -85,6 +87,7 @@ static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; +static d_kqfilter_t tunkqfilter; static d_clone_t tunclone; DEVFS_DECLARE_CLONE_BITMAP(tun); @@ -97,13 +100,14 @@ DEVFS_DECLARE_CLONE_BITMAP(tun); #define CDEV_MAJOR 52 static struct dev_ops tun_ops = { - { "tun", CDEV_MAJOR, 0 }, + { "tun", CDEV_MAJOR, D_KQFILTER }, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, + .d_kqfilter = tunkqfilter }; static void @@ -731,6 +735,63 @@ tunpoll(struct dev_poll_args *ap) return(0); } +static struct filterops tun_read_filtops = + { 1, NULL, tun_filter_detach, tun_filter_read }; + +static int +tunkqfilter(struct dev_kqfilter_args *ap) +{ + cdev_t dev = ap->a_head.a_dev; + struct tun_softc *tp = dev->si_drv1; + struct knote *kn = ap->a_kn; + struct klist *klist; + + ap->a_result = 0; + ifnet_serialize_all(&tp->tun_if); + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &tun_read_filtops; + kn->kn_hook = (caddr_t)&tp; + break; + default: + ap->a_result = 1; + ifnet_deserialize_all(&tp->tun_if); + return (0); + } + + klist = &tp->tun_rsel.si_note; + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + + ifnet_deserialize_all(&tp->tun_if); + + return (0); +} + +static void +tun_filter_detach(struct knote *kn) +{ + struct tun_softc *tp = (struct tun_softc *)kn->kn_hook; + struct klist *klist; + + klist = &tp->tun_rsel.si_note; + SLIST_REMOVE(klist, kn, knote, kn_selnext); +} + +static int +tun_filter_read(struct knote *kn, long hint) +{ + struct tun_softc *tp = (struct tun_softc *)kn->kn_hook; + int ready = 0; + + ifnet_serialize_all(&tp->tun_if); + if (!ifq_is_empty(&tp->tun_if.if_snd)) + ready = 1; + ifnet_deserialize_all(&tp->tun_if); + + return (ready); +} + /* * Start packet transmission on the interface. * when the interface queue is rate-limited by ALTQ, diff --git a/sys/sys/event.h b/sys/sys/event.h index a2922fe..28aed8c 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -41,8 +41,11 @@ #define EVFILT_PROC (-5) /* attached to struct proc */ #define EVFILT_SIGNAL (-6) /* attached to struct proc */ #define EVFILT_TIMER (-7) /* timers */ +#define EVFILT_EXCEPT (-8) /* exceptional conditions */ -#define EVFILT_SYSCOUNT 7 +#define EVFILT_MARKER 0xF /* placemarker for tailq */ + +#define EVFILT_SYSCOUNT 8 #define EV_SET(kevp_, a, b, c, d, e, f) do { \ struct kevent *kevp = (kevp_); \ @@ -86,6 +89,11 @@ struct kevent { #define NOTE_LOWAT 0x0001 /* low water mark */ /* + * data/hint flags for EVFILT_EXCEPT, shared with userspace + */ +#define NOTE_OOB 0x0002 /* OOB data on a socket */ + +/* * data/hint flags for EVFILT_VNODE, shared with userspace */ #define NOTE_DELETE 0x0001 /* vnode was removed */ @@ -134,8 +142,12 @@ MALLOC_DECLARE(M_KQUEUE); struct filterops { int f_isfd; /* true if ident == filedescriptor */ + + /* f_attach returns 0 on success or valid error code on failure */ int (*f_attach) (struct knote *kn); void (*f_detach) (struct knote *kn); + + /* f_event returns boolean truth */ int (*f_event) (struct knote *kn, long hint); }; @@ -173,9 +185,11 @@ struct thread; struct filedesc; struct kevent_args; -typedef int (*k_copyout_fn)(void *arg, struct kevent *kevp, int count); -typedef int (*k_copyin_fn)(void *arg, struct kevent *kevp, int count); -int kern_kevent(int fd, int nchanges, int nevents, struct kevent_args *uap, +typedef int (*k_copyout_fn)(void *arg, struct kevent *kevp, int count, + int *res); +typedef int (*k_copyin_fn)(void *arg, struct kevent *kevp, int max, + int *events); +int kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, k_copyin_fn kevent_copyin, k_copyout_fn kevent_copyout, struct timespec *tsp); @@ -185,6 +199,7 @@ extern void knote_fdclose(struct file *fp, struct filedesc *fdp, int fd); extern void kqueue_init(struct kqueue *kq, struct filedesc *fdp); extern void kqueue_terminate(struct kqueue *kq); extern int kqueue_register(struct kqueue *kq, struct kevent *kev); +extern void kqueue_wakeup(struct kqueue *kq); #endif /* _KERNEL */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 8d50470..19d8f13 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -218,6 +218,7 @@ struct lwp { struct thread *lwp_thread; /* backpointer to proc's thread */ struct upcall *lwp_upcall; /* REGISTERED USERLAND POINTER! */ struct kqueue lwp_kqueue; /* for select/poll */ + u_int lwp_kqueue_serial; }; struct proc { diff --git a/sys/sys/random.h b/sys/sys/random.h index c53817d..f19d562 100644 --- a/sys/sys/random.h +++ b/sys/sys/random.h @@ -95,7 +95,9 @@ u_int read_random_unlimited(void *buf, u_int size); u_int write_random(const char *buf, u_int nbytes); #endif struct thread; +struct knote; int random_poll(cdev_t dev, int events); +int random_filter_read(struct knote *kn, long hint); #endif /* _KERNEL */ diff --git a/sys/vfs/devfs/devfs_vnops.c b/sys/vfs/devfs/devfs_vnops.c index 2108c10..3b2799d 100644 --- a/sys/vfs/devfs/devfs_vnops.c +++ b/sys/vfs/devfs/devfs_vnops.c @@ -60,6 +60,7 @@ #include #include #include +#include #include @@ -1390,7 +1391,10 @@ devfs_specf_kqfilter(struct file *fp, struct knote *kn) done: rel_mplock(); - return (error); + if (error) + return (-1); + + return (0); } @@ -1685,7 +1689,7 @@ devfs_spec_kqfilter(struct vop_kqfilter_args *ap) cdev_t dev; if ((dev = vp->v_rdev) == NULL) - return (EBADF); /* device was revoked */ + return (1); /* device was revoked (EBADF) */ node = DEVFS_NODE(vp); #if 0 @@ -1693,7 +1697,7 @@ devfs_spec_kqfilter(struct vop_kqfilter_args *ap) nanotime(&node->atime); #endif - return (dev_dkqfilter(dev, ap->a_kn)); + return (!dev_dkqfilter(dev, ap->a_kn)); } /* diff --git a/tools/test/kqueue/Makefile b/tools/test/kqueue/Makefile new file mode 100644 index 0000000..c03a456 --- /dev/null +++ b/tools/test/kqueue/Makefile @@ -0,0 +1,8 @@ +PROG= kqueue_oob +SRCS= kqueue_oob.c +NOMAN= sorry + +test: kqueue_oob + @csh -x -c "./kqueue_oob" + +.include diff --git a/tools/test/kqueue/kqueue_oob.c b/tools/test/kqueue/kqueue_oob.c new file mode 100644 index 0000000..47bdbcd --- /dev/null +++ b/tools/test/kqueue/kqueue_oob.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +main(int argc, char *argv[]) +{ + struct sockaddr_in sa_local, sa_remote; + socklen_t sin_size = sizeof(struct sockaddr_in); + struct timespec timeout; + struct kevent eventlist[1], changelist[1]; + int kq, fd_l, fd_c, fd_n, i; + + if ((fd_l = socket(PF_INET, SOCK_STREAM, 0)) == -1) + err(EX_OSERR, "socket(2) failure"); + + if ((fd_c = socket(PF_INET, SOCK_STREAM, 0)) == -1) + err(EX_OSERR, "socket(2) failure"); + + sa_local.sin_family = AF_INET; + sa_local.sin_port = 0; + sa_local.sin_addr.s_addr = htonl(INADDR_ANY); + memset(&(sa_local.sin_zero), 0, sizeof(sa_local.sin_zero)); + + if (bind(fd_l, (struct sockaddr *)&sa_local, sizeof(struct sockaddr)) == -1) + err(EX_OSERR, "bind(2) failure"); + + if (getsockname(fd_l, (struct sockaddr *)&sa_local, &sin_size) == -1) + err(EX_OSERR, "getsockname(2) failure"); + + if (listen(fd_l, 1) == -1) + err(EX_OSERR, "listen(2) failure"); + + if (connect(fd_c, (struct sockaddr *)&sa_local, sizeof(struct sockaddr)) == -1) + err(EX_OSERR, "connect(2) failure"); + + fd_n = accept(fd_l, (struct sockaddr *)&sa_remote, &sin_size); + + if ((kq = kqueue()) == -1) + err(EX_OSERR, "kqueue(2) failure"); + + if (send(fd_c, "x", 1, MSG_OOB) == -1) + err(EX_OSERR, "send(2) failure"); + + EV_SET(&changelist[0], fd_n, EVFILT_EXCEPT, EV_ADD|EV_ENABLE, NOTE_OOB, 0, NULL); + + memset(&timeout, 0, sizeof(timeout)); + if ((i = kevent(kq, &changelist[0], 1, &eventlist[0], 1, &timeout)) == -1) + err(EX_OSERR, "kevent(2) failure"); + + if (i == 1 && eventlist[0].fflags & NOTE_OOB) { + printf("ok\n"); + } + + return (0); +} diff --git a/tools/test/select/Makefile b/tools/test/select/Makefile new file mode 100644 index 0000000..fd5ce46 --- /dev/null +++ b/tools/test/select/Makefile @@ -0,0 +1,8 @@ +PROG= select_oob +SRCS= select_oob.c +NOMAN= sorry + +test: select_oob + @csh -x -c "./select_oob" + +.include diff --git a/tools/test/select/select_many_write.c b/tools/test/select/select_many_write.c new file mode 100644 index 0000000..779a5f2 --- /dev/null +++ b/tools/test/select/select_many_write.c @@ -0,0 +1,33 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MANY 80 + +int +main(int argc, char *argv[]) +{ + fd_set write_fds; + int fd[MANY+3], i, maxfd; + + FD_ZERO(&write_fds); + for (i = 0; i < MANY; ++i) { + if ((fd[i] = socket(PF_INET, SOCK_DGRAM, 0)) == -1) + err(EX_OSERR, "socket(2) failure"); + + FD_SET(fd[i], &write_fds); + maxfd = fd[i]; + } + + i = select(maxfd+1, NULL, &write_fds, NULL, NULL); + + if (i == MANY) + printf("ok\n"); + + return (0); +} diff --git a/tools/test/select/select_oob.c b/tools/test/select/select_oob.c new file mode 100644 index 0000000..3e48277 --- /dev/null +++ b/tools/test/select/select_oob.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int +main(int argc, char *argv[]) +{ + struct sockaddr_in sa_local, sa_remote; + socklen_t sin_size = sizeof(struct sockaddr_in); + fd_set exceptfds; + struct timeval timeout; + int fd_l, fd_c, fd_n, i; + + if ((fd_l = socket(PF_INET, SOCK_STREAM, 0)) == -1) + err(EX_OSERR, "socket(2) failure"); + + if ((fd_c = socket(PF_INET, SOCK_STREAM, 0)) == -1) + err(EX_OSERR, "socket(2) failure"); + + sa_local.sin_family = AF_INET; + sa_local.sin_port = 0; + sa_local.sin_addr.s_addr = htonl(INADDR_ANY); + memset(&(sa_local.sin_zero), 0, sizeof(sa_local.sin_zero)); + + if (bind(fd_l, (struct sockaddr *)&sa_local, sizeof(struct sockaddr)) == -1) + err(EX_OSERR, "bind(2) failure"); + + if (getsockname(fd_l, (struct sockaddr *)&sa_local, &sin_size) == -1) + err(EX_OSERR, "getsockname(2) failure"); + + if (listen(fd_l, 1) == -1) + err(EX_OSERR, "listen(2) failure"); + + if (connect(fd_c, (struct sockaddr *)&sa_local, sizeof(struct sockaddr)) == -1) + err(EX_OSERR, "connect(2) failure"); + + fd_n = accept(fd_l, (struct sockaddr *)&sa_remote, &sin_size); + + FD_ZERO(&exceptfds); + FD_SET(fd_n, &exceptfds); + + if (send(fd_c, "x", 1, MSG_OOB) == -1) + err(EX_OSERR, "send(2) failure"); + + memset(&timeout, 0, sizeof(timeout)); + i = select(fd_n+1, NULL, NULL, &exceptfds, &timeout); + + if (i == 1) + printf("ok\n"); + + return (0); +}