deraadt@ noticed that pflogd wakes up twice a second, event if it has
nothing to do or log. the reason for this is how bpf works.

when you try and read from a bpf file descriptor, there are three
reasons that the read will finish.

the first is the obvious one: the bpf packet buffer in the kernel
fills up. by default this is about 32k, so if pf is logging a small
packet every few seconds, it can take a long time for the buffer
to fill up and for those packets to end up on disk.

the second is if bpf has been configured to enable immediate mode with
ioctl(BIOCIMMEDIATE). this means that when any packet is written into
the bpf buffer, the buffer is immediately readable. if pf is logging a
packet every few seconds, this is great because pflogd will wake up
every few seconds and put the packet on disk. however, if pf is logging
a lot of packets in a short period of time, this means pflogd will do a
lot of reads and writes in a short period of time and throwing out the
benefit of using the large buffer that bpf provides.

the third mechanism is if bpf has been configured with the BIOCSRTIMEOUT
ioctl, which sets a maximum wait time on a bpf read. BIOCSRTIMEOUT
means than a clock starts ticking down when a program (eg pflogd)
reads from bpf. when the clock reaches zero then the read returns
with whatever is in the bpf packet buffer, which could be nothing.
this is what's happening in pflogd. it wants to have a maximum of
half a second between when pf logs a packet and when it ends up in
/var/log/pflog, but also wants to avoid waking up for every packet when
things get busy.

unfortunately, waking up for 0 packets all the time is not the best
either. ideally, what we'd want is for pflogd to be able to sleep
forever until there's a packet, but then wait a little bit more after
that to try and collect some more packets.

it could be argued that you could do this in userland with BIOCIMMEDIATE,
nonblocking bpf io, and a timer. a handler for bpf read notifications
could start a timeout rather than call read() from the timeout handler.
the problem with this is if packets are being logged fast enough to fill
the bpf buffer before the timeout fires, you'll lose packets.

this diff adds code to bpf in the kernel to implement the "wait a bit"
semantic. a new BIOCSWTIMEOUT ioctl lets you specify a wait time between
when a packet is put in the bpf buffer and when the buffer becomes
readable.

by default this wait time is infinite, meaning the buffer has to be
filled before it becomes readable. BIOCIMMEDIATE is turned into a
variation of BIOCSWTIMEOUT with the wait time set to 0, ie, wait 0
seconds between when a packet is written to the buffer and when the
buffer becomes readable. combining BIOCSWTIMEOUT and BIOCIMMEDIATE
simplifies the code a lot.

pflogd in this diff now sets BIOCSWTIMEOUT instead of BIOCSRTIMEOUT, and
gets to have it's cake and eat it too now.

discussed a lot with deraadt@ and sashan@
existing programs seem to still work as expected, but tests would be
welcome. ok?

Index: sys/net/bpf.c
===================================================================
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.219
diff -u -p -r1.219 bpf.c
--- sys/net/bpf.c       9 Jul 2022 12:48:21 -0000       1.219
+++ sys/net/bpf.c       7 Feb 2023 01:25:20 -0000
@@ -77,6 +77,10 @@
 
 #define BPF_BUFSIZE 32768
 
+#define BPF_S_IDLE     0
+#define BPF_S_WAIT     1
+#define BPF_S_DONE     2
+
 #define PRINET  26                     /* interruptible */
 
 /*
@@ -101,6 +105,7 @@ int bpf_setif(struct bpf_d *, struct ifr
 int    bpfkqfilter(dev_t, struct knote *);
 void   bpf_wakeup(struct bpf_d *);
 void   bpf_wakeup_cb(void *);
+void   bpf_wait_cb(void *);
 int    _bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
 void   bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
            const struct bpf_hdr *);
@@ -392,11 +397,13 @@ bpfopen(dev_t dev, int flag, int mode, s
        bd->bd_sig = SIGIO;
        mtx_init(&bd->bd_mtx, IPL_NET);
        task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
+       timeout_set(&bd->bd_wait_tmo, bpf_wait_cb, bd);
        smr_init(&bd->bd_smr);
        sigio_init(&bd->bd_sigio);
        klist_init_mutex(&bd->bd_klist, &bd->bd_mtx);
 
        bd->bd_rtout = 0;       /* no timeout by default */
+       bd->bd_wtout = INFSLP;  /* wait for the buffer to fill by default */
 
        refcnt_init(&bd->bd_refcnt);
        LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);
@@ -435,6 +442,7 @@ bpfclose(dev_t dev, int flag, int mode, 
        (d)->bd_hbuf = (d)->bd_sbuf; \
        (d)->bd_hlen = (d)->bd_slen; \
        (d)->bd_sbuf = (d)->bd_fbuf; \
+       (d)->bd_state = BPF_S_IDLE; \
        (d)->bd_slen = 0; \
        (d)->bd_fbuf = NULL;
 
@@ -492,7 +500,7 @@ bpfread(dev_t dev, struct uio *uio, int 
                        ROTATE_BUFFERS(d);
                        break;
                }
-               if (d->bd_immediate && d->bd_slen != 0) {
+               if (d->bd_state == BPF_S_DONE) {
                        /*
                         * A packet(s) either arrived since the previous
                         * read or arrived while we were asleep.
@@ -611,6 +619,21 @@ bpf_wakeup_cb(void *xd)
        bpf_put(d);
 }
 
+void
+bpf_wait_cb(void *xd)
+{
+       struct bpf_d *d = xd;
+
+       mtx_enter(&d->bd_mtx);
+       if (d->bd_state == BPF_S_WAIT) {
+               d->bd_state = BPF_S_DONE;
+               bpf_wakeup(d);
+       }
+       mtx_leave(&d->bd_mtx);
+
+       bpf_put(d);
+}
+
 int
 bpfwrite(dev_t dev, struct uio *uio, int ioflag)
 {
@@ -674,17 +697,64 @@ bpf_resetd(struct bpf_d *d)
        MUTEX_ASSERT_LOCKED(&d->bd_mtx);
        KASSERT(d->bd_in_uiomove == 0);
 
+       if (timeout_del(&d->bd_wait_tmo))
+               bpf_put(d);
+
        if (d->bd_hbuf != NULL) {
                /* Free the hold buffer. */
                d->bd_fbuf = d->bd_hbuf;
                d->bd_hbuf = NULL;
        }
+       d->bd_state = BPF_S_IDLE;
        d->bd_slen = 0;
        d->bd_hlen = 0;
        d->bd_rcount = 0;
        d->bd_dcount = 0;
 }
 
+static int
+bpf_set_wtout(struct bpf_d *d, uint64_t wtout)
+{
+       mtx_enter(&d->bd_mtx);
+       d->bd_wtout = wtout;
+       mtx_leave(&d->bd_mtx);
+
+       return (0);
+}
+
+static int
+bpf_set_wtimeout(struct bpf_d *d, const struct timeval *tv)
+{
+       uint64_t nsec;
+
+       if (tv->tv_sec < 0 || !timerisvalid(tv))
+               return (EINVAL);
+
+       nsec = TIMEVAL_TO_NSEC(tv);
+       if (nsec > MAXTSLP)
+               return (EOVERFLOW);
+
+       return (bpf_set_wtout(d, nsec));
+}
+
+static int
+bpf_get_wtimeout(struct bpf_d *d, struct timeval *tv)
+{
+       uint64_t nsec;
+
+       mtx_enter(&d->bd_mtx);
+       nsec = d->bd_wtout;
+       mtx_leave(&d->bd_mtx);
+
+       if (nsec == INFSLP)
+               return (ENXIO);
+
+       memset(tv, 0, sizeof(*tv));
+       NSEC_TO_TIMEVAL(nsec, tv);
+
+       return (0);
+}
+
 /*
  *  FIONREAD           Check for read packet available.
  *  BIOCGBLEN          Get buffer len [for read()].
@@ -698,6 +768,9 @@ bpf_resetd(struct bpf_d *d)
  *  BIOCSETIF          Set interface.
  *  BIOCSRTIMEOUT      Set read timeout.
  *  BIOCGRTIMEOUT      Get read timeout.
+ *  BIOCSWTIMEOUT      Set wait timeout.
+ *  BIOCGWTIMEOUT      Get wait timeout.
+ *  BIOCDWTIMEOUT      Del wait timeout.
  *  BIOCGSTATS         Get packet stats.
  *  BIOCIMMEDIATE      Set immediate mode.
  *  BIOCVERSION                Get filter language version.
@@ -720,6 +793,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                case BIOCGDLTLIST:
                case BIOCGETIF:
                case BIOCGRTIMEOUT:
+               case BIOCGWTIMEOUT:
                case BIOCGSTATS:
                case BIOCVERSION:
                case BIOCGRSIG:
@@ -727,6 +801,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                case FIONREAD:
                case BIOCLOCK:
                case BIOCSRTIMEOUT:
+               case BIOCSWTIMEOUT:
+               case BIOCDWTIMEOUT:
                case BIOCIMMEDIATE:
                case TIOCGPGRP:
                case BIOCGDIRFILT:
@@ -933,7 +1009,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
         * Set immediate mode.
         */
        case BIOCIMMEDIATE:
-               d->bd_immediate = *(u_int *)addr;
+               error = bpf_set_wtout(d, *(int *)addr ? 0 : INFSLP);
+               break;
+
+       /*
+        * Wait timeout.
+        */
+       case BIOCSWTIMEOUT:
+               error = bpf_set_wtimeout(d, (const struct timeval *)addr);
+               break;
+       case BIOCGWTIMEOUT:
+               error = bpf_get_wtimeout(d, (struct timeval *)addr);
+               break;
+       case BIOCDWTIMEOUT:
+               error = bpf_set_wtout(d, INFSLP);
                break;
 
        case BIOCVERSION:
@@ -1193,7 +1282,7 @@ filt_bpfread(struct knote *kn, long hint
        MUTEX_ASSERT_LOCKED(&d->bd_mtx);
 
        kn->kn_data = d->bd_hlen;
-       if (d->bd_immediate)
+       if (d->bd_wtout == 0)
                kn->kn_data += d->bd_slen;
 
        return (kn->kn_data > 0);
@@ -1510,6 +1599,15 @@ bpf_catchpacket(struct bpf_d *d, u_char 
                        ++d->bd_dcount;
                        return;
                }
+
+               /*
+                * there's a small chance bpf_wait_cb is running
+                * concurrently with this and two wakeups will be
+                * generated.
+                */
+               if (timeout_del(&d->bd_wait_tmo))
+                       bpf_put(d);
+
                ROTATE_BUFFERS(d);
                do_wakeup = 1;
                curlen = 0;
@@ -1530,12 +1628,27 @@ bpf_catchpacket(struct bpf_d *d, u_char 
        bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
        d->bd_slen = curlen + totlen;
 
-       if (d->bd_immediate) {
+       switch (d->bd_wtout) {
+       case 0:
                /*
                 * Immediate mode is set.  A packet arrived so any
                 * reads should be woken up.
                 */
+               if (d->bd_state == BPF_S_IDLE)
+                       d->bd_state = BPF_S_DONE;
                do_wakeup = 1;
+               break;
+       case INFSLP:
+               break;
+       default:
+               if (d->bd_state == BPF_S_IDLE) {
+                       d->bd_state = BPF_S_WAIT;
+
+                       bpf_get(d);
+                       if (!timeout_add_nsec(&d->bd_wait_tmo, d->bd_wtout))
+                               bpf_put(d);
+               }
+               break;
        }
 
        if (do_wakeup)
Index: sys/net/bpf.h
===================================================================
RCS file: /cvs/src/sys/net/bpf.h,v
retrieving revision 1.70
diff -u -p -r1.70 bpf.h
--- sys/net/bpf.h       3 Aug 2020 03:21:24 -0000       1.70
+++ sys/net/bpf.h       7 Feb 2023 01:25:20 -0000
@@ -119,6 +119,9 @@ struct bpf_version {
 #define BIOCGDLTLIST   _IOWR('B',123, struct bpf_dltlist)
 #define BIOCGDIRFILT   _IOR('B',124, u_int)
 #define BIOCSDIRFILT   _IOW('B',125, u_int)
+#define BIOCSWTIMEOUT  _IOW('B',126, struct timeval)
+#define BIOCGWTIMEOUT  _IOR('B',126, struct timeval)
+#define BIOCDWTIMEOUT  _IO('B',126)
 
 /*
  * Direction filters for BIOCSDIRFILT/BIOCGDIRFILT
Index: sys/net/bpfdesc.h
===================================================================
RCS file: /cvs/src/sys/net/bpfdesc.h,v
retrieving revision 1.47
diff -u -p -r1.47 bpfdesc.h
--- sys/net/bpfdesc.h   9 Jul 2022 12:48:21 -0000       1.47
+++ sys/net/bpfdesc.h   7 Feb 2023 01:25:20 -0000
@@ -79,6 +79,7 @@ struct bpf_d {
 
        struct bpf_if  *bd_bif;         /* interface descriptor */
        uint64_t        bd_rtout;       /* [m] Read timeout in nanoseconds */
+       uint64_t        bd_wtout;       /* [m] Wait time in nanoseconds */
        u_long          bd_nreaders;    /* [m] # threads asleep in bpfread() */
        struct bpf_program_smr
                       *bd_rfilter;     /* read filter code */
@@ -88,8 +89,7 @@ struct bpf_d {
        u_long          bd_dcount;      /* number of packets dropped */
 
        u_char          bd_promisc;     /* true if listening promiscuously */
-       u_char          bd_state;       /* idle, waiting, or timed out */
-       u_char          bd_immediate;   /* true to return on packet arrival */
+       u_char          bd_state;       /* [m] idle, waiting, or timed out */
        u_char          bd_locked;      /* true if descriptor is locked */
        u_char          bd_fildrop;     /* true if filtered packets will be 
dropped */
        u_char          bd_dirfilt;     /* direction filter */
@@ -103,7 +103,8 @@ struct bpf_d {
        int             bd_unit;        /* logical unit number */
        LIST_ENTRY(bpf_d) bd_list;      /* descriptor list */
 
-       struct task     bd_wake_task;   /* delay pgsigio() and selwakeup() */
+       struct task     bd_wake_task;   /* defer pgsigio() and selwakeup() */
+       struct timeout  bd_wait_tmo;    /* delay wakeup after catching pkt */
 
        struct smr_entry
                        bd_smr;
Index: sbin/pflogd/pflogd.c
===================================================================
RCS file: /cvs/src/sbin/pflogd/pflogd.c,v
retrieving revision 1.62
diff -u -p -r1.62 pflogd.c
--- sbin/pflogd/pflogd.c        25 Jul 2019 17:32:33 -0000      1.62
+++ sbin/pflogd/pflogd.c        7 Feb 2023 01:25:20 -0000
@@ -251,8 +251,8 @@ pflog_read_live(const char *source, int 
                struct timeval to;
                to.tv_sec = to_ms / 1000;
                to.tv_usec = (to_ms * 1000) % 1000000;
-               if (ioctl(p->fd, BIOCSRTIMEOUT, &to) == -1) {
-                       snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSRTIMEOUT: %s",
+               if (ioctl(p->fd, BIOCSWTIMEOUT, &to) == -1) {
+                       snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSWTIMEOUT: %s",
                            pcap_strerror(errno));
                        goto bad;
                }
Index: share/man/man4/bpf.4
===================================================================
RCS file: /cvs/src/share/man/man4/bpf.4,v
retrieving revision 1.44
diff -u -p -r1.44 bpf.4
--- share/man/man4/bpf.4        11 Sep 2022 06:38:11 -0000      1.44
+++ share/man/man4/bpf.4        7 Feb 2023 01:25:20 -0000
@@ -220,6 +220,8 @@ The allowable ioctls are
 .Dv BIOCIMMEDIATE ,
 .Dv BIOCLOCK ,
 .Dv BIOCSRTIMEOUT ,
+.Dv BIOCSWTIMEOUT ,
+.Dv BIOCDWTIMEOUT ,
 .Dv BIOCVERSION ,
 .Dv TIOCGPGRP ,
 and
@@ -301,6 +303,18 @@ This is useful for programs like
 .Xr rarpd 8 ,
 which must respond to messages in real time.
 The default for a new file is off.
+.Pp
+.It Dv BIOCSWTIMEOUT Fa "struct timeval *"
+.It Dv BIOCGWTIMEOUT Fa "struct timeval *"
+.It Dv BIOCDWTIMEOUT
+Sets, gets, or deletes (resets) the wait timeout parameter.
+The
+.Ar timeval
+specifies the length of time to wait between receiving a packet and
+the kernel buffer becoming readable.
+By default, or when reset, the wait timeout is infinite, meaning
+the age of packets in the kernel buffer does not make the buffer
+readable.
 .Pp
 .It Dv BIOCSETF Fa "struct bpf_program *"
 Sets the filter program used by the kernel to discard uninteresting packets.

Reply via email to