tcpdump mailing list archives

Re: Libpcap reentrancy and PF_RING patch


From: Guy Harris <guy () alum mit edu>
Date: Thu, 24 Jan 2008 13:08:44 -0800

Luca Deri wrote:

I have considered your suggestion to move pfring into a pcap-pfring.* file.

I didn't make such a suggestion.

I did say

Would it work better if, for PF_RING sockets, there were a separate
pcap_read_pf_ring, and handle->read_op were set to pcap_read_pf_ring if
a PF_RING socket were being used? That'd avoid some per-packet checks in
the read_op routine, and might involve fewer #ifdefs as well.

For that matter, should there be a separate live_open_pf_ring()
routine, with that routine called first and, if it fails (e.g., because
the system doesn't have PF_RING support), live_open_new() called?

but "make separate routines" doesn't require the separate routines to be in a separate file - see, for example, the current top-of-tree pcap-linux.c, where there are some separate routines for accessing the ring buffer.

Here's a patch (unified diff) to the current top-of-tree pcap-linux.c that implements that scheme. It also cleans up some things that looked as if they were problems:

as PF_RING sockets don't support a mechanism similar to PF_PACKET sockets' PACKET_ADD_MEMBERSHIP mechanism for enabling promiscuous mode (as well as "all-multicast" mode and adding particular multicast addresses), which automatically cancels the membership (thus turning promiscuous mode off) when the socket is closed, for PF_RING sockets, as with SOCK_PACKET sockets, we have to add a PF_RING pcap_t to the list of sockets to be closed on exit, so that if an app exits without explicitly closing a pcap_t, the pcap_t is still closed, and promiscuous mode is turned off (that doesn't handle exiting due to a signal, of course);

pfring_open() doesn't appear to handle NULL or "any" as the device argument, so we don't use PF_RING in that case;

it also doesn't use SOCK_DGRAM sockets, so, in cases where we fall back on cooked mode, we don't use PF_RING.
Index: pcap-linux.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-linux.c,v
retrieving revision 1.134
diff -u -r1.134 pcap-linux.c
--- pcap-linux.c        24 Jan 2008 20:20:08 -0000      1.134
+++ pcap-linux.c        24 Jan 2008 20:51:58 -0000
@@ -239,6 +239,13 @@
 static int pcap_getnonblock_mmap(pcap_t *p, char *errbuf);
 #endif
 
+#ifdef HAVE_PF_RING
+static int live_open_pf_ring(pcap_t *, const char *, int, int, char *);
+static void pcap_close_linux_pf_ring(pcap_t *);
+static int pcap_read_linux_pf_ring(pcap_t *, int, pcap_handler , u_char *);
+static int pcap_stats_linux_pf_ring(pcap_t *, struct pcap_stat *);
+#endif
+
 /*
  * Wrap some ioctl calls
  */
@@ -355,6 +362,30 @@
                return NULL;
        }
 
+#ifdef HAVE_PF_RING
+       /*
+        * Try to use the PF_RING socket mechanism first.
+        */
+       err = live_open_pf_ring(handle, device, promisc, to_ms, ebuf);
+       if (err == 1) {
+               /*
+                * That succeeded.
+                */
+               live_open_ok = 1;       /* succeeded */
+
+               /*
+                * Override certain operations.
+                */
+               handle->close_op = pcap_close_linux_pf_ring;
+               handle->read_op = pcap_read_linux_pf_ring;
+               handle->stats_op = pcap_stats_linux_pf_ring;
+       } else if (err == 0) {
+               /*
+                * That failed, but not fatally - try using the other
+                * mechanisms.
+                */
+#endif
+
        /*
         * Current Linux kernels use the protocol family PF_PACKET to
         * allow direct access to all packets on the network while
@@ -375,11 +406,14 @@
                if (live_open_old(handle, device, promisc, to_ms, ebuf))
                        live_open_ok = 1;
        }
+#ifdef HAVE_PF_RING
+       }
+#endif
        if (!live_open_ok) {
                /*
-                * Both methods to open the packet socket failed. Tidy
-                * up and report our failure (ebuf is expected to be
-                * set by the functions above).
+                * All methods to open the device for capturing failed.
+                * Tidy up and report our failure (ebuf is expected to
+                * be set by the functions above).
                 */
 
                if (handle->md.device != NULL)
@@ -2502,6 +2536,19 @@
        int ret;
        int save_errno;
 
+#ifdef HAVE_PF_RING
+       if (handle->ring) {
+               /*
+                * For PF_RING sockets, we don't do the flushing
+                * stuff.
+                * XXX - do we need to do so?
+                * XXX - should this be SOL_SOCKET or 0?
+                */
+               return setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
+                                fcode, sizeof(*fcode));
+       }
+#endif
+
        /*
         * The socket filter code doesn't discard all packets queued
         * up on the socket when the filter is changed; this means
@@ -2611,3 +2658,364 @@
                                   &dummy, sizeof(dummy));
 }
 #endif
+
+#ifdef HAVE_PF_RING
+
+/* ===== Functions to interface to the PF_RING mechanism ================== */
+
+/*
+ *  Try to open a packet socket using the PF_RING mechanism.
+ *
+ *  Returns 1 on success, 0 on a non-fatal failure (so the caller can
+ *  fall back on the PF_PACKET/SOCK_PACKET mechanisms), and -2 on a
+ *  fatal failure (the other mechanisms would presumably fail the same
+ *  way, so don't bother trying them).
+ *
+ *  NOTE(review): "to_ms" is accepted but never used here - confirm
+ *  whether PF_RING supports a read timeout that it should be passed to.
+ */
+static int
+live_open_pf_ring(pcap_t *handle, const char *device, int promisc,
+                 int to_ms, char *ebuf)
+{
+       int                     sock_fd, arptype;
+
+       /*
+        * The PF_RING library doesn't support a PF_RING socket not
+        * bound to a device (it assumes the "device" argument
+        * is non-null).
+        */
+       if (device == NULL)
+               return 0;
+
+       /* The cast discards const - pfring_open() takes a char *. */
+       handle->ring = pfring_open((char*)device, promisc, 1);
+       if (handle->ring == NULL)
+               return 0;
+       sock_fd = handle->fd = handle->ring->fd;
+       handle->bufsize = handle->snapshot;
+
+       /* It seems the kernel supports the new interface. */
+       handle->md.sock_packet = 0;
+
+       /*
+        * Get the interface index of the loopback device.
+        * If the attempt fails, don't fail, just set the
+        * "md.lo_ifindex" to -1.
+        *
+        * XXX - can there be more than one device that loops
+        * packets back, i.e. devices other than "lo"?  If so,
+        * we'd need to find them all, and have an array of
+        * indices for them, and check all of them in
+        * "pcap_read_packet()".
+        */
+       handle->md.lo_ifindex = iface_get_id(sock_fd, "lo", ebuf);
+
+       /*
+        * Default value for offset to align link-layer payload
+        * on a 4-byte boundary.
+        */
+       handle->offset   = 0;
+
+       /*
+        * What kind of frames do we have to deal with?  Fail,
+        * but not fatally, if we have an unknown interface type,
+        * so that we'll try a PF_PACKET socket.
+        */
+       /* Assume for now we don't need cooked mode. */
+       handle->md.cooked = 0;
+
+       arptype = iface_get_arptype(sock_fd, device, ebuf);
+       if (arptype == -1) {
+               /*
+                * Shut down the ring.
+                *
+                * NOTE(review): unlike the non-fatal failure path
+                * below, handle->ring is not reset to NULL here -
+                * confirm nothing looks at it after this fails.
+                */
+               pfring_close(handle->ring);
+
+               /*
+                * Get rid of any link-layer type list we allocated.
+                */
+               if (handle->dlt_list != NULL)
+                       free(handle->dlt_list);
+               /*
+                * This is a fatal error; we won't try using
+                * PF_PACKET sockets, as they'll presumably
+                * get the same error from iface_get_arptype().
+                */
+               return -2;
+       }
+       map_arphrd_to_dlt(handle, arptype, 1);
+       if (handle->linktype == -1 ||
+           handle->linktype == DLT_LINUX_SLL ||
+           handle->linktype == DLT_LINUX_IRDA ||
+           handle->linktype == DLT_LINUX_LAPD ||
+           (handle->linktype == DLT_EN10MB &&
+            (strncmp("isdn", device, 4) == 0 ||
+             strncmp("isdY", device, 4) == 0))) {
+               /*
+                * Unknown interface type (-1), or a
+                * device we explicitly chose to run
+                * in cooked mode (e.g., PPP devices),
+                * or an ISDN device (whose link-layer
+                * type we can only determine by using
+                * APIs that may be different on different
+                * kernels) - fail, as PF_RING sockets only
+                * support SOCK_RAW, not SOCK_DGRAM, so
+                * there's no cooked mode.
+                */
+               /*
+                * Get rid of any link-layer type list
+                * we allocated - this only supports cooked
+                * capture.
+                */
+               if (handle->dlt_list != NULL) {
+                       free(handle->dlt_list);
+                       handle->dlt_list = NULL;
+                       handle->dlt_count = 0;
+               }
+               pfring_close(handle->ring);
+               handle->ring = NULL;
+               return 0;
+       }
+
+       /*
+        * PF_RING uses the old SIOCSIFFLAGS-based mechanism for turning
+        * promiscuous mode on and off, so, just as we have to do when
+        * using SOCK_PACKET sockets, we have to add this to the list
+        * of pcaps to close when we exit, so promiscuous mode gets
+        * turned off even if the application exits without explicitly
+        * closing the pcap_t.
+        */
+       handle->md.next = pcaps_to_close;
+       pcaps_to_close = handle;
+
+       return 1;
+}
+
+/*
+ *  Close a pcap_t that was opened with a PF_RING socket, then let the
+ *  regular Linux close routine do the remaining cleanup (including
+ *  turning promiscuous mode back off).
+ *
+ *  NOTE(review): handle->ring is not reset to NULL after
+ *  pfring_close() - confirm pcap_close_linux() and callers don't
+ *  dereference it afterwards.
+ */
+static void
+pcap_close_linux_pf_ring(pcap_t *handle)
+{
+       pfring_close(handle->ring);
+
+       /*
+        * pfring_close() already closed the file descriptor, so set
+        * handle->fd to -1 so pcap_close_common() doesn't close it.
+        */
+       handle->fd = -1;
+       pcap_close_linux(handle);
+}
+
+/*
+ *  Read a packet from the PF_RING socket, calling the handler provided
+ *  by the user.  Returns the number of packets handed to the callback
+ *  (1, or 0 if the packet was rejected by the userland filter), -1 if
+ *  an error occurred, or -2 if we were told to break out of the loop
+ *  via pcap_breakloop().
+ *
+ *  NOTE(review): the forward declaration of this routine earlier in
+ *  the patch declares an extra "int" (packet count) parameter that
+ *  this definition doesn't have - one of the two needs fixing.
+ */
+static int
+pcap_read_linux_pf_ring(pcap_t *handle, pcap_handler callback, u_char *userdata)
+{
+       u_char                  *bp;
+       int                     packet_len, caplen;
+       struct pfring_pkthdr    pcap_header;
+
+       for (;;) {
+               if (handle->break_loop) {
+                       /*
+                        * Yes - clear the flag that indicates that it
+                        * has, and return -2 as an indication that we
+                        * were told to break out of the loop.
+                        *
+                        * Patch courtesy of Michael Stiller <ms () 2scale net>
+                        */
+                       handle->break_loop = 0;
+                       return -2;
+               }
+
+               packet_len = pfring_recv(handle->ring, (char*)handle->buffer,
+                                        handle->bufsize,
+                                        &pcap_header,
+                                        1 /* wait_for_incoming_packet */);
+               if (packet_len > 0) {
+                       bp = handle->buffer;
+                       pcap_header.caplen = min(pcap_header.caplen, handle->bufsize);
+                       caplen = pcap_header.caplen, packet_len = pcap_header.len;
+                       break;
+               } else if (packet_len == -1 && errno == EINTR)
+                       continue;
+               else
+                       return -1;
+       }
+
+       /*
+        * XXX: According to the kernel source we should get the real
+        * packet len if calling recvfrom with MSG_TRUNC set. It does
+        * not seem to work here :(, but it is supported by this code
+        * anyway.
+        * To be honest the code RELIES on that feature so this is really
+        * broken with 2.2.x kernels.
+        * I spent a day to figure out what's going on and I found out
+        * that the following is happening:
+        *
+        * The packet comes from a random interface and the packet_rcv
+        * hook is called with a clone of the packet. That code inserts
+        * the packet into the receive queue of the packet socket.
+        * If a filter is attached to that socket that filter is run
+        * first - and there lies the problem. The default filter always
+        * cuts the packet at the snaplen:
+        *
+        * # tcpdump -d
+        * (000) ret      #68
+        *
+        * So the packet filter cuts down the packet. The recvfrom call
+        * says "hey, it's only 68 bytes, it fits into the buffer" with
+        * the result that we don't get the real packet length. This
+        * is valid at least until kernel 2.2.17pre6.
+        *
+        * We currently handle this by making a copy of the filter
+        * program, fixing all "ret" instructions with non-zero
+        * operands to have an operand of 65535 so that the filter
+        * doesn't truncate the packet, and supplying that modified
+        * filter to the kernel.
+        *
+        * XXX - does any of that apply for PF_RING?
+        */
+       /*
+        * NOTE(review): the "caplen" computed from pcap_header.caplen
+        * inside the loop above is dead - it's overwritten here - and
+        * the header handed to the callback below still carries
+        * pcap_header.caplen clamped only to the buffer size, not to
+        * the snapshot length.  Confirm this is intended.
+        */
+       caplen = packet_len;
+       if (caplen > handle->snapshot)
+               caplen = handle->snapshot;
+
+       /* Run the packet filter if not using kernel filter */
+       if (!handle->md.use_bpf && handle->fcode.bf_insns) {
+               if (bpf_filter(handle->fcode.bf_insns, bp,
+                               packet_len, caplen) == 0)
+               {
+                       /* rejected by filter */
+                       return 0;
+               }
+       }
+
+       /*
+        * Count the packet.
+        *
+        * Arguably, we should count them before we check the filter,
+        * as on many other platforms "ps_recv" counts packets
+        * handed to the filter rather than packets that passed
+        * the filter, but if filtering is done in the kernel, we
+        * can't get a count of packets that passed the filter,
+        * and that would mean the meaning of "ps_recv" wouldn't
+        * be the same on all Linux systems.
+        *
+        * XXX - it's not the same on all systems in any case;
+        * ideally, we should have a "get the statistics" call
+        * that supplies more counts and indicates which of them
+        * it supplies, so that we supply a count of packets
+        * handed to the filter only on platforms where that
+        * information is available.
+        *
+        * We count them here even if we can get the packet count
+        * from the kernel, as we can only determine at run time
+        * whether we'll be able to get it from the kernel (if
+        * HAVE_TPACKET_STATS isn't defined, we can't get it from
+        * the kernel, but if it is defined, the library might
+        * have been built with a 2.4 or later kernel, but we
+        * might be running on a 2.2[.x] kernel without Alexey
+        * Kuznetzov's turbopacket patches, and thus the kernel
+        * might not be able to supply those statistics).  We
+        * could, I guess, try, when opening the socket, to get
+        * the statistics, and if we can not increment the count
+        * here, but it's not clear that always incrementing
+        * the count is more expensive than always testing a flag
+        * in memory.
+        *
+        * We keep the count in "md.packets_read", and use that for
+        * "ps_recv" if we can't get the statistics from the kernel.
+        * We do that because, if we *can* get the statistics from
+        * the kernel, we use "md.stat.ps_recv" and "md.stat.ps_drop"
+        * as running counts, as reading the statistics from the
+        * kernel resets the kernel statistics, and if we directly
+        * increment "md.stat.ps_recv" here, that means it will
+        * count packets *twice* on systems where we can get kernel
+        * statistics - once here, and once in pcap_stats_linux().
+        */
+       handle->md.packets_read++;
+
+       /* Call the user supplied callback function */
+        callback(userdata, (struct pcap_pkthdr*)&pcap_header, bp);
+       return 1;
+}
+
+/*
+ *  Get the statistics for the given packet capture handle.
+ *  Returns 0 on success, filling in *stats; returns -1 and sets
+ *  handle->errbuf on failure.
+ *
+ *  NOTE(review): PACKET_STATISTICS is a PF_PACKET-level socket
+ *  option; confirm that PF_RING sockets actually honor it, otherwise
+ *  this will always take the error path.  Note also that, unlike
+ *  pcap_stats_linux(), there is no fallback to md.packets_read here.
+ */
+static int
+pcap_stats_linux_pf_ring(pcap_t *handle, struct pcap_stat *stats)
+{
+       struct tpacket_stats kstats;
+       socklen_t len = sizeof (struct tpacket_stats);
+
+       /*
+        * Try to get the packet counts from the kernel.
+        */
+       if (getsockopt(handle->fd, SOL_PACKET, PACKET_STATISTICS,
+                       &kstats, &len) > -1) {
+               /*
+                * On systems where the PACKET_STATISTICS "getsockopt()"
+                * argument is supported on PF_PACKET sockets:
+                *
+                *      "ps_recv" counts only packets that *passed* the
+                *      filter, not packets that didn't pass the filter.
+                *      This includes packets later dropped because we
+                *      ran out of buffer space.
+                *
+                *      "ps_drop" counts packets dropped because we ran
+                *      out of buffer space.  It doesn't count packets
+                *      dropped by the interface driver.  It counts only
+                *      packets that passed the filter.
+                *
+                *      Both statistics include packets not yet read from
+                *      the kernel by libpcap, and thus not yet seen by
+                *      the application.
+                *
+                * In "linux/net/packet/af_packet.c", at least in the
+                * 2.4.9 kernel, "tp_packets" is incremented for every
+                * packet that passes the packet filter *and* is
+                * successfully queued on the socket; "tp_drops" is
+                * incremented for every packet dropped because there's
+                * not enough free space in the socket buffer.
+                *
+                * When the statistics are returned for a PACKET_STATISTICS
+                * "getsockopt()" call, "tp_drops" is added to "tp_packets",
+                * so that "tp_packets" counts all packets handed to
+                * the PF_PACKET socket, including packets dropped because
+                * there wasn't room on the socket buffer - but not
+                * including packets that didn't pass the filter.
+                *
+                * In the BSD BPF, the count of received packets is
+                * incremented for every packet handed to BPF, regardless
+                * of whether it passed the filter.
+                *
+                * We can't make "pcap_stats()" work the same on both
+                * platforms, but the best approximation is to return
+                * "tp_packets" as the count of packets and "tp_drops"
+                * as the count of drops.
+                */
+               handle->md.stat.ps_recv = kstats.tp_packets;
+               handle->md.stat.ps_drop = kstats.tp_drops;
+               *stats = handle->md.stat;
+               return 0;
+       }
+       else
+       {
+               snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                   "pcap_stats: %s", pcap_strerror(errno));
+               return -1;
+       }
+}
+
+/*
+ * PF_RING-specific extension: thin wrapper around
+ * pfring_set_cluster(), exposed to libpcap users.
+ */
+int pcap_set_cluster(pfring *ring, u_int clusterId) { 
+  return(pfring_set_cluster(ring, clusterId)); 
+}
+
+/*
+ * PF_RING-specific extension: thin wrapper around
+ * pfring_remove_from_cluster(), exposed to libpcap users.
+ */
+int pcap_remove_from_cluster(pfring *ring) {
+  return(pfring_remove_from_cluster(ring)); 
+}
+
+/*
+ * PF_RING-specific extension: thin wrapper around
+ * pfring_set_reflector(), exposed to libpcap users.
+ */
+int pcap_set_reflector(pfring *ring, 
+                      char *reflectorDevice) {
+  return(pfring_set_reflector(ring, reflectorDevice)); 
+}
+
+/*
+ * Return the underlying PF_RING handle for a pcap_t, or NULL if
+ * pHandle is NULL (or if the pcap_t isn't using PF_RING, in which
+ * case handle->ring is presumably NULL - confirm).
+ */
+pfring* pcap_get_pfring_handle(const pcap_t *pHandle) {
+  return(pHandle ? pHandle->ring : NULL);
+}
+#endif /* HAVE_PF_RING */
-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.

Current thread: