A couple of numbers: remapping a physical page took about 25-50% less time than copying 1500 bytes with memcpy(), and was up to about 15 times faster just after reboot, i.e. with nothing in the cache.
The CPU is a Xeon with HT enabled: cpu family : 15 model : 2 model name : Intel(R) Xeon(TM) CPU 2.40GHz stepping : 7 cpu MHz : 800.384 1. packet_mmap_test: 1000 remaps took 1495 usec. packet_mmap_test: 1000 copyings took 1988 usec. 2. packet_mmap_test: 1000 remaps took 1406 usec. packet_mmap_test: 1000 copyings took 2613 usec. 3. And just after reboot, when there is nothing in the cache: packet_mmap_test: 1000 remaps took 1387 usec. packet_mmap_test: 1000 copyings took 20173 usec. 4. Yet another "just after reboot" run: packet_mmap_test: 1000 remaps took 1295 usec. packet_mmap_test: 1000 copyings took 14889 usec. The copying above is done using an arbitrary kernel virtual address as the source address, with PAGE_SIZE added to it before each memcpy(). On Thu, Jul 28, 2005 at 12:44:41PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) wrote: > Hello, developers. > > This cruft now works much better. > Unfortunately I had to add some scary PTE insults - you can find them in > update_address(). > One big nitpick is that this module cannot be unloaded if the application > does not close the socket - the socket is removed after the mapping is destroyed, > so I need to grab an MM reference, but cannot drop it. > Also it uses flush_tlb() all over the place, but it is the only macro > that can be used in modules - tlb_flush_page() and tlb_flush_one() are not > exported. It also has a race on startup, when there is only one page > mapped (the control page), but userspace (very simple) may want to access > data pages. > The control page contains a set of control structures, one per mapped page, > i.e. per mapped skb; each control structure has the offset of skb->mac.raw in the > page and a flags field. > > I would gladly listen to your comments. > Thanks. > > Included files: > af_tlb.[ch] - zero-copy sniffer implementation. > tlb_test.c - simple userspace sniffer. > > af_tlb.c > /* > * af_tlb.c > * > * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]> > * All rights reserved. 
> * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > * the Free Software Foundation; either version 2 of the License, or > * (at your option) any later version. > * > * This program is distributed in the hope that it will be useful, > * but WITHOUT ANY WARRANTY; without even the implied warranty of > * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > * GNU General Public License for more details. > * > * You should have received a copy of the GNU General Public License > * along with this program; if not, write to the Free Software > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ > > > #include <linux/config.h> > #include <linux/types.h> > #include <linux/sched.h> > #include <linux/mm.h> > #include <linux/fcntl.h> > #include <linux/socket.h> > #include <linux/in.h> > #include <linux/inet.h> > #include <linux/netdevice.h> > #include <linux/if_packet.h> > #include <linux/wireless.h> > #include <linux/kmod.h> > #include <net/ip.h> > #include <net/protocol.h> > #include <linux/skbuff.h> > #include <net/sock.h> > #include <linux/errno.h> > #include <linux/timer.h> > #include <linux/module.h> > #include <linux/moduleparam.h> > #include <linux/init.h> > #include <linux/workqueue.h> > > #include <linux/mempolicy.h> > #include <linux/rmap.h> > #include <linux/fs.h> > #include <linux/shm.h> > #include <linux/mm.h> > #include <linux/mman.h> > #include <linux/pagemap.h> > #include <linux/swap.h> > #include <linux/hugetlb.h> > #include <linux/mman.h> > #include <linux/slab.h> > #include <linux/swapops.h> > > #include <asm/io.h> > #include <asm/uaccess.h> > #include <asm/tlb.h> > #include <asm/tlbflush.h> > #include <asm/pgtable.h> > #include <asm/pgalloc.h> > #include <asm/uaccess.h> > #include <asm/cacheflush.h> > > #include "af_tlb.h" > > static unsigned int free_timeout = 10; > module_param(free_timeout, uint, 0); > > static void 
test_timer_func(void *data); > static DECLARE_WORK(w, test_timer_func, NULL); > > static void packet_free_skbs(struct packet_sock *po, int clear_last); > > static inline struct packet_sock *pkt_sk(struct sock *sk) > { > return (struct packet_sock *)sk; > } > > static void packet_sock_destruct(struct sock *sk) > { > BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); > BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); > > if (!sock_flag(sk, SOCK_DEAD)) { > printk("Attempt to release alive packet socket: %p\n", sk); > return; > } > } > > > static struct proto_ops packet_ops_spkt; > > static void dump_skb(struct sk_buff *skb) > { > struct ethhdr *eth; > int i; > > printk("shared=%d, cloned=%d, len=%4d: ", skb_shared(skb), > skb_cloned(skb), skb->len); > > eth = eth_hdr(skb); > > printk("MAC: proto=%04x, src=", eth->h_proto); > for (i=0; i<ETH_ALEN-1; ++i) > printk("%02x:", eth->h_source[i]); > printk("%02x, dst=", eth->h_source[ETH_ALEN-1]); > for (i=0; i<ETH_ALEN-1; ++i) > printk("%02x:", eth->h_dest[i]); > printk("%02x.\n", eth->h_dest[ETH_ALEN-1]); > } > > static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, > struct packet_type *pt) > { > struct sock *sk; > struct sockaddr_pkt *spkt; > struct packet_sock *po; > int err; > > sk = pt->af_packet_priv; > po = pkt_sk(sk); > > po->total++; > > /* > * Yank back the headers [hope the device set this > * right or kerboom...] > * > * Incoming packets have ll header pulled, > * push it back. > * > * For outgoing ones skb->data == skb->mac.raw > * so that this procedure is noop. > */ > > if (skb->pkt_type == PACKET_LOOPBACK) > goto out; > > if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) > goto oom; > > /* drop any routing info */ > dst_release(skb->dst); > skb->dst = NULL; > > spkt = (struct sockaddr_pkt*)skb->cb; > > skb_push(skb, skb->data-skb->mac.raw); > > /* > * The SOCK_PACKET socket receives _all_ frames. 
> */ > > spkt->spkt_family = dev->type; > strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); > spkt->spkt_protocol = skb->protocol; > > err = sock_queue_rcv_skb(sk, skb); > if (!err) > po->queued++; > else > po->dropped++; > > if (test_bit(PACKET_SOCKET_MAPPED, &po->flags)) > schedule_work(&w); > > if (!err) > return 0; > > out: > kfree_skb(skb); > oom: > return 0; > } > > > /* > * Close a PACKET socket. This is fairly simple. We immediately go > * to 'closed' state and remove our protocol entry in the device list. > */ > > static int packet_release(struct socket *sock) > { > struct sock *sk = sock->sk; > struct packet_sock *po; > > if (!sk) > return 0; > > po = pkt_sk(sk); > > sk_del_node_init(sk); > > if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) { > dev_remove_pack(&po->prot_hook); > clear_bit(PACKET_SOCKET_RUNNING, &po->flags); > __sock_put(sk); > } > > sock_orphan(sk); > sock->sk = NULL; > > printk("%s: Waiting to workqueue.\n", __func__); > > clear_bit(PACKET_SOCKET_RUNNING, &po->flags); > > cancel_delayed_work(&w); > flush_scheduled_work(); > > skb_queue_purge(&sk->sk_receive_queue); > skb_queue_purge(&po->sk_free_queue); > > printk("%s: releasing page.\n", __func__); > > free_page(po->page); > sock_put(sk); > > mmput(po->tsk->mm); > > return 0; > } > > /* > * Attach a packet hook. > */ > > static int packet_do_bind(struct sock *sk, struct net_device *dev, int > protocol) > { > struct packet_sock *po = pkt_sk(sk); > > lock_sock(sk); > > spin_lock(&po->bind_lock); > if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) { > __sock_put(sk); > clear_bit(PACKET_SOCKET_RUNNING, &po->flags); > po->num = 0; > spin_unlock(&po->bind_lock); > dev_remove_pack(&po->prot_hook); > spin_lock(&po->bind_lock); > } > > po->num = protocol; > po->prot_hook.type = protocol; > po->prot_hook.dev = dev; > > po->ifindex = dev ? 
dev->ifindex : 0; > > if (protocol == 0) > goto out_unlock; > > if (dev) { > if (dev->flags&IFF_UP) { > dev_add_pack(&po->prot_hook); > sock_hold(sk); > set_bit(PACKET_SOCKET_RUNNING, &po->flags); > } else { > sk->sk_err = ENETDOWN; > if (!sock_flag(sk, SOCK_DEAD)) > sk->sk_error_report(sk); > } > } else { > dev_add_pack(&po->prot_hook); > sock_hold(sk); > set_bit(PACKET_SOCKET_RUNNING, &po->flags); > } > > out_unlock: > spin_unlock(&po->bind_lock); > release_sock(sk); > return 0; > } > > static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int > addr_len) > { > struct sock *sk=sock->sk; > char name[15]; > struct net_device *dev; > int err = -ENODEV; > > strlcpy(name, uaddr->sa_data, sizeof(name)); > printk( "%s: name=%s.\n", __func__, name); > > if(addr_len!=sizeof(struct sockaddr)) > return -EINVAL; > > dev = dev_get_by_name(name); > if (dev) { > err = packet_do_bind(sk, dev, pkt_sk(sk)->num); > dev_put(dev); > } > return err; > } > > static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long > arg) > { > switch(cmd) { > default: > return dev_ioctl(cmd, (void __user *)arg); > } > return 0; > } > > static struct proto packet_proto = { > .name = "PACKET", > .owner = THIS_MODULE, > .obj_size = sizeof(struct packet_sock), > }; > > static int packet_sock_init(struct packet_sock *po, int protocol, struct sock > *sk) > { > skb_queue_head_init(&po->sk_free_queue); > > po->last = 0; > po->total = 0; > po->dropped = 0; > po->queued = 0; > po->flags = 0; > po->budget = 1; > po->next_free = jiffies + msecs_to_jiffies(free_timeout); > > spin_lock_init(&po->bind_lock); > > po->tsk = current; > > po->page = __get_free_page(GFP_KERNEL); > if (!po->page) > return -ENOMEM; > > memset((void *)po->page, 0, PAGE_SIZE); > > po->num = protocol; > po->prot_hook.func = packet_rcv_spkt; > po->prot_hook.af_packet_priv = sk; > > get_task_mm(po->tsk); > > return 0; > } > > static int packet_create(struct socket *sock, int protocol) > { > struct sock *sk; > 
struct packet_sock *po; > int err; > > if (!capable(CAP_NET_RAW)) > return -EPERM; > if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != > SOCK_PACKET) > return -ESOCKTNOSUPPORT; > > sock->state = SS_UNCONNECTED; > > err = -ENOBUFS; > sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); > if (sk == NULL) > goto err_out_exit; > > sock->ops = &packet_ops_spkt; > > sock_init_data(sock, sk); > > po = pkt_sk(sk); > sk->sk_family = PF_PACKET; > sk->sk_destruct = packet_sock_destruct; > > err = packet_sock_init(po, protocol, sk); > if (err) > goto err_out_sock_free; > > if (protocol) { > po->prot_hook.type = protocol; > dev_add_pack(&po->prot_hook); > sock_hold(sk); > set_bit(PACKET_SOCKET_RUNNING, &po->flags); > } > > return 0; > > err_out_sock_free: > sk_free(sk); > err_out_exit: > return err; > } > > static struct packet_shared *packet_find_shared_lazy(struct packet_sock *po, > struct sk_buff *skb) > { > u16 offset = offset_in_page(skb->mac.raw); > struct packet_shared *ps = (struct packet_shared *)po->page; > int i; > > for (i=0; i<po->budget; ++i) { > if (ps->offset == offset) > break; > > ps++; > } > > if (i == po->budget) > return NULL; > > return ps; > } > > static void packet_free_skbs(struct packet_sock *po, int clear_last) > { > struct sk_buff *skb; > int num = 0; > //struct sock *sk = po->prot_hook.af_packet_priv; > struct packet_shared *ps; > struct page *page; > > while ((!skb_queue_empty(&po->sk_free_queue) && po->free_queued > > po->budget) || clear_last > 0) { > spin_lock_bh(&po->sk_free_queue.lock); > skb = __skb_dequeue(&po->sk_free_queue); > if (skb) > po->free_queued--; > spin_unlock_bh(&po->sk_free_queue.lock); > > if (!skb) > break; > > ps = packet_find_shared_lazy(po, skb); > if (ps) { > if (!test_bit(PACKET_MAPPED, &ps->flags)) > printk("%s: pos=%d, offset=%04x, > flags=%08lx.\n", __func__, ps->pos, ps->offset, ps->flags); > clear_bit(PACKET_MAPPED, &ps->flags); > } > > page = virt_to_page(skb->mac.raw); > > put_page(page); 
> if (!page_count(page)) { > ClearPageReserved(page); > } > > kfree_skb(skb); > num++; > clear_last--; > } > #if 0 > printk("%s: freed=%d, free_queued=%d, qeued=%d [rmem=%d, > max=%d], budget=%d, queued=%lu, dropped=%lu, total=%lu.\n", > __func__, num, po->free_queued, > skb_queue_len(&sk->sk_receive_queue), > atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, > po->budget, po->queued, po->dropped, po->total); > #endif > } > > static inline pte_t *get_pte(struct vm_area_struct *vma, unsigned long addr) > { > pgd_t *pgd; > pud_t *pud; > pmd_t *pmd; > pte_t *pte; > > pgd = pgd_offset(vma->vm_mm, addr); > pud = pud_offset(pgd, addr); > pmd = pmd_offset(pud, addr); > > if (pmd_none(*pmd)) > vma->vm_mm->nr_ptes--; > > pte = pte_offset_map(pmd, addr); > > printk("%s: addr=%08lx, pte=%p, %08lx, pmd=%p, pud=%p, pgd=%p, > nr_pte=%ld.\n", > __func__, addr, pte, pte_val(*pte), pmd, pud, pgd, > vma->vm_mm->nr_ptes); > > > return pte; > } > > static inline void update_address(struct vm_area_struct *vma, unsigned long > addr, unsigned long pfn) > { > pte_t *pte; > struct page *page; > > pte = get_pte(vma, addr); > page = pfn_to_page(pfn); > > printk("%s: pfn=%08lx, valid=%d, page=%p, res=%d, mapcount=%d.\n", > __func__, pfn, pfn_valid(pfn), page, > PageReserved(page), page_mapcount(page)); > > pte_clear(vma->mm, addr, pte); > pte_unmap(pte); > } > > static void test_timer_func(void *data) > { > struct sock *sk = (struct sock *)data; > struct packet_sock *po; > struct packet_shared *ps; > struct sk_buff *skb; > unsigned long virt, start; > int num = 0; > > if (!sk) > return; > > po = pkt_sk(sk); > if (!po || !po->tsk || !po->tsk->mm || !test_bit(PACKET_SOCKET_RUNNING, > &po->flags) || !test_bit(PACKET_SOCKET_MAPPED, &po->flags)) > return; > > down_write(&po->tsk->mm->mmap_sem); > #if 1 > printk("%s: free_queued=%d, qeued=%d [rmem=%d, max=%d], budget=%d, > queued=%lu, dropped=%lu, total=%lu.\n", > __func__, po->free_queued, > skb_queue_len(&sk->sk_receive_queue), > 
atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, > po->budget, po->queued, po->dropped, po->total); > #endif > while (++num <= po->budget && (skb = > skb_dequeue(&sk->sk_receive_queue))) { > virt = (unsigned long)skb->mac.raw; > if (!virt) > goto out; > > start = po->vma->vm_start + PAGE_SIZE*(1+po->last); > ps = &((struct packet_shared *)po->page)[po->last]; > > printk("s=%08lx, p=%p, pos=%d, offset=%04x, flags=%08lx.\n", > start, virt_to_page(virt), ps->pos, ps->offset, ps->flags); > if (0) { > //int i; > > printk("offset=%4lx, num=%2d, last=%2d, users=%1d, > dataref=%1d: ", > offset_in_page(virt), num, po->last, > atomic_read(&skb->users), > atomic_read(&skb_shinfo(skb)->dataref)); > dump_skb(skb); > #if 0 > for (i=0; i<32; ++i) > printk("%02x ", ((unsigned char *)virt)[i]); > printk("\n"); > #endif > } > > /* > * This actually should not be flush_tlb(), > * but it is the only one call that can be used in modules. > * --zbr > */ > update_address(po->vma, start, __pa(virt) >> PAGE_SHIFT); > __flush_tlb(); > > SetPageReserved(virt_to_page(virt)); > get_page(virt_to_page(virt)); > if (remap_pfn_range(po->vma, start, __pa(virt) >> PAGE_SHIFT, > PAGE_SIZE, po->vma->vm_page_prot)) { > printk("Remapping error.\n"); > ClearPageReserved(virt_to_page(virt)); > goto out; > } > > flush_dcache_page(virt_to_page(virt)); > > if (test_bit(PACKET_MAPPED, &ps->flags)) > packet_free_skbs(po, 1); > > ps->offset = offset_in_page(virt); > set_bit(PACKET_MAPPED, &ps->flags); > > if (++po->last == po->budget) > po->last = 0; > > { > start = po->vma->vm_start; > > while (start < po->vma->vm_end) { > pte_t *pte = get_pte(po->vma, start); > > if (pte_present(*pte)) { > struct page *page = NULL; > unsigned long pfn = pte_pfn(*pte); > if (pfn_valid(pfn)) { > page = pfn_to_page(pfn); > > printk("s=%08lx, p=%p, r=%d, > m=%d, pfn=%08lx.\n", > start, page, > PageReserved(page), page_mapcount(page), pfn); > } else > printk("p=NULL, pfn=%08lx.\n", > pfn); > > } else { > printk("pte=%p is not 
present.\n", pte); > } > > start += PAGE_SIZE; > } > } > > out: > /* > * Actually here should be some smart algo, which will defer > skb freeing > * until userspace "read" it, so userspace should provide some > kind of callback, > * which will require write permisions to the area, so it > should be splitted. > * Or better just to free it after some timeout, say 100 msec > should be enough. > * --zbr > * > * Tricky algo is to place skbs into new list, which will be > traversed > * in a some interval and skbs will be unlinked and freed. > * Actually, there is no need to lock this queue against > freeing, since it happens > * synchroniously, but if someday freeing will be separate > nothing will be changed. > * --zbr > */ > > spin_lock_bh(&po->sk_free_queue.lock); > po->free_queued++; > __skb_queue_tail(&po->sk_free_queue, skb); > spin_unlock_bh(&po->sk_free_queue.lock); > } > #if 0 > if (time_after(jiffies, po->next_free)) { > po->next_free = jiffies + msecs_to_jiffies(free_timeout); > packet_free_skbs(po, 0); > } > #endif > printk("%s: UP: po->tsk->mm=%p.\n", __func__, po->tsk->mm); > up_write(&po->tsk->mm->mmap_sem); > > printk("%s finished.\n", __func__); > } > > static void packet_mm_open(struct vm_area_struct *vma) > { > struct file *file = vma->vm_file; > struct inode *inode = file->f_dentry->d_inode; > struct socket * sock = SOCKET_I(inode); > struct sock *sk = sock->sk; > > printk( "%s, sk=%p.\n", __func__, sk); > } > > static void packet_mm_close(struct vm_area_struct *vma) > { > struct file *file = vma->vm_file; > struct inode *inode = file->f_dentry->d_inode; > struct socket *sock = SOCKET_I(inode); > struct sock *sk = sock->sk; > > printk( "%s, sk=%p.\n", __func__, sk); > > if (sk) { > struct packet_sock *po = pkt_sk(sk); > > if (po) { > down_write(&vma->vm_mm->mmap_sem); > clear_bit(PACKET_SOCKET_MAPPED, &po->flags); > up_write(&vma->vm_mm->mmap_sem); > } > } > } > > static struct vm_operations_struct packet_mmap_ops = { > .open = packet_mm_open, > 
.close = packet_mm_close, > }; > > static int packet_mmap_test(struct socket *sock, struct vm_area_struct *vma) > { > int i; > struct timeval tv1, tv2; > unsigned long start = vma->vm_start; > u8 *data1, *data2; > > do_gettimeofday(&tv1); > for (i=0; i<1000; i++) { > > update_address(vma, start, __pa(PAGE_OFFSET) >> PAGE_SHIFT); > __flush_tlb(); > > if (remap_pfn_range(vma, start, > __pa(PAGE_OFFSET) >> PAGE_SHIFT, > PAGE_SIZE, > vma->vm_page_prot)) > break; > > start += PAGE_SIZE; > } > do_gettimeofday(&tv2); > > printk("%s: 1000 remaps took %lu usec.\n", __func__, (tv2.tv_sec - > tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec); > > data1 = kmalloc(PAGE_SIZE, GFP_KERNEL); > if (!data1) > return -ENOMEM; > data2 = kmalloc(PAGE_SIZE, GFP_KERNEL); > if (!data2) { > kfree(data2); > return -ENOMEM; > } > > do_gettimeofday(&tv1); > for (i=0; i<1000; i++) { > memcpy(data1, data2, 1500); > } > do_gettimeofday(&tv2); > > printk("%s: 1000 copyings took %lu usec.\n", __func__, (tv2.tv_sec - > tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec); > > kfree(data1); > kfree(data2); > > return 0; > } > > static int packet_mmap(struct file *file, struct socket *sock, struct > vm_area_struct *vma) > { > struct sock *sk = sock->sk; > struct packet_sock *po = pkt_sk(sk); > unsigned long size = vma->vm_end - vma->vm_start; > int err = 0; > > vma->vm_ops = &packet_mmap_ops; > > //err = packet_mmap_test(sock, vma); > if (err) > return err; > > lock_sock(sk); > po->budget = (size - PAGE_SIZE) / PAGE_SIZE; > > update_address(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT); > __flush_tlb(); > > SetPageReserved(virt_to_page(po->page)); > if (remap_pfn_range(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT, > PAGE_SIZE, vma->vm_page_prot)) { > ClearPageReserved(virt_to_page(po->page)); > err = -EIO; > goto err_out_unlock; > } > > po->vma = vma; > > release_sock(sk); > > INIT_WORK(&w, test_timer_func, sk); > > set_bit(PACKET_SOCKET_MAPPED, &po->flags); > > return 0; > > err_out_unlock: > 
release_sock(sk); > return err; > } > > static unsigned int packet_poll(struct file * file, struct socket *sock, > poll_table *wait) > { > struct sock *sk = sock->sk; > struct packet_sock *po = pkt_sk(sk); > unsigned int mask = datagram_poll(file, sock, wait); > > spin_lock_bh(&sk->sk_receive_queue.lock); > if (po->free_queued < po->total) > mask |= POLLIN | POLLRDNORM; > spin_unlock_bh(&sk->sk_receive_queue.lock); > return mask; > } > > static struct proto_ops packet_ops_spkt = { > .family = PF_PACKET, > .owner = THIS_MODULE, > .release = packet_release, > .bind = packet_bind, > .connect = sock_no_connect, > .socketpair = sock_no_socketpair, > .accept = sock_no_accept, > .getname = sock_no_getname, > .poll = packet_poll, > .ioctl = packet_ioctl, > .listen = sock_no_listen, > .shutdown = sock_no_shutdown, > .setsockopt = sock_no_setsockopt, > .getsockopt = sock_no_getsockopt, > .sendmsg = sock_no_sendmsg, > .recvmsg = sock_no_recvmsg, > .mmap = packet_mmap, > .sendpage = sock_no_sendpage, > }; > > static struct net_proto_family packet_family_ops = { > .family = PF_PACKET, > .create = packet_create, > .owner = THIS_MODULE, > }; > > static void __exit packet_exit(void) > { > sock_unregister(PF_PACKET); > proto_unregister(&packet_proto); > } > > static int __init packet_init(void) > { > int rc = proto_register(&packet_proto, 0); > > if (rc != 0) > goto out; > > sock_register(&packet_family_ops); > > printk("%s: initialized at %lu.\n", __func__, jiffies); > out: > return rc; > } > > module_init(packet_init); > module_exit(packet_exit); > MODULE_LICENSE("GPL"); > MODULE_ALIAS_NETPROTO(PF_PACKET); > > af_tlb.h > > /* > * af_tlb.h > * > * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]> > * All rights reserved. 
> * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > * the Free Software Foundation; either version 2 of the License, or > * (at your option) any later version. > * > * This program is distributed in the hope that it will be useful, > * but WITHOUT ANY WARRANTY; without even the implied warranty of > * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > * GNU General Public License for more details. > * > * You should have received a copy of the GNU General Public License > * along with this program; if not, write to the Free Software > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ > > #ifndef __AF_TLB_H > #define __AF_TLB_H > > enum packet_shared_flags { > PACKET_MAPPED = 0, > }; > > struct packet_shared { > __u16 offset; > __u16 reserved; > int pos; > long flags; > } __attribute__ ((packed)); > > #ifdef __KERNEL__ > > enum packet_flags { > PACKET_SOCKET_RUNNING = 0, > PACKET_SOCKET_MAPPED, > }; > > struct packet_sock { > struct sock sk; > struct packet_type prot_hook; > spinlock_t bind_lock; > > long flags; > int ifindex; > unsigned short num; > > struct vm_area_struct *vma; > > struct task_struct *tsk; > > int budget, last; > unsigned long page; > > struct sk_buff_head sk_free_queue; > int free_queued; > > unsigned long next_free; > > unsigned long queued; > unsigned long dropped; > unsigned long total; > }; > > #endif /* __KERNEL__ */ > > #endif /* __AF_TLB_H */ > > tlb_test.c > #include <sys/types.h> > #include <sys/socket.h> > #include <sys/mman.h> > #include <sys/poll.h> > > #include <stdio.h> > #include <string.h> > #include <stdlib.h> > #include <errno.h> > #include <unistd.h> > > #include <netinet/in.h> > #include <netinet/ip.h> > #include <net/ethernet.h> > > #include <linux/if_ether.h> > #include <linux/types.h> > > #include "af_tlb.h" > > #define PAGE_SIZE 4096 > static size_t mmap_size = 17*PAGE_SIZE; > > 
#define ulog(f, a...) do { fprintf(stderr, f, ##a); fflush(stderr); } while > (0) > #define NIPQUAD(addr) \ > ((unsigned char *)&addr)[0], \ > ((unsigned char *)&addr)[1], \ > ((unsigned char *)&addr)[2], \ > ((unsigned char *)&addr)[3] > > static __inline__ void set_bit(int bit, uint32_t *f) > { > *f |= (1<<bit); > } > > static __inline__ void clear_bit(int bit, uint32_t *f) > { > *f &= ~(1<<bit); > } > > static __inline__ int test_bit(int bit, uint32_t *f) > { > return ((*f >> bit) & 1); > } > > static void dump_data(void *ptr, __u16 offset, int size) > { > int i; > unsigned char *data = ptr + offset; > > ulog("%p: ", ptr); > for (i=0; i<size; ++i) > ulog("%02x ", data[i]); > ulog("\n"); > } > > static int dump_network(void *ptr, __u16 offset) > { > struct ether_header *eth = ptr + offset; > struct iphdr *ip; > char *proto; > int i; > unsigned short ether_type; > > //ulog("offset=%x: ", offset); > > ether_type = ntohs(eth->ether_type); > if (ether_type != ETH_P_IP && ether_type != ETH_P_ARP) { > //ulog("\n"); > return -1; > } > > ulog("MAC: proto=%04x, src=", eth->ether_type); > for (i=0; i<ETH_ALEN-1; ++i) > ulog("%02x:", eth->ether_shost[i]); > ulog("%02x, dst=", eth->ether_shost[ETH_ALEN-1]); > for (i=0; i<ETH_ALEN-1; ++i) > ulog("%02x:", eth->ether_dhost[i]); > ulog("%02x. 
", eth->ether_dhost[ETH_ALEN-1]); > > if (ether_type != ETH_P_IP) { > dump_data(ptr, offset + sizeof(*eth), 16); > return 0; > } > > ip = (struct iphdr *)(ptr + offset + sizeof(*eth)); > > switch (ip->protocol) { > case IPPROTO_TCP: > proto = "TCP "; > break; > case IPPROTO_UDP: > proto = "UDP "; > break; > case IPPROTO_ICMP: > proto = "ICMP"; > break; > default: > proto = "UNKN"; > dump_data(ptr, offset + sizeof(*eth), 16); > return 0; > } > > ulog("%s: ", proto); > ulog("%u.%u.%u.%u -> %u.%u.%u.%u.\n", NIPQUAD(ip->saddr), > NIPQUAD(ip->daddr)); > > return 0; > } > > int main(int argc, char *argv[]) > { > struct sockaddr sa; > int s, err, num, i, j; > socklen_t len = sizeof(sa); > void *mmap_ptr; > struct packet_shared *ps, *ops; > void *old_ps; > struct pollfd pfd; > > if (argc > 1) > memcpy(sa.sa_data, argv[1], sizeof(sa.sa_data)); > else > memcpy(sa.sa_data, "eth0", sizeof(sa.sa_data)); > > old_ps = malloc(PAGE_SIZE); > if (!old_ps) { > ulog("Failed to allocate backup packet shared page.\n"); > return -ENOMEM; > } > > memset(old_ps, 0, PAGE_SIZE); > > s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > if (s == -1) { > ulog("Failed to create PF_PACKET socket: %s [%d].\n", > strerror(errno), errno); > err = -errno; > goto err_out_free_old_ps; > } > > mmap_ptr = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, s, 0); > if (mmap_ptr == MAP_FAILED) { > ulog("Failed to map socket %d: %s [%d].\n", s, strerror(errno), > errno); > err = -errno; > goto err_out_close; > } > > err = bind(s, &sa, len); > if (err == -1) { > ulog("Failed to bind socket %d to device %s: %s [%d].\n", > s, sa.sa_data, strerror(errno), errno); > goto err_out_unmap; > } > > pfd.fd = s; > pfd.events = POLLIN; > pfd.revents = 0; > > num = (mmap_size - PAGE_SIZE) / PAGE_SIZE; > > j = 0; > while (1) { > /*err = poll(&pfd, 1, -1); > > if ((err == 0 || err == -1) && (errno != EINTR)) { > err = -errno; > break; > }*/ > > ps = (struct packet_shared *)mmap_ptr; > ops = (struct packet_shared *)old_ps; > > for 
(i=0; i<num; ++i) { > void *ptr = mmap_ptr + PAGE_SIZE*(i+1); > > if (test_bit(PACKET_MAPPED, &ps->flags) && ps->offset > != ops->offset) { > err = dump_network(ptr, ps->offset); > if (++j > 1000) > goto err_out_unmap; > } > #if 0 > if (err && ps->offset) > dump_data(ptr, ps->offset, 32); > #endif > *ops++ = *ps++; > } > > pfd.events = POLLIN; > pfd.revents = 0; > } > > err = 0; > > err_out_unmap: > munmap(mmap_ptr, mmap_size); > err_out_close: > close(s); > err_out_free_old_ps: > free(old_ps); > > return err; > } > > Makefile. > > obj-m := af_tlb.o > > KDIR := /lib/modules/`uname -r`/build > #KDIR := /usr/local/src/linux-2.6 > PWD := $(shell pwd) > UCFLAGS := -W -Wall > > default: > $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules > > test: > gcc $(UCFLAGS) tlb_test.c -o tlb_test > > clean: > $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) clean > @rm -f *~ > > -- > Evgeniy Polyakov > - > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Evgeniy Polyakov - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html