A couple of numbers...
Remapping a physical page took about 25-50% less time than copying 1500
bytes with memcpy().
It was up to about 15 times faster just after reboot, i.e. with nothing
in the cache.

CPU is Xeon with HT enabled:
cpu family      : 15
model           : 2
model name      : Intel(R) Xeon(TM) CPU 2.40GHz
stepping        : 7
cpu MHz         : 800.384


1.
packet_mmap_test: 1000 remaps took 1495 usec.
packet_mmap_test: 1000 copyings took 1988 usec.

2.
packet_mmap_test: 1000 remaps took 1406 usec.
packet_mmap_test: 1000 copyings took 2613 usec.

3. And just after reboot, when there is nothing in cache:
packet_mmap_test: 1000 remaps took 1387 usec.
packet_mmap_test: 1000 copyings took 20173 usec.

4. Yet another "just after reboot":
packet_mmap_test: 1000 remaps took 1295 usec.
packet_mmap_test: 1000 copyings took 14889 usec.

The copying above is done using an arbitrary kernel virtual address
as the source address, with PAGE_SIZE added to it before each
memcpy().

On Thu, Jul 28, 2005 at 12:44:41PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) 
wrote:
> Hello, developers.
> 
> This cruft now works much better.
> Unfortunately I needed to add some scary PTE hacks - you can find them in
> update_address().
> One big problem is that this module cannot be unloaded if the application
> does not close the socket - the socket is removed after the mapping is
> destroyed, so I need to grab an MM reference, but cannot drop it.
> Also it uses flush_tlb() all over the place, but that is the only macro
> that can be used in modules - tlb_flush_page() and tlb_flush_one() are not
> exported. It also has a race on startup, when only one page (the control
> page) is mapped, but the (very simple) userspace may want to access
> data pages.
> The control page contains a set of control structures, one per mapped page,
> i.e. per mapped skb; each control structure holds the offset of
> skb->mac.raw within the page and a flags field.
> 
> I would be glad to hear your comments.
> Thanks.
> 
> Included files: 
> af_tlb.[ch] - zero-copy sniffer implementation.
> tlb_test.c - simple userspace sniffer.
> 
> af_tlb.c
> /*
>  *    af_tlb.c
>  * 
>  * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
>  * All rights reserved.
>  * 
>  * This program is free software; you can redistribute it and/or modify
>  * it under the terms of the GNU General Public License as published by
>  * the Free Software Foundation; either version 2 of the License, or
>  * (at your option) any later version.
>  *
>  * This program is distributed in the hope that it will be useful,
>  * but WITHOUT ANY WARRANTY; without even the implied warranty of
>  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>  * GNU General Public License for more details.
>  *
>  * You should have received a copy of the GNU General Public License
>  * along with this program; if not, write to the Free Software
>  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
>  */
> 
>  
> #include <linux/config.h>
> #include <linux/types.h>
> #include <linux/sched.h>
> #include <linux/mm.h>
> #include <linux/fcntl.h>
> #include <linux/socket.h>
> #include <linux/in.h>
> #include <linux/inet.h>
> #include <linux/netdevice.h>
> #include <linux/if_packet.h>
> #include <linux/wireless.h>
> #include <linux/kmod.h>
> #include <net/ip.h>
> #include <net/protocol.h>
> #include <linux/skbuff.h>
> #include <net/sock.h>
> #include <linux/errno.h>
> #include <linux/timer.h>
> #include <linux/module.h>
> #include <linux/moduleparam.h>
> #include <linux/init.h>
> #include <linux/workqueue.h>
> 
> #include <linux/mempolicy.h>
> #include <linux/rmap.h>
> #include <linux/fs.h>
> #include <linux/shm.h>
> #include <linux/mm.h>
> #include <linux/mman.h>
> #include <linux/pagemap.h>
> #include <linux/swap.h>
> #include <linux/hugetlb.h>
> #include <linux/mman.h>
> #include <linux/slab.h>
> #include <linux/swapops.h>
> 
> #include <asm/io.h>
> #include <asm/uaccess.h>
> #include <asm/tlb.h>
> #include <asm/tlbflush.h>
> #include <asm/pgtable.h>
> #include <asm/pgalloc.h>
> #include <asm/uaccess.h>
> #include <asm/cacheflush.h>
> 
> #include "af_tlb.h"
> 
> /* Module parameter; converted with msecs_to_jiffies() in packet_sock_init(). */
> static unsigned int free_timeout = 10;
> module_param(free_timeout, uint, 0);
> 
> /* Single file-scope work item; its handler is test_timer_func(). */
> static void test_timer_func(void *data);
> static DECLARE_WORK(w, test_timer_func, NULL);
> 
> /* Forward declaration; the definition appears further down in this file. */
> static void packet_free_skbs(struct packet_sock *po, int clear_last);
> 
> /* View a generic sock pointer as the packet_sock it is cast to. */
> static inline struct packet_sock *pkt_sk(struct sock *sk)
> {
>       struct packet_sock *po = (struct packet_sock *)sk;
> 
>       return po;
> }
> 
> /*
>  * Socket destructor: BUG_TRAP-checks that no receive/send memory is still
>  * accounted to the socket and complains if the socket is not flagged
>  * SOCK_DEAD.
>  */
> static void packet_sock_destruct(struct sock *sk)
> {
>       BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
>       BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
> 
>       /* Nothing more to report for a socket that is already dead. */
>       if (sock_flag(sk, SOCK_DEAD))
>               return;
> 
>       printk("Attempt to release alive packet socket: %p\n", sk);
> }
> 
> 
> /* Forward declaration; the initializer appears later in this file. */
> static struct proto_ops packet_ops_spkt;
> 
> /* Debug helper: print sharing state, length and Ethernet header of an skb. */
> static void dump_skb(struct sk_buff *skb)
> {
>       struct ethhdr *hdr;
>       int n;
> 
>       printk("shared=%d, cloned=%d, len=%4d: ",
>              skb_shared(skb), skb_cloned(skb), skb->len);
> 
>       hdr = eth_hdr(skb);
>       printk("MAC: proto=%04x, src=", hdr->h_proto);
> 
>       /* Every address byte but the last is followed by a ':'. */
>       n = 0;
>       while (n < ETH_ALEN - 1)
>               printk("%02x:", hdr->h_source[n++]);
>       printk("%02x, dst=", hdr->h_source[ETH_ALEN - 1]);
> 
>       n = 0;
>       while (n < ETH_ALEN - 1)
>               printk("%02x:", hdr->h_dest[n++]);
>       printk("%02x.\n", hdr->h_dest[ETH_ALEN - 1]);
> }
> 
> /*
>  * Packet-type receive hook (installed as po->prot_hook.func).
>  *
>  * Finds the owning socket through pt->af_packet_priv, pushes the
>  * link-level header back, stores a sockaddr_pkt in skb->cb and queues the
>  * skb on the socket.  Loopback packets and packets that could not be
>  * queued are freed.  Always returns 0.
>  */
> static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  
> struct packet_type *pt)
> {
>       struct sock *sk;
>       struct sockaddr_pkt *spkt;
>       struct packet_sock *po;
>       int err;
> 
>       sk = pt->af_packet_priv;
>       po = pkt_sk(sk);
> 
>       /* Counts every packet seen, including those dropped below. */
>       po->total++;
> 
>       /*
>        *      Yank back the headers [hope the device set this
>        *      right or kerboom...]
>        *
>        *      Incoming packets have ll header pulled,
>        *      push it back.
>        *
>        *      For outgoing ones skb->data == skb->mac.raw
>        *      so that this procedure is noop.
>        */
> 
>       if (skb->pkt_type == PACKET_LOOPBACK)
>               goto out;
> 
>       /* On failure skb is NULL here, so the "oom" exit skips kfree_skb(). */
>       if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
>               goto oom;
> 
>       /* drop any routing info */
>       dst_release(skb->dst);
>       skb->dst = NULL;
> 
>       spkt = (struct sockaddr_pkt*)skb->cb;
> 
>       skb_push(skb, skb->data-skb->mac.raw);
> 
>       /*
>        *      The SOCK_PACKET socket receives _all_ frames.
>        */
> 
>       spkt->spkt_family = dev->type;
>       strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
>       spkt->spkt_protocol = skb->protocol;
> 
>       err = sock_queue_rcv_skb(sk, skb);
>       if (!err)
>               po->queued++;
>       else
>               po->dropped++;
> 
>       /* Kick the work item only for sockets that have been mmap()ed. */
>       if (test_bit(PACKET_SOCKET_MAPPED, &po->flags))
>               schedule_work(&w);
> 
>       if (!err)
>               return 0;
> 
> out:
>       kfree_skb(skb);
> oom:
>       return 0;
> }
> 
> 
> /*
>  *    Close a PACKET socket. This is fairly simple. We immediately go
>  *    to 'closed' state and remove our protocol entry in the device list.
>  */
> 
> /*
>  * Release a socket: unhook it from the device layer, wait for the work
>  * item, purge both skb queues, free the control page and drop the socket
>  * and mm references.  Returns 0.
>  */
> static int packet_release(struct socket *sock)
> {
>       struct sock *sk = sock->sk;
>       struct packet_sock *po;
>       struct mm_struct *mm;
> 
>       if (!sk)
>               return 0;
> 
>       po = pkt_sk(sk);
> 
>       sk_del_node_init(sk);
> 
>       if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
>               dev_remove_pack(&po->prot_hook);
>               clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
>               __sock_put(sk);
>       }
> 
>       sock_orphan(sk);
>       sock->sk = NULL;
> 
>       printk("%s: Waiting to workqueue.\n", __func__);
> 
>       cancel_delayed_work(&w);
>       flush_scheduled_work();
> 
>       /*
>        * NOTE(review): skbs on sk_free_queue are purged without the
>        * put_page()/ClearPageReserved() that packet_free_skbs() performs
>        * for them - confirm whether that is intended.
>        */
>       skb_queue_purge(&sk->sk_receive_queue);
>       skb_queue_purge(&po->sk_free_queue);
> 
>       printk("%s: releasing page.\n", __func__);
> 
>       free_page(po->page);
> 
>       /*
>        * Fetch the mm pointer before sock_put(): that call may drop the
>        * last reference and free the object 'po' points into.
>        * NOTE(review): po->tsk is the task that created the socket and no
>        * reference is held on it - confirm it is still valid here.
>        */
>       mm = po->tsk->mm;
> 
>       sock_put(sk);
> 
>       /* The task's mm may already be gone; mmput() does not accept NULL. */
>       if (mm)
>               mmput(mm);
> 
>       return 0;
> }
> 
> /*
>  *    Attach a packet hook.
>  */
> 
> /*
>  * Point the socket's packet hook at 'dev' for 'protocol'.
>  *
>  * A hook that is currently running is removed first.  With protocol == 0
>  * nothing is registered.  If 'dev' is given but not IFF_UP, sk_err is set
>  * to ENETDOWN and reported instead of registering.  Always returns 0.
>  */
> static int packet_do_bind(struct sock *sk, struct net_device *dev, int 
> protocol)
> {
>       struct packet_sock *po = pkt_sk(sk);
> 
>       lock_sock(sk);
> 
>       spin_lock(&po->bind_lock);
>       if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
>               __sock_put(sk);
>               clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
>               po->num = 0;
>               /* bind_lock is dropped around dev_remove_pack(). */
>               spin_unlock(&po->bind_lock);
>               dev_remove_pack(&po->prot_hook);
>               spin_lock(&po->bind_lock);
>       }
> 
>       po->num = protocol;
>       po->prot_hook.type = protocol;
>       po->prot_hook.dev = dev;
> 
>       po->ifindex = dev ? dev->ifindex : 0;
> 
>       if (protocol == 0)
>               goto out_unlock;
> 
>       if (dev) {
>               if (dev->flags&IFF_UP) {
>                       dev_add_pack(&po->prot_hook);
>                       /* Reference paired with __sock_put() when the hook is removed. */
>                       sock_hold(sk);
>                       set_bit(PACKET_SOCKET_RUNNING, &po->flags);
>               } else {
>                       sk->sk_err = ENETDOWN;
>                       if (!sock_flag(sk, SOCK_DEAD))
>                               sk->sk_error_report(sk);
>               }
>       } else {
>               dev_add_pack(&po->prot_hook);
>               sock_hold(sk);
>               set_bit(PACKET_SOCKET_RUNNING, &po->flags);
>       }
> 
> out_unlock:
>       spin_unlock(&po->bind_lock);
>       release_sock(sk);
>       return 0;
> }
> 
> /*
>  * bind() handler: uaddr->sa_data carries the name of the device to bind to.
>  *
>  * Returns -EINVAL if addr_len is not sizeof(struct sockaddr), -ENODEV if no
>  * device has that name, otherwise the result of packet_do_bind().
>  */
> static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
> {
>       struct sock *sk = sock->sk;
>       char name[sizeof(uaddr->sa_data) + 1];
>       struct net_device *dev;
>       int err = -ENODEV;
> 
>       /* Validate the length before touching the address contents. */
>       if (addr_len != sizeof(struct sockaddr))
>               return -EINVAL;
> 
>       /*
>        * sa_data need not be NUL-terminated: copy exactly its size and
>        * terminate locally instead of letting a string function scan past
>        * the end of the field.
>        */
>       memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
>       name[sizeof(uaddr->sa_data)] = '\0';
>       printk( "%s: name=%s.\n", __func__, name);
> 
>       dev = dev_get_by_name(name);
>       if (dev) {
>               err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
>               dev_put(dev);
>       }
>       return err;
> }
> 
> /* ioctl handler: every command is forwarded unchanged to dev_ioctl(). */
> static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
> {
>       void __user *uarg = (void __user *)arg;
> 
>       return dev_ioctl(cmd, uarg);
> }
> 
> /*
>  * Protocol descriptor handed to sk_alloc() and proto_register();
>  * obj_size is the size of struct packet_sock.
>  */
> static struct proto packet_proto = {
>       .name     = "PACKET",
>       .owner    = THIS_MODULE,
>       .obj_size = sizeof(struct packet_sock),
> };
> 
> /*
>  * Initialise the packet-specific part of a freshly allocated socket.
>  *
>  * Takes a reference on the current task's mm, allocates and zeroes the
>  * control page and sets up counters, queue, lock and the receive hook.
>  *
>  * Returns 0 on success, -EINVAL if the current task has no mm, or -ENOMEM
>  * if the control page cannot be allocated.  Nothing is left allocated or
>  * referenced on failure.
>  */
> static int packet_sock_init(struct packet_sock *po, int protocol, struct sock *sk)
> {
>       struct mm_struct *mm;
> 
>       /*
>        * The mm reference is dropped again in packet_release(); refuse
>        * callers without an mm instead of failing there.
>        */
>       mm = get_task_mm(current);
>       if (!mm)
>               return -EINVAL;
> 
>       po->page = __get_free_page(GFP_KERNEL);
>       if (!po->page) {
>               mmput(mm);
>               return -ENOMEM;
>       }
>       memset((void *)po->page, 0, PAGE_SIZE);
> 
>       skb_queue_head_init(&po->sk_free_queue);
>       spin_lock_init(&po->bind_lock);
> 
>       /* Set every field explicitly rather than relying on the allocator. */
>       po->vma         = NULL;
>       po->last        = 0;
>       po->total       = 0;
>       po->dropped     = 0;
>       po->queued      = 0;
>       po->free_queued = 0;
>       po->flags       = 0;
>       po->budget      = 1;
>       po->next_free   = jiffies + msecs_to_jiffies(free_timeout);
> 
>       po->tsk = current;
> 
>       po->num = protocol;
>       po->prot_hook.func = packet_rcv_spkt;
>       po->prot_hook.af_packet_priv = sk;
> 
>       return 0;
> }
> 
> /*
>  * Create a socket of this family.
>  *
>  * Requires CAP_NET_RAW and a socket type of SOCK_DGRAM, SOCK_RAW or
>  * SOCK_PACKET.  If 'protocol' is non-zero the receive hook is registered
>  * immediately.
>  *
>  * Returns 0 on success, -EPERM, -ESOCKTNOSUPPORT, -ENOBUFS, or the error
>  * from packet_sock_init().
>  */
> static int packet_create(struct socket *sock, int protocol)
> {
>       struct sock *sk;
>       struct packet_sock *po;
>       int err;
> 
>       if (!capable(CAP_NET_RAW))
>               return -EPERM;
>       if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
>           sock->type != SOCK_PACKET)
>               return -ESOCKTNOSUPPORT;
> 
>       sock->state = SS_UNCONNECTED;
> 
>       err = -ENOBUFS;
>       sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
>       if (sk == NULL)
>               goto err_out_exit;
> 
>       sock->ops = &packet_ops_spkt;
> 
>       sock_init_data(sock, sk);
> 
>       po = pkt_sk(sk);
>       sk->sk_family = PF_PACKET;
>       sk->sk_destruct = packet_sock_destruct;
> 
>       err = packet_sock_init(po, protocol, sk);
>       if (err)
>               goto err_out_sock_free;
> 
>       if (protocol) {
>               po->prot_hook.type = protocol;
>               dev_add_pack(&po->prot_hook);
>               /* Reference paired with __sock_put() when the hook is removed. */
>               sock_hold(sk);
>               set_bit(PACKET_SOCKET_RUNNING, &po->flags);
>       }
> 
>       return 0;
> 
> err_out_sock_free:
>       /*
>        * Detach the half-built sk from 'sock' before freeing it.
>        * Presumably the caller releases 'sock' after a failed create;
>        * with these cleared it can neither call packet_release() nor
>        * reach the freed sk through sock->sk.
>        */
>       sock->sk = NULL;
>       sock->ops = NULL;
>       sk_free(sk);
> err_out_exit:
>       return err;
> }
> 
> /*
>  * Linear search of the control page for the entry whose recorded offset
>  * equals the in-page offset of skb->mac.raw.  Only the first po->budget
>  * entries are examined.  Returns the entry, or NULL if none matches.
>  */
> static struct packet_shared *packet_find_shared_lazy(struct packet_sock *po, struct sk_buff *skb)
> {
>       struct packet_shared *table = (struct packet_shared *)po->page;
>       u16 wanted = offset_in_page(skb->mac.raw);
>       int idx = 0;
> 
>       while (idx < po->budget && table[idx].offset != wanted)
>               idx++;
> 
>       return (idx == po->budget) ? NULL : &table[idx];
> }
> 
> /*
>  * Free skbs parked on po->sk_free_queue.
>  *
>  * Keeps dequeuing while the queue is non-empty and holds more than
>  * po->budget entries, or while 'clear_last' is still positive; stops early
>  * when the queue runs dry.  For every skb the matching control-page entry
>  * (if any) loses its PACKET_MAPPED bit, the reference on the page holding
>  * skb->mac.raw is dropped, and the skb is freed.
>  */
> static void packet_free_skbs(struct packet_sock *po, int clear_last)
> {
>       struct sk_buff *skb;
>       /* Only reported by the disabled printk at the end. */
>       int num = 0;
>       //struct sock *sk = po->prot_hook.af_packet_priv;
>       struct packet_shared *ps;
>       struct page *page;
>       
>       while ((!skb_queue_empty(&po->sk_free_queue) && po->free_queued > 
> po->budget) || clear_last > 0) {
>               spin_lock_bh(&po->sk_free_queue.lock);
>               skb = __skb_dequeue(&po->sk_free_queue);
>               if (skb)
>                       po->free_queued--;
>               spin_unlock_bh(&po->sk_free_queue.lock);
> 
>               if (!skb)
>                       break;
>               
>               ps = packet_find_shared_lazy(po, skb);
>               if (ps) {
>                       /* Log entries that were not marked mapped. */
>                       if (!test_bit(PACKET_MAPPED, &ps->flags))
>                               printk("%s: pos=%d, offset=%04x, 
> flags=%08lx.\n", __func__, ps->pos, ps->offset, ps->flags);
>                       clear_bit(PACKET_MAPPED, &ps->flags);
>               }
> 
>               page = virt_to_page(skb->mac.raw);
> 
>               /* Drops the reference taken with get_page() in test_timer_func(). */
>               put_page(page);
>               /* NOTE(review): the page is inspected after the reference was dropped - confirm this is safe. */
>               if (!page_count(page)) {
>                       ClearPageReserved(page);
>               }
>       
>               kfree_skb(skb);
>               num++;
>               clear_last--;
>       }
> #if 0
>               printk("%s: freed=%d, free_queued=%d, qeued=%d [rmem=%d, 
> max=%d], budget=%d, queued=%lu, dropped=%lu, total=%lu.\n", 
>                               __func__, num, po->free_queued, 
>                               skb_queue_len(&sk->sk_receive_queue), 
>                               atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, 
> po->budget, po->queued, po->dropped, po->total);
> #endif
> }
> 
> /*
>  * Walk the page tables of vma->vm_mm (pgd -> pud -> pmd) and return the
>  * PTE for 'addr' as mapped by pte_offset_map(), after printing the
>  * pointers found on the way.  update_address() pairs the result with
>  * pte_unmap().
>  */
> static inline pte_t *get_pte(struct vm_area_struct *vma, unsigned long addr)
> {
>       pgd_t *pgd;
>       pud_t *pud;
>       pmd_t *pmd;
>       pte_t *pte;
> 
>       pgd = pgd_offset(vma->vm_mm, addr);
>       pud = pud_offset(pgd, addr);
>       pmd = pmd_offset(pud, addr);
> 
>       /*
>        * NOTE(review): when the pmd is empty only nr_ptes is adjusted and
>        * pte_offset_map() is still called on it - confirm the intent.
>        */
>       if (pmd_none(*pmd))
>               vma->vm_mm->nr_ptes--;
> 
>       pte = pte_offset_map(pmd, addr);
> 
>       printk("%s: addr=%08lx, pte=%p, %08lx, pmd=%p, pud=%p, pgd=%p, 
> nr_pte=%ld.\n", 
>                       __func__, addr, pte, pte_val(*pte), pmd, pud, pgd, 
> vma->vm_mm->nr_ptes);
>       
> 
>       return pte;
> }
> 
> /*
>  * Clear the PTE that currently maps 'addr' in 'vma' so the address can be
>  * remapped by the caller.  'pfn' is only used for the debug print; callers
>  * in this file flush the TLB themselves afterwards.
>  */
> static inline void update_address(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)
> {
>       pte_t *pte;
>       struct page *page;
> 
>       pte = get_pte(vma, addr);
>       page = pfn_to_page(pfn);
> 
>       printk("%s: pfn=%08lx, valid=%d, page=%p, res=%d, mapcount=%d.\n",
>                       __func__, pfn, pfn_valid(pfn), page,
>                       PageReserved(page), page_mapcount(page));
> 
>       /* The owning mm is vma->vm_mm, the same field get_pte() walks. */
>       pte_clear(vma->vm_mm, addr, pte);
>       pte_unmap(pte);
> }
> 
> /*
>  * Work handler; 'data' is the struct sock installed by INIT_WORK() in
>  * packet_mmap().
>  *
>  * With the owning task's mmap_sem held for writing, takes up to
>  * po->budget skbs from the socket receive queue.  For each one, the page
>  * holding skb->mac.raw is remapped into the VMA at page index
>  * 1 + po->last (the page at vm_start itself is mapped in packet_mmap()),
>  * the matching control entry gets the in-page offset and PACKET_MAPPED,
>  * and the skb is moved to po->sk_free_queue.
>  *
>  * Does nothing unless the socket is both running and mapped.
>  */
> static void test_timer_func(void *data)
> {
>       struct sock *sk = (struct sock *)data;
>       struct packet_sock *po;
>       struct packet_shared *ps;
>       struct sk_buff *skb;
>       unsigned long virt, start;
>       int num = 0;
> 
>       if (!sk)
>               return;
> 
>       po = pkt_sk(sk);
>       if (!po || !po->tsk || !po->tsk->mm || !test_bit(PACKET_SOCKET_RUNNING, 
> &po->flags) || !test_bit(PACKET_SOCKET_MAPPED, &po->flags))
>               return;
> 
>       down_write(&po->tsk->mm->mmap_sem);
> #if 1
>       printk("%s: free_queued=%d, qeued=%d [rmem=%d, max=%d], budget=%d, 
> queued=%lu, dropped=%lu, total=%lu.\n", 
>                       __func__, po->free_queued, 
>                       skb_queue_len(&sk->sk_receive_queue), 
>                       atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, 
>                       po->budget, po->queued, po->dropped, po->total);
> #endif
>       /* At most po->budget skbs are handled per invocation. */
>       while (++num <= po->budget && (skb = 
> skb_dequeue(&sk->sk_receive_queue))) {
>               virt = (unsigned long)skb->mac.raw;
>               if (!virt)
>                       goto out;
>       
>               /* User address and control entry of the slot to reuse next. */
>               start = po->vma->vm_start + PAGE_SIZE*(1+po->last);
>               ps = &((struct packet_shared *)po->page)[po->last];
> 
>               printk("s=%08lx, p=%p, pos=%d, offset=%04x, flags=%08lx.\n", 
> start, virt_to_page(virt), ps->pos, ps->offset, ps->flags);
>               /* Disabled verbose dump. */
>               if (0) {
>                       //int i;
>               
>                       printk("offset=%4lx, num=%2d, last=%2d, users=%1d, 
> dataref=%1d: ", 
>                                       offset_in_page(virt), num, po->last, 
>                                       atomic_read(&skb->users), 
> atomic_read(&skb_shinfo(skb)->dataref));
>                       dump_skb(skb);
> #if 0
>                       for (i=0; i<32; ++i)
>                               printk("%02x ", ((unsigned char *)virt)[i]);
>                       printk("\n");
> #endif
>               }
> 
>               /*
>                * This actually should not be flush_tlb(), 
>                * but it is the only one call that can be used in modules.
>                * --zbr
>                */
>               update_address(po->vma, start, __pa(virt) >> PAGE_SHIFT);
>               __flush_tlb();
>               
>               /* This page reference is dropped in packet_free_skbs(). */
>               SetPageReserved(virt_to_page(virt));
>               get_page(virt_to_page(virt));
>               if (remap_pfn_range(po->vma, start, __pa(virt) >> PAGE_SHIFT, 
> PAGE_SIZE, po->vma->vm_page_prot)) {
>                       printk("Remapping error.\n");
>                       ClearPageReserved(virt_to_page(virt));
>                       goto out;
>               }
> 
>               flush_dcache_page(virt_to_page(virt));
> 
>               /* The slot was already in use: free one parked skb first. */
>               if (test_bit(PACKET_MAPPED, &ps->flags))
>                       packet_free_skbs(po, 1);
> 
>               ps->offset = offset_in_page(virt);
>               set_bit(PACKET_MAPPED, &ps->flags);
> 
>               /* Slots are used round-robin. */
>               if (++po->last == po->budget)
>                       po->last = 0;
> 
>               /* Debug only: print the PTE state of every page in the VMA. */
>               {
>                       start = po->vma->vm_start;
> 
>                       while (start < po->vma->vm_end) {
>                               pte_t *pte = get_pte(po->vma, start);
> 
>                               if (pte_present(*pte)) {
>                                       struct page *page = NULL;
>                                       unsigned long pfn = pte_pfn(*pte);
>                                       if (pfn_valid(pfn)) {
>                                               page = pfn_to_page(pfn);
> 
>                                               printk("s=%08lx, p=%p, r=%d, 
> m=%d, pfn=%08lx.\n", 
>                                                               start, page, 
> PageReserved(page), page_mapcount(page), pfn);
>                                       } else
>                                               printk("p=NULL, pfn=%08lx.\n", 
> pfn);
> 
>                               } else {
>                                       printk("pte=%p is not present.\n", pte);
>                               }
> 
>                               start += PAGE_SIZE;
>                       }
>               }
> 
> out:
>               /*
>                * Actually here should be some smart algo, which will defer 
> skb freeing
>                * until userspace "read" it, so userspace should provide some 
> kind of callback,
>                * which will require write permisions to the area, so it 
> should be splitted.
>                * Or better just to free it after some timeout, say 100 msec 
> should be enough.
>                * --zbr
>                *
>                *  Tricky algo is to place skbs into new list, which will be 
> traversed 
>                *  in a some interval and skbs will be unlinked and freed.
>                *  Actually, there is no need to lock this queue against 
> freeing, since it happens
>                *  synchroniously, but if someday freeing will be separate 
> nothing will be changed.
>                *  --zbr
>                */
> 
>               /* Also reached via "goto out": the skb is parked either way. */
>               spin_lock_bh(&po->sk_free_queue.lock);
>               po->free_queued++;
>               __skb_queue_tail(&po->sk_free_queue, skb);
>               spin_unlock_bh(&po->sk_free_queue.lock);
>       }
> #if 0
>       if (time_after(jiffies, po->next_free)) {
>               po->next_free = jiffies + msecs_to_jiffies(free_timeout);
>               packet_free_skbs(po, 0);
>       }
> #endif
>       printk("%s: UP: po->tsk->mm=%p.\n", __func__, po->tsk->mm);
>       up_write(&po->tsk->mm->mmap_sem);
> 
>       printk("%s finished.\n", __func__);
> }
> 
> /* VMA open callback: only logs the sock behind the mapped file. */
> static void packet_mm_open(struct vm_area_struct *vma)
> {
>       struct socket *sock = SOCKET_I(vma->vm_file->f_dentry->d_inode);
>       struct sock *sk = sock->sk;
> 
>       printk( "%s, sk=%p.\n", __func__, sk);
> }
> 
> /*
>  * VMA close callback: logs the sock behind the mapped file and, if there
>  * is one, clears PACKET_SOCKET_MAPPED while holding the mm's mmap_sem for
>  * writing.
>  */
> static void packet_mm_close(struct vm_area_struct *vma)
> {
>       struct socket *sock = SOCKET_I(vma->vm_file->f_dentry->d_inode);
>       struct sock *sk = sock->sk;
>       struct packet_sock *po;
> 
>       printk( "%s, sk=%p.\n", __func__, sk);
> 
>       if (!sk)
>               return;
> 
>       po = pkt_sk(sk);
>       if (!po)
>               return;
> 
>       down_write(&vma->vm_mm->mmap_sem);
>       clear_bit(PACKET_SOCKET_MAPPED, &po->flags);
>       up_write(&vma->vm_mm->mmap_sem);
> }
> 
> /* VMA callbacks installed on the mapping by packet_mmap(). */
> static struct vm_operations_struct packet_mmap_ops = {
>       .open           = packet_mm_open,
>       .close          = packet_mm_close,
> };
> 
> /*
>  * Micro-benchmark: time 1000 page remaps against 1000 memcpy() calls of
>  * 1500 bytes and print both durations in microseconds.
>  *
>  * Returns 0, or -ENOMEM if a copy buffer cannot be allocated.
>  *
>  * NOTE(review): the remap loop advances one page per iteration from
>  * vma->vm_start without checking vma->vm_end - confirm the mapping is at
>  * least 1000 pages long before enabling the call in packet_mmap().
>  */
> static int packet_mmap_test(struct socket *sock, struct vm_area_struct *vma)
> {
>       int i;
>       struct timeval tv1, tv2;
>       unsigned long start = vma->vm_start;
>       u8 *data1, *data2;
> 
>       do_gettimeofday(&tv1);
>       for (i = 0; i < 1000; i++) {
>               update_address(vma, start, __pa(PAGE_OFFSET) >> PAGE_SHIFT);
>               __flush_tlb();
> 
>               if (remap_pfn_range(vma, start,
>                                   __pa(PAGE_OFFSET) >> PAGE_SHIFT,
>                                   PAGE_SIZE,
>                                   vma->vm_page_prot))
>                       break;
> 
>               start += PAGE_SIZE;
>       }
>       do_gettimeofday(&tv2);
> 
>       printk("%s: 1000 remaps took %lu usec.\n", __func__,
>              (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);
> 
>       data1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
>       if (!data1)
>               return -ENOMEM;
>       data2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
>       if (!data2) {
>               /* Release the buffer that was obtained, not the failed one. */
>               kfree(data1);
>               return -ENOMEM;
>       }
> 
>       /* Only the copy time matters; the buffer contents are irrelevant. */
>       do_gettimeofday(&tv1);
>       for (i = 0; i < 1000; i++)
>               memcpy(data1, data2, 1500);
>       do_gettimeofday(&tv2);
> 
>       printk("%s: 1000 copyings took %lu usec.\n", __func__,
>              (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);
> 
>       kfree(data1);
>       kfree(data2);
> 
>       return 0;
> }
> 
> /*
>  * mmap() handler.
>  *
>  * Installs the VMA callbacks, derives po->budget from the mapping size
>  * (every page except the first), maps the control page at vma->vm_start,
>  * records the VMA, points the work item at this sock and marks the socket
>  * mapped.  Returns 0, or -EIO if the control page cannot be remapped.
>  */
> static int packet_mmap(struct file *file, struct socket *sock, struct 
> vm_area_struct *vma)
> {
>       struct sock *sk = sock->sk;
>       struct packet_sock *po = pkt_sk(sk);
>       unsigned long size = vma->vm_end - vma->vm_start;
>       int err = 0;
>       
>       vma->vm_ops = &packet_mmap_ops;
> 
>       /* Benchmark call is disabled, so err is still 0 here. */
>       //err = packet_mmap_test(sock, vma);
>       if (err)
>               return err;
> 
>       lock_sock(sk);
>       po->budget = (size - PAGE_SIZE) / PAGE_SIZE;
>       
>       update_address(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT);
>       __flush_tlb();
> 
>       SetPageReserved(virt_to_page(po->page));
>       if (remap_pfn_range(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT, 
> PAGE_SIZE, vma->vm_page_prot)) {
>               ClearPageReserved(virt_to_page(po->page));
>               err = -EIO;
>               goto err_out_unlock;
>       }
> 
>       po->vma = vma;
> 
>       release_sock(sk);
> 
>       /* 'w' is a single file-scope work item; it now carries this sock. */
>       INIT_WORK(&w, test_timer_func, sk);
>       
>       set_bit(PACKET_SOCKET_MAPPED, &po->flags);
> 
>       return 0;
> 
> err_out_unlock:
>       release_sock(sk);
>       return err;
> }
> 
> /*
>  * poll() handler: starts from datagram_poll() and additionally reports the
>  * socket readable while fewer skbs are parked on the free queue than
>  * packets have been seen in total.
>  */
> static unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wait)
> {
>       struct sock *sk = sock->sk;
>       struct packet_sock *po = pkt_sk(sk);
>       unsigned int events = datagram_poll(file, sock, wait);
>       int pending;
> 
>       /* Sample the counters under the receive-queue lock. */
>       spin_lock_bh(&sk->sk_receive_queue.lock);
>       pending = po->free_queued < po->total;
>       spin_unlock_bh(&sk->sk_receive_queue.lock);
> 
>       if (pending)
>               events |= POLLIN | POLLRDNORM;
> 
>       return events;
> }
> 
> /*
>  * Socket operations.  Only release, bind, poll, ioctl and mmap are
>  * implemented in this file; every other entry is a sock_no_* stub.
>  */
> static struct proto_ops packet_ops_spkt = {
>       .family         = PF_PACKET,
>       .owner          = THIS_MODULE,
>       .release        = packet_release,
>       .bind           = packet_bind,
>       .connect        = sock_no_connect,
>       .socketpair     = sock_no_socketpair,
>       .accept         = sock_no_accept,
>       .getname        = sock_no_getname,
>       .poll           = packet_poll,
>       .ioctl          = packet_ioctl,
>       .listen         = sock_no_listen,
>       .shutdown       = sock_no_shutdown,
>       .setsockopt     = sock_no_setsockopt,
>       .getsockopt     = sock_no_getsockopt,
>       .sendmsg        = sock_no_sendmsg,
>       .recvmsg        = sock_no_recvmsg,
>       .mmap           = packet_mmap,
>       .sendpage       = sock_no_sendpage,
> };
> 
> /* Address-family descriptor: PF_PACKET sockets are created by packet_create(). */
> static struct net_proto_family packet_family_ops = {
>       .family         = PF_PACKET,
>       .create         = packet_create,
>       .owner          = THIS_MODULE,
> };
> 
> /* Module exit: unregister the socket family, then the protocol. */
> static void __exit packet_exit(void)
> {
>       sock_unregister(PF_PACKET);
>       proto_unregister(&packet_proto);
> }
> 
> /*
>  * Module init: register the protocol, then the PF_PACKET socket family.
>  *
>  * Returns 0 on success or the error from whichever registration failed;
>  * the protocol registration is undone if the family cannot be registered.
>  */
> static int __init packet_init(void)
> {
>       int rc;
> 
>       rc = proto_register(&packet_proto, 0);
>       if (rc != 0)
>               return rc;
> 
>       rc = sock_register(&packet_family_ops);
>       if (rc != 0) {
>               /* Do not leave the proto registered behind a failed load. */
>               proto_unregister(&packet_proto);
>               return rc;
>       }
> 
>       printk("%s: initialized at %lu.\n", __func__, jiffies);
>       return 0;
> }
> 
> /* Module entry/exit hooks, license and PF_PACKET protocol alias. */
> module_init(packet_init);
> module_exit(packet_exit);
> MODULE_LICENSE("GPL");
> MODULE_ALIAS_NETPROTO(PF_PACKET);
> 
> af_tlb.h
> 
> /*
>  *    af_tlb.h
>  * 
>  * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
>  * All rights reserved.
>  * 
>  * This program is free software; you can redistribute it and/or modify
>  * it under the terms of the GNU General Public License as published by
>  * the Free Software Foundation; either version 2 of the License, or
>  * (at your option) any later version.
>  *
>  * This program is distributed in the hope that it will be useful,
>  * but WITHOUT ANY WARRANTY; without even the implied warranty of
>  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>  * GNU General Public License for more details.
>  *
>  * You should have received a copy of the GNU General Public License
>  * along with this program; if not, write to the Free Software
>  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
>  */
> 
> #ifndef __AF_TLB_H
> #define __AF_TLB_H
> 
> /* Bit numbers used in struct packet_shared.flags. */
> enum packet_shared_flags {
>       PACKET_MAPPED = 0,      /* set by af_tlb.c once a page is remapped for this entry */
> };
> 
> /*
>  * One control entry per mapped data page; af_tlb.c keeps an array of these
>  * in the control page.
>  * NOTE(review): 'long' has no fixed width - confirm kernel and userspace
>  * agree on this layout.
>  */
> struct packet_shared {
>       __u16                   offset;         /* in-page offset of skb->mac.raw */
>       __u16                   reserved;
>       int                     pos;            /* only printed by af_tlb.c */
>       long                    flags;          /* bits from enum packet_shared_flags */
> } __attribute__ ((packed));
> 
> #ifdef __KERNEL__
> 
> /* Bit numbers used in struct packet_sock.flags. */
> enum packet_flags {
>       PACKET_SOCKET_RUNNING = 0,      /* receive hook registered via dev_add_pack() */
>       PACKET_SOCKET_MAPPED,           /* set in packet_mmap(), cleared in packet_mm_close() */
> };
> 
> /* Per-socket state; pkt_sk() obtains it by casting a struct sock pointer. */
> struct packet_sock {
>       /* Must stay the first member: pkt_sk() is a plain pointer cast. */
>       struct sock             sk;
>       /* Receive hook registered with dev_add_pack(). */
>       struct packet_type      prot_hook;
>       /* Taken in packet_do_bind() while the hook is changed. */
>       spinlock_t              bind_lock;
>       
>       /* Bits from enum packet_flags. */
>       long                    flags;
>       /* Interface index of the bound device, 0 if none. */
>       int                     ifindex;
>       /* Protocol number given at create/bind time. */
>       unsigned short          num;
> 
>       /* Mapping recorded by packet_mmap(). */
>       struct vm_area_struct   *vma;
> 
>       /* Task that created the socket ('current' in packet_sock_init()). */
>       struct task_struct      *tsk;
> 
>       /* budget: number of data pages in the mapping; last: next slot to use. */
>       int                     budget, last;
>       /* Kernel address of the control page. */
>       unsigned long           page;
> 
>       /* skbs whose pages have been mapped and that await freeing. */
>       struct sk_buff_head     sk_free_queue;
>       /* Number of skbs on sk_free_queue. */
>       int                     free_queued;
> 
>       /* jiffies deadline; only consulted by code that is currently disabled. */
>       unsigned long           next_free;
> 
>       /* Statistics maintained by packet_rcv_spkt(). */
>       unsigned long           queued;
>       unsigned long           dropped;
>       unsigned long           total;
> };
> 
> #endif /* __KERNEL__ */
> 
> #endif /* __AF_TLB_H */
> 
> tlb_test.c
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <sys/mman.h>
> #include <sys/poll.h>
> 
> #include <stdio.h>
> #include <string.h>
> #include <stdlib.h>
> #include <errno.h>
> #include <unistd.h>
> 
> #include <netinet/in.h>
> #include <netinet/ip.h>
> #include <net/ethernet.h>
> 
> #include <linux/if_ether.h>
> #include <linux/types.h>
> 
> #include "af_tlb.h"
> 
> /* Page size assumed by this test program. */
> #define PAGE_SIZE     4096
> /* Length passed to mmap(): 17 pages. */
> static size_t mmap_size = 17*PAGE_SIZE;
> 
> /* Print a formatted message to stderr and flush it immediately. */
> #define ulog(f, a...) do { fprintf(stderr, f, ##a); fflush(stderr); } while (0)
> /* Expand an IPv4 address object into its four bytes, in memory order. */
> #define NIPQUAD(addr) \
>       ((unsigned char *)&(addr))[0], \
>       ((unsigned char *)&(addr))[1], \
>       ((unsigned char *)&(addr))[2], \
>       ((unsigned char *)&(addr))[3]
> 
> /* Set bit number 'bit' (0..31) in *f. */
> static __inline__ void set_bit(int bit, uint32_t *f)
> {
>       /* Shift an unsigned 1: shifting a signed 1 into bit 31 is undefined. */
>       *f |= (uint32_t)1 << bit;
> }
> 
> /* Clear bit number 'bit' (0..31) in *f. */
> static __inline__ void clear_bit(int bit, uint32_t *f)
> {
>       /* Build the mask from an unsigned 1 so that bit 31 is well defined. */
>       *f &= ~((uint32_t)1 << bit);
> }
> 
> /* Return 1 if bit number 'bit' (0..31) is set in *f, otherwise 0. */
> static __inline__ int test_bit(int bit, uint32_t *f)
> {
>       uint32_t mask = (uint32_t)1 << bit;
> 
>       return (*f & mask) != 0;
> }
> 
> /* Hex-dump 'size' bytes starting 'offset' bytes into the buffer at 'ptr'. */
> static void dump_data(void *ptr, __u16 offset, int size)
> {
>       /* Byte pointer: arithmetic on void * is a GNU extension, not ISO C. */
>       const unsigned char *data = (const unsigned char *)ptr + offset;
>       int i;
> 
>       ulog("%p: ", ptr);
>       for (i = 0; i < size; ++i)
>               ulog("%02x ", data[i]);
>       ulog("\n");
> }
> 
> /*
>  * Print the Ethernet header found 'offset' bytes into 'ptr' and, for
>  * TCP/UDP/ICMP over IP, the protocol name and both addresses; other
>  * payloads get a 16-byte hex dump instead.
>  *
>  * Returns -1 (printing nothing) for frames that are neither IP nor ARP,
>  * otherwise 0.
>  */
> static int dump_network(void *ptr, __u16 offset)
> {
>       struct ether_header *eth = ptr + offset;
>       unsigned short ether_type = ntohs(eth->ether_type);
>       struct iphdr *ip;
>       char *label;
>       int idx;
> 
>       //ulog("offset=%x: ", offset);
> 
>       if (!(ether_type == ETH_P_IP || ether_type == ETH_P_ARP)) {
>               //ulog("\n");
>               return -1;
>       }
> 
>       ulog("MAC: proto=%04x, src=", eth->ether_type);
> 
>       /* Every address byte but the last is followed by a ':'. */
>       idx = 0;
>       while (idx < ETH_ALEN - 1)
>               ulog("%02x:", eth->ether_shost[idx++]);
>       ulog("%02x, dst=", eth->ether_shost[ETH_ALEN - 1]);
> 
>       idx = 0;
>       while (idx < ETH_ALEN - 1)
>               ulog("%02x:", eth->ether_dhost[idx++]);
>       ulog("%02x. ", eth->ether_dhost[ETH_ALEN - 1]);
> 
>       /* Only ARP can reach this branch: dump the start of its payload. */
>       if (ether_type != ETH_P_IP) {
>               dump_data(ptr, offset + sizeof(*eth), 16);
>               return 0;
>       }
> 
>       ip = (struct iphdr *)(ptr + offset + sizeof(*eth));
> 
>       if (ip->protocol == IPPROTO_TCP) {
>               label = "TCP ";
>       } else if (ip->protocol == IPPROTO_UDP) {
>               label = "UDP ";
>       } else if (ip->protocol == IPPROTO_ICMP) {
>               label = "ICMP";
>       } else {
>               /* Unknown IP protocol: raw dump only. */
>               dump_data(ptr, offset + sizeof(*eth), 16);
>               return 0;
>       }
> 
>       ulog("%s: ", label);
>       ulog("%u.%u.%u.%u -> %u.%u.%u.%u.\n", NIPQUAD(ip->saddr), NIPQUAD(ip->daddr));
> 
>       return 0;
> }
> 
> /*
>  * Test program for the mmap()ed packet socket.
>  *
>  * Creates a PF_PACKET/SOCK_RAW socket, maps mmap_size bytes of it read-only
>  * and binds it to the device named by argv[1] ("eth0" when no argument is
>  * given). The first mapped page is treated as an array of
>  * struct packet_shared control entries, one per following data page. The
>  * program busy-polls those entries (it never sleeps in poll()) and calls
>  * dump_network() for every entry that has PACKET_MAPPED set and whose
>  * offset differs from the copy remembered from the previous pass.
>  *
>  * Returns 0 once more than 1000 entries have been handed to
>  * dump_network(), or a negative errno value if setup fails.
>  */
> int main(int argc, char *argv[])
> {
>       const char *ifname = (argc > 1) ? argv[1] : "eth0";
>       struct sockaddr sa;
>       size_t ifname_len;
>       int s, err, num, i, j;
>       socklen_t len = sizeof(sa);
>       void *mmap_ptr;
>       struct packet_shared *ps, *ops;
>       void *old_ps;
> 
>       /*
>        * Copy only the bytes the name really has (at most the size of
>        * sa_data) and leave the rest, sa_family included, zeroed.
>        * NOTE(review): sa_family stays 0 here; confirm whether the bind
>        * handler expects a specific family.
>        */
>       memset(&sa, 0, sizeof(sa));
>       ifname_len = strlen(ifname);
>       if (ifname_len > sizeof(sa.sa_data))
>               ifname_len = sizeof(sa.sa_data);
>       memcpy(sa.sa_data, ifname, ifname_len);
> 
>       /* Private copy of the control page as seen on the previous pass. */
>       old_ps = malloc(PAGE_SIZE);
>       if (!old_ps) {
>               ulog("Failed to allocate backup packet shared page.\n");
>               return -ENOMEM;
>       }
> 
>       memset(old_ps, 0, PAGE_SIZE);
> 
>       s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
>       if (s == -1) {
>               /* Save errno first: the logging calls may overwrite it. */
>               err = -errno;
>               ulog("Failed to create PF_PACKET socket: %s [%d].\n", 
>                               strerror(-err), -err);
>               goto err_out_free_old_ps;
>       }
> 
>       mmap_ptr = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, s, 0);
>       if (mmap_ptr == MAP_FAILED) {
>               err = -errno;
>               ulog("Failed to map socket %d: %s [%d].\n",
>                               s, strerror(-err), -err);
>               goto err_out_close;
>       }
> 
>       if (bind(s, &sa, len) == -1) {
>               err = -errno;
>               /* Log ifname: sa.sa_data may be unterminated when full. */
>               ulog("Failed to bind socket %d to device %s: %s [%d].\n",
>                               s, ifname, strerror(-err), -err);
>               goto err_out_unmap;
>       }
> 
>       /* Every page after the first (control) page is a data page. */
>       num = (mmap_size - PAGE_SIZE) / PAGE_SIZE;
> 
>       j = 0;
>       while (1) {
>               ps = (struct packet_shared *)mmap_ptr;
>               ops = (struct packet_shared *)old_ps;
> 
>               for (i=0; i<num; ++i) {
>                       void *ptr = mmap_ptr + PAGE_SIZE*(i+1);
> 
>                       if (test_bit(PACKET_MAPPED, &ps->flags) &&
>                                       ps->offset != ops->offset) {
>                               /*
>                                * A -1 return only means the frame was
>                                * skipped, so it is not treated as an error.
>                                */
>                               dump_network(ptr, ps->offset);
>                               if (++j > 1000) {
>                                       err = 0;
>                                       goto err_out_unmap;
>                               }
>                       }
> 
>                       /* Remember this entry for the next pass. */
>                       *ops++ = *ps++;
>               }
>       }
> 
> err_out_unmap:
>       munmap(mmap_ptr, mmap_size);
> err_out_close:
>       close(s);
> err_out_free_old_ps:
>       free(old_ps);
> 
>       return err;
> }
> 
> Makefile.
> 
> # Kernel module built by kbuild from af_tlb.c.
> obj-m         := af_tlb.o
> 
> # Kernel build tree: the running kernel's by default; the commented-out
> # line below is an alternative location.
> KDIR  := /lib/modules/`uname -r`/build
> #KDIR := /usr/local/src/linux-2.6
> PWD   := $(shell pwd)
> # Compiler flags for the userspace test program only.
> UCFLAGS       := -W -Wall
> 
> # None of these targets creates a file with its own name; without .PHONY a
> # stray file called "default", "test" or "clean" would make the target look
> # up to date and its recipe would be skipped.
> .PHONY: default test clean
> 
> # Build the module against the kernel tree in KDIR.
> default:
>       $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules
> 
> # Build the userspace test program.
> test:
>       gcc $(UCFLAGS) tlb_test.c -o tlb_test
> 
> # Remove kbuild output and editor backup files.
> clean:
>       $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) clean
>       @rm -f *~
> 
> -- 
>       Evgeniy Polyakov
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
        Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to