Hello, developers.

This cruft now works much better.
Unfortunately I had to add some scary PTE hacks - you can find them in
update_address().
One big problem is that this module cannot be unloaded if the application
does not close the socket - the socket is only removed after the mapping is
destroyed, so I need to grab an MM reference, but cannot drop it.
Also it uses flush_tlb() all over the place, because it is the only macro
that can be used in modules - tlb_flush_page() and tlb_flush_one() are not
exported. It also has a race on startup, when only one page (the control
page) is mapped, but the (very simple) userspace may already want to access
the data pages.
The control page contains a set of control structures, one per mapped page
(i.e. per mapped skb); each control structure holds the offset of
skb->mac.raw within the page and a flags field.

I would be glad to hear your comments.
Thanks.

Included files: 
af_tlb.[ch] - zero-copy sniffer implementation.
tlb_test.c - simple userspace sniffer.

af_tlb.c
/*
 *      af_tlb.c
 * 
 * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
 * All rights reserved.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

 
#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/workqueue.h>

#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/fs.h>
#include <linux/shm.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>

#include "af_tlb.h"

/* Interval in milliseconds, fed to msecs_to_jiffies() when computing po->next_free. */
static unsigned int free_timeout = 10;
module_param(free_timeout, uint, 0);

/*
 * Single module-wide work item. It is declared with NULL data here and
 * re-initialised with the mapped socket by packet_mmap().
 */
static void test_timer_func(void *data);
static DECLARE_WORK(w, test_timer_func, NULL);

static void packet_free_skbs(struct packet_sock *po, int clear_last);

/*
 *      View a generic sock as the packet_sock it was allocated as.
 *      This is a plain pointer cast; no checking is done.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}

/*
 *      Socket destructor: traps on leftover receive/send memory
 *      accounting and complains if the socket is not marked dead.
 */
static void packet_sock_destruct(struct sock *sk)
{
        BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
        BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

        if (sock_flag(sk, SOCK_DEAD))
                return;

        printk("Attempt to release alive packet socket: %p\n", sk);
}


static struct proto_ops packet_ops_spkt;

/*
 *      Debug helper: print the sharing state, length and Ethernet
 *      header (protocol, source and destination MAC) of an skb.
 */
static void dump_skb(struct sk_buff *skb)
{
        struct ethhdr *hdr;
        int idx;

        printk("shared=%d, cloned=%d, len=%4d: ",
                        skb_shared(skb), skb_cloned(skb), skb->len);

        hdr = eth_hdr(skb);

        printk("MAC: proto=%04x, src=", hdr->h_proto);

        /* All but the last byte are followed by ':'. */
        idx = 0;
        while (idx < ETH_ALEN-1) {
                printk("%02x:", hdr->h_source[idx]);
                idx++;
        }
        printk("%02x, dst=", hdr->h_source[ETH_ALEN-1]);

        idx = 0;
        while (idx < ETH_ALEN-1) {
                printk("%02x:", hdr->h_dest[idx]);
                idx++;
        }
        printk("%02x.\n", hdr->h_dest[ETH_ALEN-1]);
}

/*
 *      Receive hook installed in po->prot_hook.
 *
 *      Counts the frame, drops loopback frames, un-shares the skb,
 *      pushes the link-level header back so that skb->data equals
 *      skb->mac.raw, fills the sockaddr_pkt kept in skb->cb and queues
 *      the skb on the socket. If the socket is mmap()ed the remapping
 *      work item is scheduled. Always returns 0.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt)
{
        struct sock *sk = pt->af_packet_priv;
        struct packet_sock *po = pkt_sk(sk);
        struct sockaddr_pkt *spkt;
        int err;

        po->total++;

        if (skb->pkt_type == PACKET_LOOPBACK) {
                kfree_skb(skb);
                return 0;
        }

        /* On failure there is no skb left for us to free. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                return 0;

        /* drop any routing info */
        dst_release(skb->dst);
        skb->dst = NULL;

        spkt = (struct sockaddr_pkt *)skb->cb;

        /*
         *      For frames whose header was pulled this restores it;
         *      when skb->data == skb->mac.raw it is a no-op.
         */
        skb_push(skb, skb->data - skb->mac.raw);

        /* The SOCK_PACKET socket receives _all_ frames. */
        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        err = sock_queue_rcv_skb(sk, skb);
        if (err)
                po->dropped++;
        else
                po->queued++;

        if (test_bit(PACKET_SOCKET_MAPPED, &po->flags))
                schedule_work(&w);

        /* The skb is still ours only when queueing failed. */
        if (err)
                kfree_skb(skb);

        return 0;
}


/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

/*
 *      Release callback for the socket.
 *
 *      Unhooks the protocol handler if it is installed, orphans the
 *      socket, waits for the work item, purges both skb queues, frees
 *      the control page and drops the socket and mm references.
 *      Returns 0.
 */
static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct mm_struct *mm;

        if (!sk)
                return 0;

        po = pkt_sk(sk);

        sk_del_node_init(sk);

        if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
                dev_remove_pack(&po->prot_hook);
                clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
                __sock_put(sk);
        }

        sock_orphan(sk);
        sock->sk = NULL;

        printk("%s: Waiting to workqueue.\n", __func__);

        cancel_delayed_work(&w);
        flush_scheduled_work();

        skb_queue_purge(&sk->sk_receive_queue);
        skb_queue_purge(&po->sk_free_queue);

        printk("%s: releasing page.\n", __func__);

        /*
         * po is the same object as sk, so anything still needed from it
         * must be read before sock_put(), which may drop the last
         * reference and free it.
         * NOTE(review): po->tsk is not reference-counted here; confirm
         * the task cannot be gone by the time release runs.
         */
        mm = po->tsk->mm;

        free_page(po->page);
        sock_put(sk);

        if (mm)
                mmput(mm);

        return 0;
}

/*
 *      Attach a packet hook.
 */

/*
 *      (Re)attach the socket's protocol hook.
 *
 *      Any hook that is currently running is removed first. The new
 *      protocol and device are recorded, and unless protocol is 0 the
 *      hook is installed - either unconditionally (no device given) or
 *      only when the device is up; a down device is reported through
 *      sk_err as ENETDOWN. Always returns 0.
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
        struct packet_sock *po = pkt_sk(sk);

        lock_sock(sk);
        spin_lock(&po->bind_lock);

        if (test_bit(PACKET_SOCKET_RUNNING, &po->flags)) {
                __sock_put(sk);
                clear_bit(PACKET_SOCKET_RUNNING, &po->flags);
                po->num = 0;
                /* dev_remove_pack() is called with bind_lock dropped. */
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        if (dev)
                po->ifindex = dev->ifindex;
        else
                po->ifindex = 0;

        if (protocol != 0) {
                if (!dev || (dev->flags & IFF_UP)) {
                        dev_add_pack(&po->prot_hook);
                        sock_hold(sk);
                        set_bit(PACKET_SOCKET_RUNNING, &po->flags);
                } else {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk->sk_error_report(sk);
                }
        }

        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind the socket to the device named in uaddr->sa_data.
 *
 *      Returns -EINVAL when addr_len is not sizeof(struct sockaddr),
 *      -ENODEV when no such device exists, otherwise the result of
 *      packet_do_bind().
 */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        char name[sizeof(uaddr->sa_data) + 1];
        struct net_device *dev;
        int err = -ENODEV;

        /* Validate the length before touching the address contents. */
        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;

        /*
         * sa_data need not be NUL-terminated, so copy a bounded number
         * of bytes and terminate explicitly instead of letting a string
         * function scan past the end of the field.
         */
        memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
        name[sizeof(uaddr->sa_data)] = '\0';

        printk( "%s: name=%s.\n", __func__, name);

        dev = dev_get_by_name(name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long 
arg)
{
        switch(cmd) {
                default:
                        return dev_ioctl(cmd, (void __user *)arg);
        }
        return 0;
}

/*
 *      Protocol descriptor passed to proto_register() and sk_alloc();
 *      obj_size makes every allocated sock a full struct packet_sock.
 */
static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

static int packet_sock_init(struct packet_sock *po, int protocol, struct sock 
*sk)
{
        skb_queue_head_init(&po->sk_free_queue);
        
        po->last        = 0;
        po->total       = 0;
        po->dropped     = 0;
        po->queued      = 0;
        po->flags       = 0;
        po->budget      = 1;
        po->next_free   = jiffies + msecs_to_jiffies(free_timeout);

        spin_lock_init(&po->bind_lock);

        po->tsk = current;
        
        po->page = __get_free_page(GFP_KERNEL);
        if (!po->page)
                return -ENOMEM;
        
        memset((void *)po->page, 0, PAGE_SIZE);
        
        po->num = protocol;
        po->prot_hook.func = packet_rcv_spkt;
        po->prot_hook.af_packet_priv = sk;
        
        get_task_mm(po->tsk);

        return 0;
}

/*
 *      Create callback of the protocol family.
 *
 *      Requires CAP_NET_RAW and a SOCK_DGRAM, SOCK_RAW or SOCK_PACKET
 *      socket type. Allocates and initialises the sock, and installs
 *      the receive hook immediately when a protocol is given.
 *
 *      Returns 0 on success, -EPERM, -ESOCKTNOSUPPORT, -ENOBUFS or the
 *      error from packet_sock_init().
 */
static int packet_create(struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
        if (sk == NULL)
                goto err_out_exit;

        sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        sk->sk_destruct = packet_sock_destruct;

        err = packet_sock_init(po, protocol, sk);
        if (err)
                goto err_out_sock_free;

        if (protocol) {
                po->prot_hook.type = protocol;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                set_bit(PACKET_SOCKET_RUNNING, &po->flags);
        }

        return 0;

err_out_sock_free:
        /*
         * sock->ops is already installed, so the caller may still invoke
         * packet_release() on this socket. Detach the sock first so that
         * release sees sock->sk == NULL instead of freed memory.
         * NOTE(review): assumes sock_init_data() stored sk in sock->sk.
         */
        sock->sk = NULL;
        sk_free(sk);
err_out_exit:
        return err;
}

/*
 *      Find the control-page slot whose recorded offset equals the
 *      in-page offset of skb->mac.raw.
 *
 *      Only the first po->budget slots are searched. Returns the
 *      matching slot, or NULL when the search reaches po->budget
 *      without a match.
 */
static struct packet_shared *packet_find_shared_lazy(struct packet_sock *po,
                struct sk_buff *skb)
{
        struct packet_shared *slot = (struct packet_shared *)po->page;
        u16 wanted = offset_in_page(skb->mac.raw);
        int idx;

        for (idx = 0; idx < po->budget; idx++, slot++) {
                if (slot->offset == wanted)
                        return slot;
        }

        return (idx == po->budget) ? NULL : slot;
}

/*
 *      Release skbs parked on po->sk_free_queue.
 *
 *      The loop runs while the free queue is non-empty and holds more
 *      than po->budget entries, or while clear_last is still positive;
 *      it stops early when the queue turns out to be empty.
 *
 *      For every dequeued skb the matching control-page slot (looked up
 *      by page offset) gets its PACKET_MAPPED bit cleared, the reference
 *      on the page holding skb->mac.raw is dropped, and the skb is freed.
 *
 *      @po:            socket whose free queue is drained
 *      @clear_last:    number of skbs to release regardless of budget
 */
static void packet_free_skbs(struct packet_sock *po, int clear_last)
{
        struct sk_buff *skb;
        int num = 0;
        //struct sock *sk = po->prot_hook.af_packet_priv;
        struct packet_shared *ps;
        struct page *page;
        
        /* NOTE(review): po->free_queued is read here without the queue lock. */
        while ((!skb_queue_empty(&po->sk_free_queue) && po->free_queued > 
po->budget) || clear_last > 0) {
                spin_lock_bh(&po->sk_free_queue.lock);
                skb = __skb_dequeue(&po->sk_free_queue);
                if (skb)
                        po->free_queued--;
                spin_unlock_bh(&po->sk_free_queue.lock);

                if (!skb)
                        break;
                
                ps = packet_find_shared_lazy(po, skb);
                if (ps) {
                        /* Log slots that were expected to be mapped but are not. */
                        if (!test_bit(PACKET_MAPPED, &ps->flags))
                                printk("%s: pos=%d, offset=%04x, 
flags=%08lx.\n", __func__, ps->pos, ps->offset, ps->flags);
                        clear_bit(PACKET_MAPPED, &ps->flags);
                }

                page = virt_to_page(skb->mac.raw);

                /* Drops the reference test_timer_func() took with get_page(). */
                put_page(page);
                /* NOTE(review): page is inspected after our reference was dropped - confirm this is safe. */
                if (!page_count(page)) {
                        ClearPageReserved(page);
                }
        
                kfree_skb(skb);
                num++;
                clear_last--;
        }
#if 0
                printk("%s: freed=%d, free_queued=%d, qeued=%d [rmem=%d, 
max=%d], budget=%d, queued=%lu, dropped=%lu, total=%lu.\n", 
                                __func__, num, po->free_queued, 
                                skb_queue_len(&sk->sk_receive_queue), 
                                atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, 
po->budget, po->queued, po->dropped, po->total);
#endif
}

/*
 *      Walk the page tables of vma->vm_mm down to the PTE for addr.
 *
 *      Prints the walk result and returns the PTE obtained with
 *      pte_offset_map(); the caller is expected to pte_unmap() it.
 *
 *      NOTE(review): pgd/pud are not checked for being empty, and when
 *      the pmd is none the code only decrements nr_ptes and then still
 *      calls pte_offset_map() on it - confirm this is intended.
 */
static inline pte_t *get_pte(struct vm_area_struct *vma, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset(vma->vm_mm, addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);

        if (pmd_none(*pmd))
                vma->vm_mm->nr_ptes--;

        pte = pte_offset_map(pmd, addr);

        printk("%s: addr=%08lx, pte=%p, %08lx, pmd=%p, pud=%p, pgd=%p, 
nr_pte=%ld.\n", 
                        __func__, addr, pte, pte_val(*pte), pmd, pud, pgd, 
vma->vm_mm->nr_ptes);
        

        return pte;
}

/*
 *      Clear the PTE that currently maps addr in vma, so that the
 *      address can be remapped afterwards.
 *
 *      @vma:   mapping that contains addr
 *      @addr:  user virtual address whose PTE is cleared
 *      @pfn:   page frame about to be mapped there; only used for the
 *              debug printout
 *
 *      The caller is responsible for flushing the TLB.
 */
static inline void update_address(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn)
{
        pte_t *pte;
        struct page *page;

        pte = get_pte(vma, addr);
        page = pfn_to_page(pfn);

        printk("%s: pfn=%08lx, valid=%d, page=%p, res=%d, mapcount=%d.\n",
                        __func__, pfn, pfn_valid(pfn), page,
                        PageReserved(page), page_mapcount(page));

        /* The owning mm lives in vma->vm_mm, as used by get_pte(). */
        pte_clear(vma->vm_mm, addr, pte);
        pte_unmap(pte);
}

/*
 *      Work handler that maps received skbs into the user mapping.
 *
 *      @data:  the struct sock registered by packet_mmap()
 *
 *      Does nothing unless the socket is both running and mapped.
 *      With the owning mm's mmap_sem held for writing, it dequeues up to
 *      po->budget skbs from the receive queue. For each skb the page
 *      holding skb->mac.raw is remapped at data-page slot po->last of
 *      the vma, the matching control-page entry gets the in-page offset
 *      and the PACKET_MAPPED bit, and the skb is parked on
 *      po->sk_free_queue until its slot is reused.
 */
static void test_timer_func(void *data)
{
        struct sock *sk = (struct sock *)data;
        struct packet_sock *po;
        struct packet_shared *ps;
        struct sk_buff *skb;
        unsigned long virt, start;
        int num = 0;

        if (!sk)
                return;

        po = pkt_sk(sk);
        if (!po || !po->tsk || !po->tsk->mm || !test_bit(PACKET_SOCKET_RUNNING, 
&po->flags) || !test_bit(PACKET_SOCKET_MAPPED, &po->flags))
                return;

        down_write(&po->tsk->mm->mmap_sem);
#if 1
        printk("%s: free_queued=%d, qeued=%d [rmem=%d, max=%d], budget=%d, 
queued=%lu, dropped=%lu, total=%lu.\n", 
                        __func__, po->free_queued, 
                        skb_queue_len(&sk->sk_receive_queue), 
                        atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, 
                        po->budget, po->queued, po->dropped, po->total);
#endif
        while (++num <= po->budget && (skb = 
skb_dequeue(&sk->sk_receive_queue))) {
                virt = (unsigned long)skb->mac.raw;
                if (!virt)
                        goto out;
        
                /* Data page slots start one page after the control page. */
                start = po->vma->vm_start + PAGE_SIZE*(1+po->last);
                ps = &((struct packet_shared *)po->page)[po->last];

                printk("s=%08lx, p=%p, pos=%d, offset=%04x, flags=%08lx.\n", 
start, virt_to_page(virt), ps->pos, ps->offset, ps->flags);
                /* Disabled verbose per-packet dump. */
                if (0) {
                        //int i;
                
                        printk("offset=%4lx, num=%2d, last=%2d, users=%1d, 
dataref=%1d: ", 
                                        offset_in_page(virt), num, po->last, 
                                        atomic_read(&skb->users), 
atomic_read(&skb_shinfo(skb)->dataref));
                        dump_skb(skb);
#if 0
                        for (i=0; i<32; ++i)
                                printk("%02x ", ((unsigned char *)virt)[i]);
                        printk("\n");
#endif
                }

                /*
                 * This actually should not be flush_tlb(), 
                 * but it is the only one call that can be used in modules.
                 * --zbr
                 */
                update_address(po->vma, start, __pa(virt) >> PAGE_SHIFT);
                __flush_tlb();
                
                /* Page reference taken here is dropped in packet_free_skbs(). */
                SetPageReserved(virt_to_page(virt));
                get_page(virt_to_page(virt));
                if (remap_pfn_range(po->vma, start, __pa(virt) >> PAGE_SHIFT, 
PAGE_SIZE, po->vma->vm_page_prot)) {
                        printk("Remapping error.\n");
                        ClearPageReserved(virt_to_page(virt));
                        goto out;
                }

                flush_dcache_page(virt_to_page(virt));

                /* Slot is being reused: release one previously parked skb. */
                if (test_bit(PACKET_MAPPED, &ps->flags))
                        packet_free_skbs(po, 1);

                ps->offset = offset_in_page(virt);
                set_bit(PACKET_MAPPED, &ps->flags);

                /* Advance the slot index, wrapping at po->budget. */
                if (++po->last == po->budget)
                        po->last = 0;

                /*
                 * Debug dump of every PTE in the vma.
                 * NOTE(review): the PTEs returned by get_pte() are not
                 * pte_unmap()ed in this loop.
                 */
                {
                        start = po->vma->vm_start;

                        while (start < po->vma->vm_end) {
                                pte_t *pte = get_pte(po->vma, start);

                                if (pte_present(*pte)) {
                                        struct page *page = NULL;
                                        unsigned long pfn = pte_pfn(*pte);
                                        if (pfn_valid(pfn)) {
                                                page = pfn_to_page(pfn);

                                                printk("s=%08lx, p=%p, r=%d, 
m=%d, pfn=%08lx.\n", 
                                                                start, page, 
PageReserved(page), page_mapcount(page), pfn);
                                        } else
                                                printk("p=NULL, pfn=%08lx.\n", 
pfn);

                                } else {
                                        printk("pte=%p is not present.\n", pte);
                                }

                                start += PAGE_SIZE;
                        }
                }

out:
                /*
                 * Actually there should be some smart algorithm here which
                 * defers freeing the skb until userspace has "read" it.
                 * Userspace would have to provide some kind of callback,
                 * which requires write permission to the area, so the area
                 * would have to be split. Or, better, just free the skb
                 * after some timeout; say 100 msec should be enough.
                 * --zbr
                 *
                 * The trick is to place skbs on a new list which is
                 * traversed at some interval, where skbs are unlinked and
                 * freed. Actually there is no need to lock this queue
                 * against freeing, since freeing happens synchronously, but
                 * if freeing ever becomes separate nothing has to change.
                 * --zbr
                 */

                /* Also reached via "goto out": the skb is parked in every case. */
                spin_lock_bh(&po->sk_free_queue.lock);
                po->free_queued++;
                __skb_queue_tail(&po->sk_free_queue, skb);
                spin_unlock_bh(&po->sk_free_queue.lock);
        }
#if 0
        if (time_after(jiffies, po->next_free)) {
                po->next_free = jiffies + msecs_to_jiffies(free_timeout);
                packet_free_skbs(po, 0);
        }
#endif
        printk("%s: UP: po->tsk->mm=%p.\n", __func__, po->tsk->mm);
        up_write(&po->tsk->mm->mmap_sem);

        printk("%s finished.\n", __func__);
}

/*
 *      vm_ops open callback: only logs the sock behind the mapped file.
 */
static void packet_mm_open(struct vm_area_struct *vma)
{
        struct socket *sock = SOCKET_I(vma->vm_file->f_dentry->d_inode);

        printk( "%s, sk=%p.\n", __func__, sock->sk);
}

/*
 *      vm_ops close callback: logs the sock and, when there is one,
 *      clears PACKET_SOCKET_MAPPED while holding the vma's mmap_sem
 *      for writing.
 */
static void packet_mm_close(struct vm_area_struct *vma)
{
        struct socket *sock = SOCKET_I(vma->vm_file->f_dentry->d_inode);
        struct sock *sk = sock->sk;
        struct packet_sock *po;

        printk( "%s, sk=%p.\n", __func__, sk);

        if (!sk)
                return;

        po = pkt_sk(sk);
        if (!po)
                return;

        down_write(&vma->vm_mm->mmap_sem);
        clear_bit(PACKET_SOCKET_MAPPED, &po->flags);
        up_write(&vma->vm_mm->mmap_sem);
}

/* VMA callbacks installed on the mapping by packet_mmap(). */
static struct vm_operations_struct packet_mmap_ops = {
        .open           = packet_mm_open,
        .close          = packet_mm_close,
};

/*
 *      Micro-benchmark: time up to 1000 single-page remaps into vma
 *      against 1000 memcpy() calls of 1500 bytes and print both results.
 *
 *      Returns 0, or -ENOMEM when a scratch buffer cannot be allocated.
 *      The only call to this function in packet_mmap() is commented out.
 */
static int packet_mmap_test(struct socket *sock, struct vm_area_struct *vma)
{
        int i;
        int err = 0;
        struct timeval tv1, tv2;
        unsigned long start = vma->vm_start;
        u8 *data1, *data2;

        do_gettimeofday(&tv1);
        for (i=0; i<1000; i++) {

                update_address(vma, start, __pa(PAGE_OFFSET) >> PAGE_SHIFT);
                __flush_tlb();

                if (remap_pfn_range(vma, start,
                                     __pa(PAGE_OFFSET) >> PAGE_SHIFT,
                                     PAGE_SIZE,
                                     vma->vm_page_prot))
                        break;

                start += PAGE_SIZE;
        }
        do_gettimeofday(&tv2);

        printk("%s: 1000 remaps took %lu usec.\n", __func__,
                        (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);

        data1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!data1)
                return -ENOMEM;

        data2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!data2) {
                /* Release the buffer that was allocated, not the one that failed. */
                err = -ENOMEM;
                goto out_free_data1;
        }

        /* The buffer contents are irrelevant; only the copy time is measured. */
        do_gettimeofday(&tv1);
        for (i=0; i<1000; i++) {
                memcpy(data1, data2, 1500);
        }
        do_gettimeofday(&tv2);

        printk("%s: 1000 copyings took %lu usec.\n", __func__,
                        (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);

        kfree(data2);
out_free_data1:
        kfree(data1);

        return err;
}

static int packet_mmap(struct file *file, struct socket *sock, struct 
vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long size = vma->vm_end - vma->vm_start;
        int err = 0;
        
        vma->vm_ops = &packet_mmap_ops;

        //err = packet_mmap_test(sock, vma);
        if (err)
                return err;

        lock_sock(sk);
        po->budget = (size - PAGE_SIZE) / PAGE_SIZE;
        
        update_address(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT);
        __flush_tlb();

        SetPageReserved(virt_to_page(po->page));
        if (remap_pfn_range(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT, 
PAGE_SIZE, vma->vm_page_prot)) {
                ClearPageReserved(virt_to_page(po->page));
                err = -EIO;
                goto err_out_unlock;
        }

        po->vma = vma;

        release_sock(sk);

        INIT_WORK(&w, test_timer_func, sk);
        
        set_bit(PACKET_SOCKET_MAPPED, &po->flags);

        return 0;

err_out_unlock:
        release_sock(sk);
        return err;
}

/*
 *      poll callback: start from datagram_poll() and additionally
 *      report the socket readable while po->free_queued is below
 *      po->total (sampled under the receive-queue lock).
 */
static unsigned int packet_poll(struct file * file, struct socket *sock,
                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned int events = datagram_poll(file, sock, wait);
        int readable;

        spin_lock_bh(&sk->sk_receive_queue.lock);
        readable = po->free_queued < po->total;
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        if (readable)
                events |= POLLIN | POLLRDNORM;

        return events;
}

/*
 *      Socket operations for this family. Only release, bind, poll,
 *      ioctl and mmap are implemented here; every other entry uses the
 *      generic sock_no_*() stub.
 */
static struct proto_ops packet_ops_spkt = {
        .family         = PF_PACKET,
        .owner          = THIS_MODULE,
        .release        = packet_release,
        .bind           = packet_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .poll           = packet_poll,
        .ioctl          = packet_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .sendmsg        = sock_no_sendmsg,
        .recvmsg        = sock_no_recvmsg,
        .mmap           = packet_mmap,
        .sendpage       = sock_no_sendpage,
};

/* Family descriptor handed to sock_register(); it claims PF_PACKET. */
static struct net_proto_family packet_family_ops = {
        .family         = PF_PACKET,
        .create         = packet_create,
        .owner          = THIS_MODULE,
};

/* Module exit: unregister the PF_PACKET family, then the protocol. */
static void __exit packet_exit(void)
{
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}

/*
 *      Module init: register the protocol and then the socket family.
 *
 *      Returns 0 on success or the error from proto_register() /
 *      sock_register(). The protocol is unregistered again when the
 *      family cannot be registered (for example because PF_PACKET is
 *      already taken), so a failed load leaves nothing behind.
 */
static int __init packet_init(void)
{
        int rc;

        rc = proto_register(&packet_proto, 0);
        if (rc != 0)
                return rc;

        rc = sock_register(&packet_family_ops);
        if (rc != 0) {
                proto_unregister(&packet_proto);
                return rc;
        }

        printk("%s: initialized at %lu.\n", __func__, jiffies);

        return 0;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);

af_tlb.h

/*
 *      af_tlb.h
 * 
 * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
 * All rights reserved.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef __AF_TLB_H
#define __AF_TLB_H

/* Bit numbers used in struct packet_shared.flags. */
enum packet_shared_flags {
        PACKET_MAPPED = 0,      /* set once the slot's data page has been mapped */
};

/*
 * One control-page entry per data page of the mapping; shared between
 * the kernel module and userspace.
 *
 * NOTE(review): 'flags' is a long, so the layout differs between 32-bit
 * and 64-bit builds; the userspace test reads it through a uint32_t
 * helper - confirm kernel and userspace agree on the width.
 */
struct packet_shared {
        __u16                   offset;         /* offset of skb->mac.raw inside the data page */
        __u16                   reserved;
        int                     pos;            /* only printed by the module, never written there */
        long                    flags;          /* bits from enum packet_shared_flags */
} __attribute__ ((packed));

#ifdef __KERNEL__

/* Bit numbers used in struct packet_sock.flags. */
enum packet_flags {
        PACKET_SOCKET_RUNNING = 0,      /* prot_hook is installed via dev_add_pack() */
        PACKET_SOCKET_MAPPED,           /* set by packet_mmap(), cleared by packet_mm_close() */
};

/*
 * Per-socket state. 'sk' must stay the first member because pkt_sk()
 * casts a struct sock pointer directly to this type.
 */
struct packet_sock {
        struct sock             sk;
        struct packet_type      prot_hook;      /* receive hook handed to dev_add_pack() */
        spinlock_t              bind_lock;      /* taken in packet_do_bind() */
        
        long                    flags;          /* bits from enum packet_flags */
        int                     ifindex;        /* index of the bound device, 0 if none */
        unsigned short          num;            /* protocol number */

        struct vm_area_struct   *vma;           /* user mapping, set in packet_mmap() */

        struct task_struct      *tsk;           /* task that created the socket */

        int                     budget, last;   /* number of data slots / next slot to fill */
        unsigned long           page;           /* kernel address of the control page */

        struct sk_buff_head     sk_free_queue;  /* skbs whose pages are currently mapped */
        int                     free_queued;    /* length of sk_free_queue */

        unsigned long           next_free;      /* jiffies deadline; only used in disabled code */

        unsigned long           queued;         /* frames queued to the socket */
        unsigned long           dropped;        /* frames that could not be queued */
        unsigned long           total;          /* frames seen by the receive hook */
};

#endif /* __KERNEL__ */

#endif /* __AF_TLB_H */

tlb_test.c
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <sys/poll.h>

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <net/ethernet.h>

#include <linux/if_ether.h>
#include <linux/types.h>

#include "af_tlb.h"

/* Page size assumed by this test - TODO confirm it matches the kernel's PAGE_SIZE. */
#define PAGE_SIZE       4096
/* One control page followed by 16 data pages. */
static size_t mmap_size = 17*PAGE_SIZE;

/*
 * printf-style logging to stderr, flushed immediately. The whole
 * definition is kept on one logical line so that the trailing
 * "while (0)" is part of the macro.
 */
#define ulog(f, a...)   do { fprintf(stderr, f, ##a); fflush(stderr); } while (0)
/*
 * Expand to the four bytes of addr, in memory order, as separate
 * printf arguments. addr must be an lvalue; it is parenthesised so
 * that any lvalue expression can be passed.
 */
#define NIPQUAD(addr) \
        ((unsigned char *)&(addr))[0], \
        ((unsigned char *)&(addr))[1], \
        ((unsigned char *)&(addr))[2], \
        ((unsigned char *)&(addr))[3]

/* Set bit number 'bit' (0..31) in *f. Not atomic. */
static __inline__ void set_bit(int bit, uint32_t *f)
{
        /* Unsigned constant: shifting a signed 1 into bit 31 is undefined. */
        *f |= (UINT32_C(1) << bit);
}

/* Clear bit number 'bit' (0..31) in *f. Not atomic. */
static __inline__ void clear_bit(int bit, uint32_t *f)
{
        /* Unsigned constant: shifting a signed 1 into bit 31 is undefined. */
        *f &= ~(UINT32_C(1) << bit);
}

/* Return 1 when bit number 'bit' (0..31) is set in *f, otherwise 0. */
static __inline__ int test_bit(int bit, uint32_t *f)
{
        uint32_t mask = UINT32_C(1) << bit;

        return (*f & mask) != 0;
}

/*
 * Hex-dump 'size' bytes starting 'offset' bytes into the buffer at
 * 'ptr'. The line is prefixed with the base pointer, not with the
 * address of the first dumped byte.
 */
static void dump_data(void *ptr, __u16 offset, int size)
{
        int i;
        /* Cast first: arithmetic on void * is a compiler extension. */
        unsigned char *data = (unsigned char *)ptr + offset;

        ulog("%p: ", ptr);
        for (i=0; i<size; ++i)
                ulog("%02x ", data[i]);
        ulog("\n");
}

/*
 * Decode and print the Ethernet frame found 'offset' bytes into the
 * page at 'ptr'.
 *
 * Only IP and ARP frames are printed; anything else returns -1 without
 * output. For IP frames carrying TCP, UDP or ICMP the source and
 * destination addresses are printed; for ARP and other IP protocols the
 * 16 bytes following the Ethernet header are hex-dumped. Returns 0 when
 * something was printed.
 *
 * NOTE(review): nothing here checks that the headers lie inside the
 * mapped page.
 */
static int dump_network(void *ptr, __u16 offset)
{
        /* Cast first: arithmetic on void * is a compiler extension. */
        unsigned char *frame = (unsigned char *)ptr + offset;
        struct ether_header *eth = (struct ether_header *)frame;
        struct iphdr *ip;
        const char *proto;
        int i;
        unsigned short ether_type;

        ether_type = ntohs(eth->ether_type);
        if (ether_type != ETH_P_IP && ether_type != ETH_P_ARP)
                return -1;

        /* The protocol field is printed as stored, i.e. in network byte order. */
        ulog("MAC: proto=%04x, src=", eth->ether_type);
        for (i=0; i<ETH_ALEN-1; ++i)
                ulog("%02x:", eth->ether_shost[i]);
        ulog("%02x, dst=", eth->ether_shost[ETH_ALEN-1]);
        for (i=0; i<ETH_ALEN-1; ++i)
                ulog("%02x:", eth->ether_dhost[i]);
        ulog("%02x. ", eth->ether_dhost[ETH_ALEN-1]);

        if (ether_type != ETH_P_IP) {
                dump_data(ptr, offset + sizeof(*eth), 16);
                return 0;
        }

        ip = (struct iphdr *)(frame + sizeof(*eth));

        switch (ip->protocol) {
                case IPPROTO_TCP:
                        proto = "TCP ";
                        break;
                case IPPROTO_UDP:
                        proto = "UDP ";
                        break;
                case IPPROTO_ICMP:
                        proto = "ICMP";
                        break;
                default:
                        /* Unknown transport: show the raw bytes instead of addresses. */
                        dump_data(ptr, offset + sizeof(*eth), 16);
                        return 0;
        }

        ulog("%s: ", proto);
        ulog("%u.%u.%u.%u -> %u.%u.%u.%u.\n", NIPQUAD(ip->saddr),
                        NIPQUAD(ip->daddr));

        return 0;
}

/*
 * Simple zero-copy sniffer.
 *
 * Usage: tlb_test [device]      (device defaults to "eth0")
 *
 * Creates the PF_PACKET socket, maps the control page plus data pages,
 * binds to the device and then busy-polls the control page. Every slot
 * that is marked PACKET_MAPPED and whose offset changed since the
 * previous pass is decoded with dump_network(). The program stops after
 * more than 1000 such slots have been seen.
 *
 * Returns 0 on normal termination, a negative errno value otherwise.
 */
int main(int argc, char *argv[])
{
        struct sockaddr sa;
        int s, err, num, i, j;
        socklen_t len = sizeof(sa);
        void *mmap_ptr;
        struct packet_shared *ps, *ops;
        void *old_ps;
        struct pollfd pfd;
        const char *ifname = (argc > 1) ? argv[1] : "eth0";

        /*
         * Copy the name with a bounded, NUL-terminating call: a
         * fixed-size memcpy() would read past the end of short names.
         */
        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_PACKET;
        snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", ifname);

        old_ps = malloc(PAGE_SIZE);
        if (!old_ps) {
                ulog("Failed to allocate backup packet shared page.\n");
                return -ENOMEM;
        }

        memset(old_ps, 0, PAGE_SIZE);

        s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        if (s == -1) {
                /* Save errno before logging, which may overwrite it. */
                err = -errno;
                ulog("Failed to create PF_PACKET socket: %s [%d].\n",
                                strerror(-err), -err);
                goto err_out_free_old_ps;
        }

        mmap_ptr = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, s, 0);
        if (mmap_ptr == MAP_FAILED) {
                err = -errno;
                ulog("Failed to map socket %d: %s [%d].\n", s, strerror(-err),
                                -err);
                goto err_out_close;
        }

        err = bind(s, &sa, len);
        if (err == -1) {
                err = -errno;
                ulog("Failed to bind socket %d to device %s: %s [%d].\n",
                                s, sa.sa_data, strerror(-err), -err);
                goto err_out_unmap;
        }

        pfd.fd = s;
        pfd.events = POLLIN;
        pfd.revents = 0;

        num = (mmap_size - PAGE_SIZE) / PAGE_SIZE;

        j = 0;
        while (1) {
                /*err = poll(&pfd, 1, -1);
                
                if ((err == 0 || err == -1) && (errno != EINTR)) {
                        err = -errno;
                        break;
                }*/

                ps = (struct packet_shared *)mmap_ptr;
                ops = (struct packet_shared *)old_ps;

                for (i=0; i<num; ++i) {
                        /* Data page i follows the control page. */
                        void *ptr = (char *)mmap_ptr + PAGE_SIZE*(i+1);
                        /*
                         * Copy the flags word into a uint32_t: the bit
                         * helpers take uint32_t *, while the shared
                         * field is a long inside a packed struct.
                         */
                        uint32_t flags = (uint32_t)ps->flags;

                        if (test_bit(PACKET_MAPPED, &flags) &&
                            ps->offset != ops->offset) {
                                (void)dump_network(ptr, ps->offset);
                                if (++j > 1000) {
                                        /* Packet limit reached: normal exit. */
                                        err = 0;
                                        goto err_out_unmap;
                                }
                        }

                        /* Remember this slot's state for the next pass. */
                        *ops++ = *ps++;
                }

                pfd.events = POLLIN;
                pfd.revents = 0;
        }

err_out_unmap:
        munmap(mmap_ptr, mmap_size);
err_out_close:
        close(s);
err_out_free_old_ps:
        free(old_ps);

        return err;
}

Makefile.

# Out-of-tree build of the af_tlb module plus the userspace test program.
# NOTE: recipe lines must begin with a TAB when this is saved as a real
# Makefile.
obj-m           := af_tlb.o

# Kernel build tree of the running kernel; alternative path kept below.
KDIR    := /lib/modules/`uname -r`/build
#KDIR   := /usr/local/src/linux-2.6
PWD     := $(shell pwd)
# Warning flags for the userspace test.
UCFLAGS := -W -Wall

# Build the kernel module.
default:
        $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules

# Build the userspace sniffer.
test:
        gcc $(UCFLAGS) tlb_test.c -o tlb_test

# Remove build products and editor backups.
clean:
        $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) clean
        @rm -f *~

-- 
        Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to