Extending sFlow's counter polling to export system performance metrics got me
thinking about ways to extend sampling into servers. This thinking was in part 
motivated by the following problem.

A large data center comprises a compute cluster and a large NAS array
(NFS/CIFS). If all the servers in the cluster start accessing the same file in
the NAS cluster, this can become a bottleneck, dramatically reducing throughput.
This type of problem can occur in a variety of data center services: HTTP, DNS,
memcache, etc.

Extending sFlow's sampling mechanism to application layer transactions provides 
a scalable way to monitor these types of distributed services. An sFlow agent 
embedded in the application would randomly sample completed transactions. This 
minimally impacts application performance, adding a counter decrement and test 
to the critical path. When a sample is taken, key transaction metrics are
captured (e.g., service_direction (client/server), type (e.g. read, write, get,
head, ...), status (succeeded, failed, error code), path (file path, url, lun),
bytes_in, bytes_out, duration). In addition to the transaction statistics, the
socket information associated with the request (remote ip, remote port, local 
ip, local port) is also captured and exported along with the transaction 
metrics. The socket information lets you link the application view to the 
network topology and traffic data that sFlow in the switches provides.

You could apply this instrumentation at either or both ends of a transaction.
Getting back to the NAS example, implementing sFlow transaction monitoring in 
the NFS/CIFS storage array would provide the information needed to identify 
hotspots in real-time. For large web farms it would be easy to implement 
transaction sampling in the logging module of the web servers (for example 
creating an Apache logging module that exported sampled web requests as sFlow). 
Layer 4-7 switches, proxies and caches could also sample and export layer 7
transactions.

The following sFlow structures are a first cut at defining layer 7 samples:

/* Application transaction sampling */
/* Note: Transactions are sampled upon completion */

/* Generic outcome of a sampled transaction; this is the type of the
   status field in the generic transaction record. */
enum status_value {
   succeeded = 0,
   generic_failure = 1,
   outofmemory = 2,
   timeout = 3,
   notpermitted = 4
}

/* Which end of the transaction the observation was made at; this is the
   type of the direction field in the generic transaction record. */
enum service_direction {
   client = 1,
   server = 2
}

/* Generic Application Transaction record */
/* Every Application Transaction sample must start with a generic transaction
   record */
/* opaque = flow_data; enterprise = 0; format = 2000 */
struct transaction {
   service_direction direction;    /* was this transaction observed by the
                                      server or the client */
   unsigned int wait;              /* time in microseconds that transaction was
                                      queued before processing started */
   unsigned int duration;          /* time in microseconds from start of
                                      processing to completion */
   status_value status;            /* status of transaction */
   unsigned hyper bytes_received;  /* bytes received */
   unsigned hyper bytes_send;      /* bytes sent
                                      NOTE(review): the name is "bytes_send"
                                      while its sibling is "bytes_received";
                                      "bytes_sent" may have been intended -
                                      confirm before the name is relied on */
}

/* Extended socket information,
   Must be filled in for all transactions associated with a network socket
   Omit if transaction associated with non-network IPC  */

/* IPv4 Socket */
/* opaque = flow_data; enterprise = 0; format = 2100 */
struct extended_socket_ipv4 {
   unsigned int protocol;     /* IP Protocol type
                                 (for example, TCP = 6, UDP = 17) */
   ip_v4 local_ip;            /* local IP address */
   ip_v4 remote_ip;           /* remote IP address */
   unsigned int local_port;   /* TCP/UDP local port number or equivalent */
   unsigned int remote_port;  /* TCP/UDP remote port number or equivalent */
}

/* IPv6 Socket */
/* opaque = flow_data; enterprise = 0; format = 2101 */
struct extended_socket_ipv6 {
   unsigned int protocol;     /* IP Protocol type
                                 (for example, TCP = 6, UDP = 17) */
   ip_v6 local_ip;            /* local IP address */
   ip_v6 remote_ip;           /* remote IP address */
   unsigned int local_port;   /* TCP/UDP local port number or equivalent */
   unsigned int remote_port;  /* TCP/UDP remote port number or equivalent */
}

/* Extended NFS transaction */
/* see RFC 3530 (NFS version 4 protocol) */
/* opaque = flow_data; enterprise = 0; format = 2001 */
struct extended_nfs_storage_transaction {
   opaque<> path;           /* canonical path to the file or directory
                               associated with the operation's file handle;
                               UTF8 encoded string */
   unsigned int operation;  /* NFS operation
                               NOTE(review): presumably the RFC 3530 operation
                               number - TODO confirm the encoding */
   unsigned int status;     /* NFS operation status - nfsstat4 */
}

/* Extended SCSI transaction */
/* opaque = flow_data; enterprise = 0; format = 2002 */
struct extended_scsi_storage_transaction {
   unsigned int lun;       /* logical unit number (LUN) */
   unsigned int operation; /* SCSI operation; use maxint to encode an
                              unknown operation */
   unsigned int status ;    /* SCSI status code reporting result of operation */
}

/* Extended Web transaction */
/* Describes one sampled HTTP request/response: the request-line, selected
   request header fields, the authenticated user and the response status. */
/* opaque = flow_data; enterprise = 0; format = 2003 */
struct extended_http_transaction {
   string<> url;        /* The HTTP request-line (see RFC 2616) */
   string<> host;       /* The host field from the HTTP header */
   string<> referer;    /* The referer field from the HTTP header */
   string<> useragent;  /* The user agent from the HTTP header */
   string<> user;       /* The authenticated user */
   unsigned int status; /* Status code returned with response */
}

There has been some dialog on sharing metrics with Ganglia on the Ganglia 
developers mailing list:
http://www.mail-archive.com/ganglia-develop...@lists.sourceforge.net/

The following is a first attempt at defining host performance metrics for sFlow:

/* Physical Server CPU */
/* opaque = counter_data; enterprise = 0; format = 2003 */

/* Load averages, process counts, CPU inventory and per-state CPU times
   for one host.
   NOTE(review): the (ms) time fields are 32-bit; if they are cumulative
   totals they wrap after roughly 49.7 days of accumulated time - confirm
   that collectors are expected to handle the wrap. */
struct host_cpu {
   float load_one;              /* 1 minute load avg. */
   float load_five;             /* 5 minute load avg. */
   float load_fifteen;          /* 15 minute load avg. */
   unsigned int proc_run;       /* total number of running processes */
   unsigned int proc_total;     /* total number of processes */
   unsigned int cpu_num;        /* number of CPUs */
   unsigned int cpu_speed;      /* speed in MHz of CPU */
   unsigned int uptime;         /* seconds since last reboot */
   unsigned int cpu_user;       /* user time (ms) */
   unsigned int cpu_nice;       /* nice time (ms) */
   unsigned int cpu_system;     /* system time (ms) */
   unsigned int cpu_idle;       /* idle time (ms) */
   unsigned int cpu_wio;        /* time waiting for I/O to complete (ms) */
   unsigned int cpu_intr;       /* time servicing interrupts (ms) */
   unsigned int cpu_sintr;      /* time servicing soft interrupts (ms) */
   unsigned int interrupts;     /* interrupt count */
   unsigned int context;        /* context switch count */
}

/* Physical Server Memory */
/* opaque = counter_data; enterprise = 0; format = 2004 */

/* Memory and swap sizes in kB, plus paging and swapping event counts.
   NOTE(review): sizes are 32-bit kB values, so the largest size that can
   be reported is just under 4 TiB - confirm this range is sufficient. */
struct host_memory {
    unsigned int mem_total;   /* total kB */
    unsigned int mem_free;    /* free kB */
    unsigned int mem_shared;  /* shared kB */
    unsigned int mem_buffers; /* buffers kB */
    unsigned int mem_cached;  /* cached kB */
    unsigned int swap_total;  /* swap total kB */
    unsigned int swap_free;   /* swap free kB */
    unsigned int page_in;     /* page in count */
    unsigned int page_out;    /* page out count */
    unsigned int swap_in;     /* swap in count */
    unsigned int swap_out;    /* swap out count */
}

/* Physical Server Disk I/O */
/* opaque = counter_data; enterprise = 0; format = 2005 */

/* Read and write activity: operation counts, merged operations, sectors
   transferred and time spent (ms). */
struct host_disk_io {
   unsigned int reads;           /* reads issued */
   unsigned int reads_merged;    /* reads merged */
   unsigned int sectors_read;    /* sectors read */
   unsigned int read_time;       /* read time (ms) */
   unsigned int writes;          /* writes completed */
   unsigned int writes_merged;   /* writes merged */
   unsigned int sectors_written; /* sectors written */
   unsigned int write_time;      /* write time (ms) */
}

/* Physical Server Network I/O */
/* opaque = counter_data; enterprise = 0; format = 2006 */

/* Byte, packet, error and drop totals for each direction.
   NOTE(review): the inbound packet field is named "pkts_in" while the
   outbound one is "packets_out"; the names are left as written here, but
   one spelling was presumably intended for both - confirm. */
struct host_net_io {
   unsigned hyper bytes_in;  /* total bytes in */
   unsigned int pkts_in;     /* total packets in */
   unsigned int errs_in;     /* total errors in */
   unsigned int drops_in;    /* total drops in */
   unsigned hyper bytes_out; /* total bytes out */
   unsigned int packets_out; /* total packets out */
   unsigned int errs_out;    /* total errors out */
   unsigned int drops_out;   /* total drops out */
}

Reply via email to