Hi,

We are facing the problem that a post-mortem investigation of a
multi-threaded program is not possible under RedHat Linux 7.1 (kernel
version 2.4.2-2), because loading the core into gdb 5.0 does not show
the correct crash location.

Attached is a test program, linux.c.
This is a producer-consumer program, with pairs of threads sending each
other messages. Its structure is described in the comments at the very
beginning of the source code. If 0 is typed on stdin, a divide-by-zero
exception happens.

If you use GDB to look into the core, you'll see it gets some errors
while loading, and doesn't show the correct line which causes the
arithmetic exception:

rock[19] gdb yahuitest/p15 core
GNU gdb 5.0rh-5 Red Hat Linux 7.1
Copyright 2001 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you
are
welcome to change it and/or distribute copies of it under certain
conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for
details.
This GDB was configured as "i386-redhat-linux"...
Core was generated by `yahuitest/p15 100000'.
Program terminated with signal 6, Aborted.
Reading symbols from /lib/i686/libpthread.so.0...done.

warning: Unable to set global thread event mask: generic error
[New Thread 1024 (LWP 5458)]
Error while reading shared library symbols:
Can't attach LWP 5458: No such process
...

Obviously, if one runs the program in the debugger, it shows the correct
location.

Another problem is that a core is not always dumped. I run this program
repeatedly with the attached 0.csh script. You'll find that it stops with
no core dumped after a few iterations.

Does anyone know whether this is a problem that we can work around with
the current versions of the kernel (e.g., by setting a signal handler,
etc.)? Or do we need a newer version of GDB?

Thanks.

Yahui

0.csh

/* linux.c
 *
 * This is an N-pair program. Each consumer takes a message from the buffer in
 * which the corresponding producer puts data. Each pair is independent of the
 * others. One producer is picked at random to read from stdin. If 0 is typed
 * on stdin, an arithmetic exception happens. A core file is expected to be
 * dumped at this point.
 * Mutexes are used to synchronize the communication between producer and
 * consumer. To guarantee that each message is consumed before the producer
 * puts in a new one, condition variables are used.
 *
 * command line: linux 1000000
 * The command-line argument is the time (in microseconds) the producer sleeps
 * after each iteration.
 *
 * Written by Yahui Lin
 */

#include <limits.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

/* Number of independent producer/consumer thread pairs. */
#define  NUMPAIRS       15

/* One-slot mailbox per pair.  main() seeds every slot with 999, and both
 * thread functions treat 999 as "slot is empty". */
int     buffer[NUMPAIRS];

/* Index of the pair whose producer takes its values from stdin; chosen at
 * random by main(). */
int     seg_fault;

/* Microseconds each producer sleeps after an iteration (0 means no sleep). */
int     p_sleep;

/* Per-pair synchronization objects and thread ids, indexed by pair number. */
pthread_mutex_t     prod_mutex[NUMPAIRS];
pthread_mutex_t     cons_mutex[NUMPAIRS];
pthread_t           prod_id[NUMPAIRS];
pthread_t           cons_id[NUMPAIRS];
pthread_cond_t      prod_cond[NUMPAIRS];
pthread_cond_t      cons_cond[NUMPAIRS];

/* Thread entry points; the argument carries the pair index. */
void *producer(void *arg);
void *consumer(void *arg);

/*
 * sighandler - signal handler; main() installs it for SIGFPE.
 *
 * On SIGFPE the process is terminated with abort().  For any other signal
 * number a diagnostic line is written to standard error and the handler
 * returns.
 *
 * Only async-signal-safe functions (abort, write) are called here: stdio
 * functions such as fprintf() may deadlock or corrupt stream state when
 * invoked from a handler that interrupted another stdio call, so the
 * message is formatted by hand and emitted with a single write().
 */
void  sighandler(int sig)
{
     static const char  prefix[] = "unexpected signal in handler: ";
     char               msg[sizeof(prefix) + 16];  /* prefix, sign, 10 digits, '\n' */
     char               digits[12];
     size_t             len = 0, ndigits = 0, k;
     unsigned int       magnitude;
     ssize_t            written;

     if( sig == SIGFPE )
        abort();

     for( k = 0; k < sizeof(prefix) - 1; k++ )
        msg[len++] = prefix[k];

     /* Convert through unsigned so that negating INT_MIN cannot overflow. */
     magnitude = (unsigned int)sig;
     if( sig < 0 )
     {
        msg[len++] = '-';
        magnitude = 0u - magnitude;
     }

     /* Digits come out least-significant first; copy them back reversed. */
     do {
        digits[ndigits++] = (char)('0' + magnitude % 10);
        magnitude /= 10;
     } while( magnitude != 0 );
     while( ndigits > 0 )
        msg[len++] = digits[--ndigits];
     msg[len++] = '\n';

     /* Nothing useful can be done if the diagnostic itself fails. */
     written = write(STDERR_FILENO, msg, len);
     (void)written;
}


/* Print "<what>: <reason>" for a failed pthread call.  pthread functions
 * return the error number instead of setting errno, so perror() would
 * report an unrelated error. */
static void report_pthread_error(const char *what, int err)
{
     fprintf(stderr, "%s: %s\n", what, strerror(err));
}

/*
 * main - set up NUMPAIRS producer/consumer thread pairs and wait forever.
 *
 * argv[1] is the number of microseconds each producer sleeps after an
 * iteration; negative values are treated as 0.
 *
 * Steps: install sighandler() for SIGFPE, mark every buffer slot empty
 * (999) and initialise the per-pair mutexes and condition variables, pick
 * at random the pair whose producer reads from stdin, then start all
 * threads.
 *
 * Exit status: 1 for usage, signal-setup or mutex/condition-variable
 * initialisation errors, 2 if a producer thread cannot be created, 3 if a
 * consumer thread cannot be created.
 */
int main(int argc, char *argv[])
{
     int                i, rc;
     long               sleep_arg;
     struct sigaction   act;

     if( argc != 2 )
     {
        fprintf(stderr, "usage: %s prod_sleep_time\n", argv[0]);
        exit(1);
     }

     /* strtol() saturates on overflow, where atoi() has undefined behaviour;
      * clamp the result into the non-negative int range p_sleep can hold. */
     sleep_arg = strtol(argv[1], NULL, 10);
     if( sleep_arg < 0 )
        sleep_arg = 0;
     else if( sleep_arg > INT_MAX )
        sleep_arg = INT_MAX;
     p_sleep = (int)sleep_arg;

     /* Zero the whole structure so that no field is left indeterminate. */
     memset(&act, 0, sizeof(act));
     act.sa_handler = sighandler;
     sigemptyset(&act.sa_mask);
     sigaddset(&act.sa_mask, SIGFPE);
     act.sa_flags = 0;
     if( sigaction(SIGFPE, &act, NULL) == -1 ){
        perror("sigaction error");
        exit(1);
     }

     for(i=0; i<NUMPAIRS; i++){
         buffer[i] = 999;   /* 999 marks the slot as empty */
         if( (rc = pthread_mutex_init(&prod_mutex[i], NULL)) != 0 )
         {
            report_pthread_error("producer mutex init error", rc);
            exit(1);
         }
         if( (rc = pthread_mutex_init(&cons_mutex[i], NULL)) != 0 )
         {
            report_pthread_error("consumer mutex init error", rc);
            exit(1);
         }
         if( (rc = pthread_cond_init(&prod_cond[i], NULL)) != 0 )
         {
            report_pthread_error("producer condition init error", rc);
            exit(1);
         }
         if( (rc = pthread_cond_init(&cons_cond[i], NULL)) != 0 )
         {
            report_pthread_error("consumer condition init error", rc);
            exit(1);
         }
     }

     srand( (unsigned)time( NULL ) );
     seg_fault = rand() % NUMPAIRS;
     fprintf(stderr, "producer %d can signal consumer to generate an exception\n",
             seg_fault);

     for(i=0; i<NUMPAIRS; i++)
     {
         /* The pair index travels in the pointer value itself; going through
          * intptr_t keeps the int<->pointer conversion well defined where
          * pointers are wider than int. */
         rc = pthread_create(&prod_id[i], NULL, producer, (void *)(intptr_t)i);
         if( rc != 0 )
         {
            report_pthread_error("producer thread create error", rc);
            exit(2);
         }
         rc = pthread_create(&cons_id[i], NULL, consumer, (void *)(intptr_t)i);
         if( rc != 0 )
         {
            report_pthread_error("consumer thread create error", rc);
            exit(3);
         }
     }

     /* A failed join is reported but is not fatal. */
     for(i=0; i<NUMPAIRS; i++)
     {
         rc = pthread_join(cons_id[i], NULL);
         if( rc != 0 )
            report_pthread_error("pthread join error", rc);
     }

     while(1)
         sleep(1);

     fprintf(stderr, "unexpected exit\n");  /* not expected to be reached */
     exit(0);
}


/*
 * producer - thread body for the producing half of one pair.
 *
 * arg carries the pair index as an integer stored in the pointer value.
 *
 * Each iteration waits until the pair's buffer slot is empty (999), then
 * fills it and signals the consumer.  The producer whose index equals
 * seg_fault takes the value from a line read on stdin; all other producers
 * store 100.  After each iteration the thread sleeps p_sleep microseconds
 * when p_sleep is non-zero.  The loop never terminates.
 */
void *producer(void *arg)
{
    int    id, value;
    char   input[10];

    id = (int)(intptr_t)arg;
    fprintf(stderr, "producer %2d starts\n", id);

    while(1)
    {
        pthread_mutex_lock(&prod_mutex[id]);
        while( buffer[id] != 999 )  /* number isn't consumed yet */
              pthread_cond_wait(&prod_cond[id], &prod_mutex[id]);
        pthread_mutex_unlock(&prod_mutex[id]);

        /* Read stdin before taking cons_mutex so a blocking read never
         * happens with the lock held.  If fgets() fails (EOF or error) the
         * input buffer holds nothing valid, so fall back to the same
         * harmless value the other producers use. */
        value = 100;
        if( id == seg_fault && fgets(input, sizeof(input), stdin) != NULL )
           value = (int)strtol(input, NULL, 10);

        /* Store the value under the mutex the consumer holds while testing
         * the slot, so the write and that read cannot race. */
        pthread_mutex_lock(&cons_mutex[id]);
        buffer[id] = value;
        pthread_cond_signal(&cons_cond[id]);
        pthread_mutex_unlock(&cons_mutex[id]);

        if( p_sleep )
           usleep(p_sleep);
    }
}


/*
 * consumer - thread body for the consuming half of one pair.
 *
 * arg carries the pair index as an integer stored in the pointer value.
 *
 * Each iteration waits until the pair's buffer slot holds a value other
 * than 999, divides 10 by that value, then marks the slot empty again and
 * signals the producer.  A value of 0 makes the division fault on purpose;
 * that is the crash this test program exists to provoke.  The loop never
 * terminates.
 */
void *consumer (void *arg)
{
    int            id, value;
    volatile int   test;   /* volatile: the store, and so the division, must be kept */

    id = (int)(intptr_t)arg;

    fprintf(stderr, "consumer %2d starts\n", id);

    while(1)
    {
        pthread_mutex_lock(&cons_mutex[id]);
        while( buffer[id] == 999 )  /* number isn't put in yet */
              pthread_cond_wait(&cons_cond[id], &cons_mutex[id]);
        value = buffer[id];         /* snapshot while still holding the lock */
        pthread_mutex_unlock(&cons_mutex[id]);

        /* exception will happen here if the value taken from buffer[id] is 0 */
        if( value == 0 )
           fprintf(stderr, "consumer %d is going to cause an exception\n", id);
        test = 10 / value;
        (void)test;

        /* Mark the slot empty under the mutex the producer holds while
         * testing it, so the write and that read cannot race. */
        pthread_mutex_lock(&prod_mutex[id]);
        buffer[id] = 999;
        pthread_cond_signal(&prod_cond[id]);
        pthread_mutex_unlock(&prod_mutex[id]);
    }
}


Reply via email to