Re: [OMPI users] OpenMPI 1.1: Signal:10 info.si_errno:0(Unknown, error: 0), si_code:1(BUS_ADRALN) (Terry D. Dontje)
@ Terry (and All)!

Enclosed you'll find a (minor) bugfix with respect to the BUS_ADRALN error I reported recently when submitting jobs to the XGrid with OpenMPI 1.1. The BUS_ADRALN error on SPARC systems might be caused by a similar code segment. For the "bugfix" see line 55ff of the attached code file pls_xgrid_client.m. I haven't checked this yet, but it's very likely that the same code segment causes the BUS_ADRALN error in the trunk tarballs when submitting jobs to the XGrid with those releases.

Hope this will help you too, Eric.

Frank

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#import "orte_config.h"

#import

#import "orte/mca/pls/base/base.h"
#import "orte/orte_constants.h"
#import "orte/mca/ns/ns.h"
#import "orte/mca/ras/base/ras_base_node.h"
#import "orte/mca/gpr/gpr.h"
#import "orte/mca/rml/rml.h"
#import "opal/util/path.h"

#import "pls_xgrid_client.h"

char **environ;


/**
 * Set the daemons name in the registry.
 */
static int mca_pls_xgrid_set_node_name(orte_ras_node_t* node,
                                       orte_jobid_t jobid,
                                       orte_process_name_t* name)
{
    orte_gpr_value_t *values[1], *value;
    orte_gpr_keyval_t *kv;
    char* jobid_string;
    size_t i;
    int rc;

    values[0] = OBJ_NEW(orte_gpr_value_t);
    if (NULL == values[0]) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    // BUS_ADRALN error in line "value->cnt = 1" if value isn't assigned first
    value = values[0];
    value->cnt = 1;
    value->addr_mode = ORTE_GPR_OVERWRITE;
    value->segment = strdup(ORTE_NODE_SEGMENT);
    // value = values[0];
    value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt * sizeof(orte_gpr_keyval_t*));
    if (NULL == value->keyvals) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(value);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    value->keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
    if (NULL == value->keyvals[0]) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(value);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    kv = value->keyvals[0];

    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(value);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(value->tokens),
                                                          &(value->num_tokens),
                                                          node->node_cellid,
                                                          node->node_name))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(value);
        free(jobid_string);
        return rc;
    }

    asprintf(&(kv->key), "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string);
    kv->value = OBJ_NEW(orte_data_value_t);
    if (NULL == kv->value) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(value);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    kv->value->type = ORTE_NAME;
    if (ORTE_SUCCESS != (rc = orte_dss.copy(&(kv->value->data), name, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(value);
        return rc;
    }

    rc = orte_gpr.put(1, values);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    OBJ_RELEASE(value);

    return rc;
}


@implementation PlsXGridClient

/* init / finalize */
-(id) init
{
    return [self initWithControllerHostname: NULL
                      AndControllerPassword: NULL
                                   AndOrted: NULL
                                 AndCleanup: 1];
}

-(id) initWithControllerHostname: (char*) hostname
           AndControllerPassword: (char*) password
                        AndOrted: (char*) ortedname
                      AndCleanup: (int) val
{
    if (self = [super init]) {
        /* class-specific initialization goes here */
        OBJ_CONSTRUCT(&state_cond, opal_condition_t);
        OBJ_CONSTRUCT(&state_mutex, opal_mutex_t);

        if (NULL != password) {
            controller_password = [NSString stringWithCString: password];
        }
        if (NULL != hostname) {
            controller_hostname = [NSString stringWithCString: hostname];
        }
        cleanup = val;
        if (NULL != ortedname) {
            orted = [NSString stringWithCString: ortedname];
        }

        active_jobs = [NSMutableDictionary dictionary];
    }

    return self;
}

-(void) dealloc
{
    /* if supposed to clean up job
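For readers who don't want to wade through the whole attachment: the crash happens because the local pointer value is dereferenced before it is assigned from values[0]. The following stand-alone C sketch (hypothetical names and a stand-in struct, not the real ORTE types) reproduces the same pattern and shows the corrected ordering the patch introduces:

#include <stdio.h>
#include <stdlib.h>

struct gpr_value { int cnt; };              /* stand-in for orte_gpr_value_t */

int main(void)
{
    struct gpr_value *values[1], *value;

    values[0] = malloc(sizeof(struct gpr_value));
    if (NULL == values[0]) {
        return 1;
    }

    /* Buggy ordering (pre-patch): dereferencing 'value' here reads an
     * uninitialized pointer; on SPARC or PowerPC the garbage address is
     * often misaligned, which can surface as SIGBUS / BUS_ADRALN.
     *
     *     value->cnt = 1;
     *     value = values[0];
     */

    /* Fixed ordering (what the patch does): assign first, then dereference. */
    value = values[0];
    value->cnt = 1;

    printf("cnt = %d\n", value->cnt);
    free(values[0]);
    return 0;
}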
[OMPI users] MPI_Op_reduce()
Hello list,

I was searching for the ability to call any predefined MPI ops on arbitrary user buffers, without any communication, especially collective communication, but found nothing. So I 'invented' this function to do it. It is just a wrapper around the internal ompi_op_reduce() function from ompi/op/op.h:573, so there is no magic behind it. The only thing I didn't do is mark this function as OMPI-specific; maybe another prefix should be used. Also, there are only C bindings.

Hope this is helpful for other users too.

Greetings,
Bert Wesarg

diff -urN openmpi-1.1/ompi/include/mpi.h.in openmpi-1.1-mpi_op_reduce/ompi/include/mpi.h.in
--- openmpi-1.1/ompi/include/mpi.h.in	2006-05-30 22:41:47.0 +0200
+++ openmpi-1.1-mpi_op_reduce/ompi/include/mpi.h.in	2006-07-01 11:59:23.893437057 +0200
@@ -1230,6 +1230,8 @@
 OMPI_DECLSPEC int MPI_Win_wait(MPI_Win win);
 OMPI_DECLSPEC double MPI_Wtick(void);
 OMPI_DECLSPEC double MPI_Wtime(void);
+OMPI_DECLSPEC int MPI_Op_reduce(void *source, void *target, int count,
+                                MPI_Datatype datatype, MPI_Op op);
 /*
@@ -1738,6 +1740,8 @@
 OMPI_DECLSPEC int PMPI_Win_wait(MPI_Win win);
 OMPI_DECLSPEC double PMPI_Wtick(void);
 OMPI_DECLSPEC double PMPI_Wtime(void);
+OMPI_DECLSPEC int PMPI_Op_reduce(void *source, void *target, int count,
+                                 MPI_Datatype datatype, MPI_Op op);
 #if defined(c_plusplus) || defined(__cplusplus)
 }
diff -urN openmpi-1.1/ompi/mca/io/romio/romio/adio/include/mpipr.h openmpi-1.1-mpi_op_reduce/ompi/mca/io/romio/romio/adio/include/mpipr.h
--- openmpi-1.1/ompi/mca/io/romio/romio/adio/include/mpipr.h	2006-05-30 22:42:01.0 +0200
+++ openmpi-1.1-mpi_op_reduce/ompi/mca/io/romio/romio/adio/include/mpipr.h	2006-07-01 15:28:32.453154536 +0200
@@ -200,6 +200,8 @@
 #define MPI_Op_create PMPI_Op_create
 #undef MPI_Op_free
 #define MPI_Op_free PMPI_Op_free
+#undef MPI_Op_reduce
+#define MPI_Op_reduce PMPI_Op_reduce
 #undef MPI_Pack
 #define MPI_Pack PMPI_Pack
 #undef MPI_Pack_size
diff -urN openmpi-1.1/ompi/mpi/c/Makefile.am openmpi-1.1-mpi_op_reduce/ompi/mpi/c/Makefile.am
--- openmpi-1.1/ompi/mpi/c/Makefile.am	2006-05-30 22:41:50.0 +0200
+++ openmpi-1.1-mpi_op_reduce/ompi/mpi/c/Makefile.am	2006-07-01 11:58:40.884442991 +0200
@@ -252,6 +252,7 @@
 op_create.c \
 op_f2c.c \
 op_free.c \
+op_reduce.c \
 open_port.c \
 pack_external.c \
 pack_external_size.c \
diff -urN openmpi-1.1/ompi/mpi/c/op_reduce.c openmpi-1.1-mpi_op_reduce/ompi/mpi/c/op_reduce.c
--- openmpi-1.1/ompi/mpi/c/op_reduce.c	1970-01-01 01:00:00.0 +0100
+++ openmpi-1.1-mpi_op_reduce/ompi/mpi/c/op_reduce.c	2006-07-01 15:22:16.29672 +0200
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/op/op.h"
+
+#if OMPI_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES
+#pragma weak MPI_Op_reduce = PMPI_Op_reduce
+#endif
+
+#if OMPI_PROFILING_DEFINES
+#include "ompi/mpi/c/profile/defines.h"
+#endif
+
+static const char FUNC_NAME[] = "MPI_Op_reduce";
+
+
+int MPI_Op_reduce(void *source, void *target, int count,
+                  MPI_Datatype datatype, MPI_Op op)
+{
+    int err = MPI_SUCCESS;
+
+    /* Error checking */
+
+    if (MPI_PARAM_CHECK) {
+        char *msg;
+        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+        if (MPI_OP_NULL == op) {
+            err = MPI_ERR_OP;
+        } else if (!ompi_op_is_valid(op, datatype, &msg, FUNC_NAME)) {
+            int ret = OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_OP, msg);
+            free(msg);
+            return ret;
+        } else {
+            OMPI_CHECK_DATATYPE_FOR_SEND(err, datatype, count);
+            OMPI_CHECK_USER_BUFFER(err, source, datatype, count);
+            OMPI_CHECK_USER_BUFFER(err, target, datatype, count);
+        }
+        OMPI_ERRHANDLER_CHECK(err, MPI_COMM_WORLD, err, FUNC_NAME);
+    }
+
+    if (0 == count) {
+        return MPI_SUCCESS;
+    }
+
+    OBJ_RETAIN(op);
+    ompi_op_reduce(op, source, target, count, datatype);
+    OBJ_RELEASE(op);
+
+    OMPI_ERRHANDLER_RETURN(err, MPI_COMM_WORLD, MPI_ERR_INTERN, FUNC_NAME);
+}
diff -urN openmpi-1.1/ompi/mpi/c/profile/defines.h openmpi-1.1-mpi_op_reduce/ompi/mpi/c/profile/defines.h
--- open
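A quick usage sketch for anyone who wants to try the patch. Note that MPI_Op_reduce() is the non-standard call added above (it is not part of the MPI standard), so this illustrative program only compiles and runs against a patched Open MPI 1.1 tree; the buffer contents are made up:

#include <stdio.h>
#include <mpi.h>

/* Reduce two purely local buffers with a predefined op -- no communication
 * involved.  With the wrapper's semantics, target becomes source op target
 * element-wise within this process only. */
int main(int argc, char **argv)
{
    int source[4] = { 1, 2, 3, 4 };
    int target[4] = { 10, 20, 30, 40 };
    int i;

    MPI_Init(&argc, &argv);

    /* target[i] += source[i] for MPI_SUM */
    MPI_Op_reduce(source, target, 4, MPI_INT, MPI_SUM);

    for (i = 0; i < 4; i++) {
        printf("target[%d] = %d\n", i, target[i]);   /* expect 11 22 33 44 */
    }

    MPI_Finalize();
    return 0;
}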
Re: [OMPI users] OpenMPI 1.1: Signal:10 info.si_errno:0(Unknown, error: 0), si_code:1(BUS_ADRALN) (Terry D. Dontje)
On Jul 1, 2006, at 4:42 AM, openmpi-user wrote:

> @ Terry (and All)! Enclosed you'll find a (minor) bugfix with respect to
> the BUS_ADRALN error I reported recently when submitting jobs to the XGrid
> with OpenMPI 1.1. The BUS_ADRALN error on SPARC systems might be caused by
> a similar code segment. For the "bugfix" see line 55ff of the attached code
> file pls_xgrid_client.m. I haven't checked this yet, but it's very likely
> that the same code segment causes the BUS_ADRALN error in the trunk
> tarballs when submitting jobs to the XGrid with those releases.

Thanks for the patch. The XGrid code is OS X only, so we still have some work to do on Solaris. I'm not sure how this bug lived through testing. I've applied it to our Subversion source and it will be part of the Open MPI 1.1.1 release.

Thanks,

Brian

--
Brian Barrett
Open MPI developer
http://www.open-mpi.org/