Currently, the nvptx libgomp plugin indiscriminately sets num_gangs to 32, regardless of the underlying CUDA hardware. Depending on the GPU, that value can be extremely conservative. The attached patch implements a more sophisticated approach: it probes the hardware at run time and calculates the number of gangs needed to saturate the hardware's resources. This solution may not be optimal; I've seen other approaches where the compiler cooperates with the runtime to set num_gangs to the number of iterations of the gang loop. However, the approach taken in this patch greatly improves the performance of OpenACC parallel code in SPEC ACCEL.
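The core of the calculation looks roughly like this (a sketch distilled from the patch below, not the patch code itself; the parameter names are mine):

  static int
  estimate_num_gangs (int num_regs, int warp_size, int rf_size,
                      int reg_unit_size, int reg_granularity,
                      int max_threads_per_sm, int sm_count)
  {
    if (num_regs < 1)
      num_regs = 1;
    if (reg_unit_size <= 0 || reg_granularity <= 0)
      return 2 * sm_count;  /* Unknown target: one warp per SM.  */
    /* Round one warp's register demand up to the allocation unit.  */
    int regs_per_warp = ((num_regs * warp_size + reg_unit_size - 1)
                         / reg_unit_size) * reg_unit_size;
    /* Threads the register file lets be resident on one SM, honoring
       the warp allocation granularity.  */
    int threads_per_sm = (rf_size / regs_per_warp / reg_granularity)
                         * reg_granularity * warp_size;
    if (threads_per_sm > max_threads_per_sm)
      threads_per_sm = max_threads_per_sm;
    /* Offer twice the resident capacity so the device never idles.  */
    return 2 * threads_per_sm / warp_size * sm_count;
  }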
Besides selecting num_gangs dynamically, this patch also teaches the GOMP_OPENACC_DIM environment variable parser to accept a '-' argument for the num_gangs field. That argument lets the runtime set num_gangs dynamically while still allowing the end user to specify num_workers and vector_length. Because the nvptx port does not perform any register allocation (that is deferred to the CUDA driver's JIT), there are situations where the hardware does not have sufficient resources to satisfy the default num_workers. As a stopgap, this patch teaches the nvptx plugin to error out gracefully when it encounters such a situation, and to tell the user how to adjust num_workers to make the program work. The latter two changes are very small, so I combined them into a single patch.

Is this OK for trunk?

Thanks,
Cesar
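A note for reviewers: the '-' field handling amounts to something like the sketch below. This is illustrative only -- the actual patch folds it into the existing strtol loop in nvptx_exec -- but it shows the intended contract: a '-' (or invalid) field yields a sentinel meaning "let the runtime decide".

  #include <errno.h>
  #include <stdlib.h>

  /* Parse one GOMP_OPENACC_DIM field at *pos, advancing past it.
     Returns the requested dimension, or -1 for a '-' or invalid
     field, i.e. "let the runtime decide".  */
  static int
  parse_dim_field (const char **pos)
  {
    if (**pos == '-')
      {
        (*pos)++;
        return -1;
      }
    errno = 0;
    char *end;
    long val = strtol (*pos, &end, 10);
    if (errno || end == *pos || val < 0 || (unsigned) val != val)
      return -1;
    *pos = end;
    return (int) val;
  }

So, for example, GOMP_OPENACC_DIM=-:8:32 leaves num_gangs to the runtime while pinning num_workers to 8 and vector_length to 32.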
2017-07-05  Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	* plugin/cuda/cuda.h (CUdevice_attribute): Add
	CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
	(CUfunction_attribute): Add CU_FUNC_ATTRIBUTE_BINARY_VERSION.
	* plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add
	num_regs member.
	(struct ptx_device): Rename num_sms, regs_per_block, regs_per_sm
	as multiprocessor_count, max_registers_per_block,
	max_registers_per_multiprocessor, respectively.  Add members
	max_threads_per_block, warp_size, max_threads_per_multiprocessor,
	max_shared_memory_per_multiprocessor, binary_version,
	register_allocation_unit_size, register_allocation_granularity.
	(nvptx_open_device): Initialize new and renamed members in
	ptx_device.
	(nvptx_exec): Dynamically set num_gangs based on hardware
	resources.  Add support for '-' gang argument to GOMP_OPENACC_DIM
	environment variable.  Describe how to reduce num_workers when
	the hardware lacks sufficient resources for the default.
	(GOMP_OFFLOAD_load_image): Initialize new and renamed
	targ_fn_descriptor members.
	(nvptx_adjust_launch_bounds): Adjust names of regs_per_sm and
	num_sms.

diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 25d5d19..3199a93 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -69,6 +69,7 @@ typedef enum {
   CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
   CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
 } CUdevice_attribute;
 
@@ -79,7 +80,8 @@ enum {
 typedef enum {
   CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
-  CU_FUNC_ATTRIBUTE_NUM_REGS = 4
+  CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+  CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6
 } CUfunction_attribute;
 
 typedef enum {
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 71630b5..802f76d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -372,6 +372,9 @@ struct targ_fn_descriptor
   const struct targ_fn_launch *launch;
   int regs_per_thread;
   int max_threads_per_block;
+
+  /* Cuda function properties.  */
+  int num_regs;
 };
 
 /* A loaded PTX image.  */
@@ -408,9 +411,21 @@ struct ptx_device
   bool mkern;
   int mode;
   int clock_khz;
-  int num_sms;
-  int regs_per_block;
-  int regs_per_sm;
+  int max_threads_per_block;
+  int warp_size;
+  int multiprocessor_count;
+  int max_threads_per_multiprocessor;
+  int max_registers_per_block;
+  int max_registers_per_multiprocessor;
+  int max_shared_memory_per_multiprocessor;
+
+  int binary_version;
+
+  /* register_allocation_unit_size and register_allocation_granularity
+     were extracted from the "Register Allocation Granularity" field in
+     Nvidia's CUDA Occupancy Calculator spreadsheet.  */
+  int register_allocation_unit_size;
+  int register_allocation_granularity;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -722,6 +737,9 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->binary_version = 0;
+  ptx_dev->register_allocation_unit_size = 0;
+  ptx_dev->register_allocation_granularity = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -770,33 +788,46 @@ nvptx_open_device (int n)
   ptx_dev->clock_khz = pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+  ptx_dev->max_threads_per_block = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+  if (pi != 32)
+    {
+      GOMP_PLUGIN_error ("Only warp size 32 is supported");
+      return NULL;
+    }
+  ptx_dev->warp_size = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
-  ptx_dev->num_sms = pi;
+  ptx_dev->multiprocessor_count = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+  ptx_dev->max_threads_per_multiprocessor = pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
-  ptx_dev->regs_per_block = pi;
+  ptx_dev->max_registers_per_block = pi;
 
   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
      in CUDA 6.0 and newer.  */
-  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
-  /* Fallback: use limit of registers per block, which is usually equal.  */
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
+			 dev);
   if (r == CUDA_ERROR_INVALID_VALUE)
-    pi = ptx_dev->regs_per_block;
+    pi = ptx_dev->max_registers_per_block;
   else if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
       return NULL;
     }
-  ptx_dev->regs_per_sm = pi;
+  ptx_dev->max_registers_per_multiprocessor = pi;
 
-  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
-		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
-  if (pi != 32)
-    {
-      GOMP_PLUGIN_error ("Only warp size 32 is supported");
-      return NULL;
-    }
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+		  dev);
+  ptx_dev->max_shared_memory_per_multiprocessor = pi;
 
   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
@@ -806,6 +837,22 @@ nvptx_open_device (int n)
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
+  GOMP_PLUGIN_debug (0, "Nvidia device %d:\n\tGPU_OVERLAP = %d\n"
+		     "\tCAN_MAP_HOST_MEMORY = %d\n\tCONCURRENT_KERNELS = %d\n"
+		     "\tCOMPUTE_MODE = %d\n\tINTEGRATED = %d\n"
+		     "\tMAX_THREADS_PER_BLOCK = %d\n\tWARP_SIZE = %d\n"
+		     "\tMULTIPROCESSOR_COUNT = %d\n"
+		     "\tMAX_THREADS_PER_MULTIPROCESSOR = %d\n"
+		     "\tMAX_REGISTERS_PER_MULTIPROCESSOR = %d\n"
+		     "\tMAX_SHARED_MEMORY_PER_MULTIPROCESSOR = %d\n",
+		     ptx_dev->ord, ptx_dev->overlap, ptx_dev->map,
+		     ptx_dev->concur, ptx_dev->mode, ptx_dev->mkern,
+		     ptx_dev->max_threads_per_block, ptx_dev->warp_size,
+		     ptx_dev->multiprocessor_count,
+		     ptx_dev->max_threads_per_multiprocessor,
+		     ptx_dev->max_registers_per_multiprocessor,
+		     ptx_dev->max_shared_memory_per_multiprocessor);
+
   if (!init_streams_for_device (ptx_dev, async_engines))
     return NULL;
@@ -1070,6 +1117,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *hp, *dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
+  int cpu_size = nvptx_thread ()->ptx_dev->max_threads_per_multiprocessor;
+  int block_size = nvptx_thread ()->ptx_dev->max_threads_per_block;
+  int dev_size = nvptx_thread ()->ptx_dev->multiprocessor_count;
+  int warp_size = nvptx_thread ()->ptx_dev->warp_size;
+  int rf_size = nvptx_thread ()->ptx_dev->max_registers_per_multiprocessor;
+  int reg_unit_size = nvptx_thread ()->ptx_dev->register_allocation_unit_size;
+  int reg_granularity
+    = nvptx_thread ()->ptx_dev->register_allocation_granularity;
 
   function = targ_fn->fn;
 
@@ -1083,107 +1138,135 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   for (i = 0; i != GOMP_DIM_MAX; i++)
     {
       if (targ_fn->launch->dim[i])
-        dims[i] = targ_fn->launch->dim[i];
+	dims[i] = targ_fn->launch->dim[i];
       if (!dims[i])
-        seen_zero = 1;
+	seen_zero = 1;
     }
 
-  if (seen_zero)
-    {
-      /* See if the user provided GOMP_OPENACC_DIM environment
-	 variable to specify runtime defaults.  */
-      static int default_dims[GOMP_DIM_MAX];
+  /* See if the user provided GOMP_OPENACC_DIM environment variable to
+     specify runtime defaults.  */
+  static int default_dims[GOMP_DIM_MAX];
 
-      pthread_mutex_lock (&ptx_dev_lock);
-      if (!default_dims[0])
+  pthread_mutex_lock (&ptx_dev_lock);
+  if (!default_dims[0])
+    {
+      const char *var_name = "GOMP_OPENACC_DIM";
+      /* We only read the environment variable once.  You can't
+	 change it in the middle of execution.  The syntax is
+	 the same as for the -fopenacc-dim compilation option.  */
+      const char *env_var = getenv (var_name);
+      notify_var (var_name, env_var);
+      if (env_var)
 	{
-	  const char *var_name = "GOMP_OPENACC_DIM";
-	  /* We only read the environment variable once.  You can't
-	     change it in the middle of execution.  The syntax is
-	     the same as for the -fopenacc-dim compilation option.  */
-	  const char *env_var = getenv (var_name);
-	  notify_var (var_name, env_var);
-	  if (env_var)
-	    {
-	      const char *pos = env_var;
+	  const char *pos = env_var;
 
-	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	  for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	    {
+	      if (i && *pos++ != ':')
+		break;
+	      if (*pos != ':')
 		{
-		  if (i && *pos++ != ':')
+		  const char *eptr;
+
+		  errno = 0;
+		  long val = strtol (pos, (char **)&eptr, 10);
+		  if (errno || val < 0 || (unsigned)val != val)
 		    break;
-		  if (*pos != ':')
-		    {
-		      const char *eptr;
-
-		      errno = 0;
-		      long val = strtol (pos, (char **)&eptr, 10);
-		      if (errno || val < 0 || (unsigned)val != val)
-			break;
-		      default_dims[i] = (int)val;
-		      pos = eptr;
-		    }
+		  default_dims[i] = (int)val;
+		  pos = eptr;
 		}
 	    }
+	}
 
-      int warp_size, block_size, dev_size, cpu_size;
-      CUdevice dev = nvptx_thread()->ptx_dev->dev;
-      /* 32 is the default for known hardware.  */
-      int gang = 0, worker = 32, vector = 32;
-      CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-      cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-      cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-      cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-      cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-      if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
-			     dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
-				dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
-				dev) == CUDA_SUCCESS)
-	{
-	  GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-			     " dev_size=%d, cpu_size=%d\n",
-			     warp_size, block_size, dev_size, cpu_size);
-	  gang = (cpu_size / block_size) * dev_size;
-	  worker = block_size / warp_size;
-	  vector = warp_size;
-	}
+      /* 32 is the default for known hardware.  */
+      int gang = 0, worker = 32, vector = 32;
+
+      gang = (cpu_size / block_size) * dev_size;
+      worker = block_size / warp_size;
+      vector = warp_size;
+
+      /* If the user hasn't specified the number of gangs, determine
+	 it dynamically based on the hardware configuration.  */
+      if (default_dims[GOMP_DIM_GANG] == 0)
+	default_dims[GOMP_DIM_GANG] = -1;
+      /* The worker size must not exceed the hardware.  */
+      if (default_dims[GOMP_DIM_WORKER] < 1
+	  || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	default_dims[GOMP_DIM_WORKER] = worker;
+      /* The vector size must exactly match the hardware.  */
+      if (default_dims[GOMP_DIM_VECTOR] < 1
+	  || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	default_dims[GOMP_DIM_VECTOR] = vector;
+
+      GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+			 default_dims[GOMP_DIM_GANG],
+			 default_dims[GOMP_DIM_WORKER],
+			 default_dims[GOMP_DIM_VECTOR]);
+    }
+  pthread_mutex_unlock (&ptx_dev_lock);
 
-      /* There is no upper bound on the gang size.  The best size
-	 matches the hardware configuration.  Logical gangs are
-	 scheduled onto physical hardware.  To maximize usage, we
-	 should guess a large number.  */
-      if (default_dims[GOMP_DIM_GANG] < 1)
-	default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
-      /* The worker size must not exceed the hardware.  */
-      if (default_dims[GOMP_DIM_WORKER] < 1
-	  || (default_dims[GOMP_DIM_WORKER] > worker && gang))
-	default_dims[GOMP_DIM_WORKER] = worker;
-      /* The vector size must exactly match the hardware.  */
-      if (default_dims[GOMP_DIM_VECTOR] < 1
-	  || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
-	default_dims[GOMP_DIM_VECTOR] = vector;
-
-      GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
-			 default_dims[GOMP_DIM_GANG],
-			 default_dims[GOMP_DIM_WORKER],
-			 default_dims[GOMP_DIM_VECTOR]);
-    }
-  pthread_mutex_unlock (&ptx_dev_lock);
+  /* Calculate the optimal number of gangs for the current device.  */
+  int reg_used = targ_fn->num_regs;
+  int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1)
+		      / reg_unit_size) * reg_unit_size;
+  int threads_per_sm = (rf_size / reg_per_warp / reg_granularity)
+    * reg_granularity * warp_size;
+
+  if (threads_per_sm > cpu_size)
+    threads_per_sm = cpu_size;
 
+  if (seen_zero)
+    {
       for (i = 0; i != GOMP_DIM_MAX; i++)
 	if (!dims[i])
-	  dims[i] = default_dims[i];
+	  {
+	    if (default_dims[i] > 0)
+	      dims[i] = default_dims[i];
+	    else
+	      switch (i) {
+	      case GOMP_DIM_GANG:
+		/* The constant 2 was chosen somewhat empirically.  The
+		   justification behind it is to prevent the hardware
+		   from idling by throwing at it twice the amount of
+		   work that it can physically handle.  */
+		dims[i] = (reg_granularity > 0)
+		  ? 2 * threads_per_sm / warp_size * dev_size
+		  : 2 * dev_size;
+		break;
+	      case GOMP_DIM_WORKER:
+	      case GOMP_DIM_VECTOR:
+		dims[i] = warp_size;
+		break;
+	      default:
+		abort ();
+	      }
+	  }
+    }
+
+  /* Check if the accelerator has sufficient hardware resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 1)
+    {
+      int threads_per_block = threads_per_sm > block_size
+	? block_size : threads_per_sm;
+
+      threads_per_block /= warp_size;
+
+      if (reg_granularity > 0 && dims[GOMP_DIM_WORKER] > threads_per_block)
+	GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources "
+			   "to launch '%s'; recompile the program with "
+			   "'num_workers = %d' on that offloaded region or "
+			   "'-fopenacc-dim=-:%d'.\n",
+			   targ_fn->launch->fn, threads_per_block,
+			   threads_per_block);
+    }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device.  HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
+  pthread_mutex_lock (&ptx_event_lock);
   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+  pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
@@ -1831,19 +1914,53 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
     {
       CUfunction function;
-      int nregs, mthrs;
+      int val;
 
       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		       fn_descs[i].fn);
-      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
-		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
-      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
-		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
 
       targ_fns->fn = function;
       targ_fns->launch = &fn_descs[i];
-      targ_fns->regs_per_thread = nregs;
-      targ_fns->max_threads_per_block = mthrs;
+
+      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &val,
+		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+      targ_fns->num_regs = val;
+
+      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &val,
+		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
+      targ_fns->max_threads_per_block = val;
+
+      if (!dev->binary_version)
+	{
+	  CUDA_CALL_ERET (-1, cuFuncGetAttribute, &val,
+			  CU_FUNC_ATTRIBUTE_BINARY_VERSION, function);
+	  dev->binary_version = val;
+
+	  /* These values were obtained from the CUDA Occupancy Calculator
+	     spreadsheet.  */
+	  if (dev->binary_version == 20
+	      || dev->binary_version == 21)
+	    {
+	      dev->register_allocation_unit_size = 128;
+	      dev->register_allocation_granularity = 2;
+	    }
+	  else if (dev->binary_version == 60)
+	    {
+	      dev->register_allocation_unit_size = 256;
+	      dev->register_allocation_granularity = 2;
+	    }
+	  else if (dev->binary_version <= 62)
+	    {
+	      dev->register_allocation_unit_size = 256;
+	      dev->register_allocation_granularity = 4;
+	    }
+	  else
+	    {
+	      /* Fall back to -1 for unknown targets.  */
+	      dev->register_allocation_unit_size = -1;
+	      dev->register_allocation_granularity = -1;
+	    }
+	}
 
       targ_tbl->start = (uintptr_t) targ_fns;
       targ_tbl->end = targ_tbl->start + 1;
@@ -2082,7 +2199,8 @@ nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
   /* This is an estimate of how many blocks the device can host
      simultaneously.  Actual limit, which may be lower, can be queried
      with "occupancy control" driver interface (since CUDA 6.0).  */
-  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
+  int max_blocks = ptx_dev->max_registers_per_multiprocessor
+    / regs_per_block * ptx_dev->multiprocessor_count;
   if (*teams_p <= 0 || *teams_p > max_blocks)
     *teams_p = max_blocks;
 }
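To make the sizing concrete, take a hypothetical device with a 65536-register file and 2048 thread slots per multiprocessor, 16 multiprocessors, warp size 32, register allocation unit size 256, and allocation granularity 4, running a kernel that uses 40 registers per thread. Using integer division throughout:

  regs per warp  = 40 * 32 = 1280 (already a multiple of 256)
  threads per SM = (65536 / 1280 / 4) * 4 * 32 = (51 / 4) * 4 * 32 = 1536
  num_gangs      = 2 * (1536 / 32) * 16 = 1536

i.e. the runtime would launch 1536 gangs on that device where the current code hard-wires 32.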