Package: libarrayfire-opencl3 Version: 3.3.2+dfsg1-4 Severity: important Dear Maintainer,
* What led up to the situation? As a trivial example, I am trying to implement an OLS solver based on arrayfire. * What exactly did you do (or not do) that was effective (or ineffective)? Consider the following C++ file (fastLm.cpp): #include <arrayfire.h> #include <iostream> int main() { dim_t n = 100000; dim_t p = 40; // af::setBackend(AF_BACKEND_CPU); af::array X = af::randn(n, p); af::array y = af::matmul(X, af::constant(0.5, p)) + af::randn(n); std::cout << "X: " << X.dims() << std::endl; std::cout << "y: " << y.dims() << std::endl; af::array tXX = af::matmulTN(X, X); std::cout << "tXX: " << tXX.dims() << std::endl; af::array tXy = af::matmulTN(X, y); std::cout << "tXy: " << tXy.dims() << std::endl; af::array coef = af::solve(tXX, tXy); std::cout << "coef: " << coef.dims() << std::endl; return 0; } I compiled this file with $ g++ -laf -o fastLm fastLm.cpp * What was the outcome of this action? The following error was displayed: $ ./fastLm ======================================================== AN INTERNAL KERNEL BUILD ERROR OCCURRED! 
device name = Intel(R) HD Graphics Skylake ULT GT2 error = -11 memory pattern = Cached global memory based block gemv, computing kernel generator Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 1, dims[0].y = 32, dims[0].x = 1, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 1, dims[1].y = 4, dims[1].x = 1, dims[1].bwidth = 4; ; Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64 Kernel extra flags: 789680 Source: typedef union GPtr { __global float *f; __global float2 *f2v; __global float4 *f4v; __global float8 *f8v; __global float16 *f16v; } GPtr; typedef union LPtr { __local float *f; __local float2 *f2v; __local float4 *f4v; __local float8 *f8v; __local float16 *f16v; } LPtr; typedef union PPtr { float *f; float2 *f2v; float4 *f4v; float8 *f8v; float16 *f16v; } PPtr; __attribute__((reqd_work_group_size(64, 1, 1))) void __kernel sgemv( uint M, uint N, const float alpha, const __global float *restrict A, const __global float *restrict X, __global float *Y, uint lda) { // M always denotes length of Y and N denotes length of X in the kernel float4 a0; float4 x0; float4 y0; y0 = 0; __local float4 localRes[64][1]; uint coordA = (get_group_id(0) * 8 + get_local_id(0) % 8) * 4; uint k0 = (get_local_id(0) / 8) * 4; if (coordA < M && k0 < N) { const GPtr Ag = {(__global float*)A}; const GPtr Xg = {(__global float*)X}; uint Ntail = N % 4; N -= Ntail; uint k = k0; for (; k < N; k += 32) { const uint xk = k / 4; x0 = Xg.f4v[xk + 0]; /* -- Tiles multiplier -- */ const uint ay = (uint)(coordA >> 2); const uint4 ak = {mad24(k, (lda >> 2), 0u), mad24(k + 1, (lda >> 2), 0u), mad24(k + 2, (lda >> 2), 0u), mad24(k + 3, (lda >> 2), 0u)}; a0 = Ag.f4v[ay + ak.s0]; y0 += a0 * x0.s0; a0 = Ag.f4v[ay + ak.s1]; y0 += a0 * x0.s1; a0 = Ag.f4v[ay + ak.s2]; y0 += a0 * x0.s2; a0 = Ag.f4v[ay + ak.s3]; y0 += a0 * x0.s3; /* ---------------------- */ } N += Ntail; if (k < N) { x0.s0 = X[k + 0 < N ? 
k : 0]; x0.s1 = X[k + 1 < N ? k + 1 : 0]; x0.s2 = X[k + 2 < N ? k + 2 : 0]; x0.s3 = X[k + 3 < N ? k + 3 : 0]; x0.s0 = k + 0 < N ? x0.s0 : 0; x0.s1 = k + 1 < N ? x0.s1 : 0; x0.s2 = k + 2 < N ? x0.s2 : 0; x0.s3 = k + 3 < N ? x0.s3 : 0; /* -- Tiles multiplier -- */ const uint ay = (uint)(coordA >> 2); const uint4 ak = {mad24(k % N, (lda >> 2), 0u), mad24((k + 1) % N, (lda >> 2), 0u), mad24((k + 2) % N, (lda >> 2), 0u), mad24((k + 3) % N, (lda >> 2), 0u)}; a0 = Ag.f4v[ay + ak.s0]; y0 += a0 * x0.s0; a0 = Ag.f4v[ay + ak.s1]; y0 += a0 * x0.s1; a0 = Ag.f4v[ay + ak.s2]; y0 += a0 * x0.s2; a0 = Ag.f4v[ay + ak.s3]; y0 += a0 * x0.s3; /* ---------------------- */ } } localRes[get_local_id(0)][0] = y0; barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) < 8 && coordA < M && k0 < N) { for (uint i = 1; i < 8; i++) { y0 += localRes[get_local_id(0) + i*8][0]; } Y += coordA; float4 r0; GPtr uC; uC.f = Y; r0 = uC.f4v[0]; r0 = alpha * y0; uC.f4v[0] = r0; } } -------------------------------------------------------- Build log: error: unknown argument: '-g' ======================================================== Speicherzugriffsfehler * What outcome did you expect instead? The same output as I get from the CPU backend after uncommenting "af::setBackend", i.e.: $ ./fastLm X: 100000 40 1 1 y: 100000 1 1 1 tXX: 40 40 1 1 tXy: 40 1 1 1 coef: 40 1 1 1 When I install the upstream version 3.5.1 via upstream's installer in a Docker image, fastLm.cpp works with both OpenCL and CPU backend. 
Greetings Ralf -- System Information: Debian Release: 9.1 APT prefers stable APT policy: (500, 'stable') Architecture: amd64 (x86_64) Kernel: Linux 4.9.0-4-amd64 (SMP w/4 CPU cores) Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8), LANGUAGE=de_DE.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/dash Init: systemd (via /run/systemd/system) Versions of packages libarrayfire-opencl3 depends on: ii libblas3 [libblas.so.3] 3.7.0-2 ii libc6 2.24-11+deb9u1 ii libclblas2 2.12-1 ii libclfft2 2.12.2-1+b1 ii libfreeimage3 3.17.0+ds1-5 ii libgcc1 1:6.3.0-18 ii liblapacke 3.7.0-2 ii libopenblas-base [libblas.so.3] 0.2.19-3 ii libstdc++6 6.3.0-18 ii ocl-icd-libopencl1 [libopencl1] 2.2.11-1 libarrayfire-opencl3 recommends no packages. libarrayfire-opencl3 suggests no packages. -- no debconf information