Dnia 2013-10-02, śro o godzinie 01:12 +0200, Cyril Brulebois pisze: > Hi again Tomasz, > > Cyril Brulebois <k...@debian.org> (2013-09-23): > > The squeeze.diff one seems to have unrelated noise in patches, > > presumably because something refreshed them while you were preparing > > the diff? Having a targeted patched like the first one would be nice, > > so please follow up to 723...@bugs.debian.org with a cleaner debdiff. > > kind reminder: o-p-u NEW freeze is less than 2 weeks from now. Fixing > this issue in a later point release is of course perfectly OK, I just > thought I'd give you a heads-up. >
Thanks for the reminder. I was aware of this, but other things, completely unrelated to software worlds, took my time and energy. I attach new patch with proposed changes. It only changes changelog, debian/rules (removal of offending file) and mentioned file. Best regards. -- Tomasz Rybak GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff -Nru pyopencl-0.92/debian/changelog pyopencl-0.92.dfsg/debian/changelog --- pyopencl-0.92/debian/changelog 2010-11-11 23:10:57.000000000 +0100 +++ pyopencl-0.92.dfsg/debian/changelog 2013-10-04 18:00:49.000000000 +0200 @@ -1,3 +1,9 @@ +pyopencl (0.92.dfsg-1) oldstable; urgency=low + + * Remove non-free file from examples (#722014, #723793). + + -- Tomasz Rybak <tomasz.ry...@post.pl> Fri, 04 Oct 2013 17:54:19 +0200 + pyopencl (0.92-1) unstable; urgency=high * New upstream release diff -Nru pyopencl-0.92/debian/rules pyopencl-0.92.dfsg/debian/rules --- pyopencl-0.92/debian/rules 2010-11-11 13:30:27.000000000 +0100 +++ pyopencl-0.92.dfsg/debian/rules 2013-10-04 18:00:49.000000000 +0200 @@ -36,6 +36,7 @@ git clone $(GIT_URL) $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION) cd $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION) && git checkout $(GIT_REVISION) rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.git $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.gitignore $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.gitmodules + rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/examples/matrix-multiply.py tar czf $(MODULE_NAME)_$(DEB_UPSTREAM_VERSION).orig.tar.gz $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION) rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION) diff -Nru pyopencl-0.92/examples/matrix-multiply.py pyopencl-0.92.dfsg/examples/matrix-multiply.py --- pyopencl-0.92/examples/matrix-multiply.py 2010-10-21 19:10:19.000000000 +0200 +++ pyopencl-0.92.dfsg/examples/matrix-multiply.py 1970-01-01 01:00:00.000000000 +0100 @@ -1,241 +0,0 @@ -# example provided by Eilif Muller - -from __future__ import division - -KERNEL_CODE = """ - -// Thread block size -#define BLOCK_SIZE %(block_size)d - -// Matrix dimensions -// (chosen as multiples of the thread block size for simplicity) -#define WA %(w_a)d // Matrix A width -#define HA %(h_a)d // Matrix A height -#define WB %(w_b)d // Matrix B width -#define HB WA // Matrix B height -#define WC WB // Matrix C width -#define HC HA // Matrix C height - - -/* - * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - -/* Matrix multiplication: C = A * B. - * Device code. - */ - -#define AS(j, i) As[i + j * BLOCK_SIZE] -#define BS(j, i) Bs[i + j * BLOCK_SIZE] - -//////////////////////////////////////////////////////////////////////////////// -//! Matrix multiplication on the device: C = A * B -//! WA is A's width and WB is B's width -//////////////////////////////////////////////////////////////////////////////// -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1))) -void -matrixMul( __global float* C, __global float* A, __global float* B) -{ - __local float As[BLOCK_SIZE*BLOCK_SIZE]; - __local float Bs[BLOCK_SIZE*BLOCK_SIZE]; - - // Block index - int bx = get_group_id(0); - int by = get_group_id(1); - - // Thread index - int tx = get_local_id(0); - int ty = get_local_id(1); - - // Index of the first sub-matrix of A processed by the block - int aBegin = WA * BLOCK_SIZE * by; - - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + WA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * bx; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * WB; - - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0.0f; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { - - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - AS(ty, tx) = A[a + WA * ty + tx]; - BS(ty, tx) = B[b + WB * ty + tx]; - - // Synchronize to make sure the matrices are loaded - barrier(CLK_LOCAL_MEM_FENCE); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - for (int k = 0; k < BLOCK_SIZE; ++k) - Csub += AS(ty, k) * BS(k, tx); - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub; - -} - -""" - -import pyopencl as cl -from time import time -import numpy - -block_size = 16 - -ctx = cl.create_some_context() - -for dev in ctx.devices: - assert dev.local_mem_size > 0 - -queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - -#queue = cl.CommandQueue(ctx) - -if False: - a_height = 4096 - #a_height = 1024 - a_width = 2048 - #a_width = 256 - #b_height == a_width - b_width = a_height - -elif False: - # like PyCUDA - a_height = 2516 - a_width = 1472 - b_height = a_width - b_width = 2144 - -else: - # CL SDK - a_width = 50*block_size - a_height = 100*block_size - b_width = 50*block_size - b_height = a_width - -c_width = b_width -c_height = a_height - -h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32) -h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32) -h_c = numpy.empty((c_height, c_width)).astype(numpy.float32) - - -kernel_params = {"block_size": block_size, - "w_a":a_width, "h_a":a_height, "w_b":b_width} - -if "NVIDIA" in queue.device.vendor: - options = "-cl-mad-enable -cl-fast-relaxed-math" -else: - options = None -prg = cl.Program(ctx, KERNEL_CODE % kernel_params, - ).build(options=options) -kernel = prg.matrixMul -#print prg.binaries[0] - -assert a_width % block_size == 0 -assert a_height % block_size == 0 -assert b_width % block_size == 0 - -# transfer host -> device ----------------------------------------------------- -mf = cl.mem_flags - -t1 = time() - -d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a) -d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b) -d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes) - -push_time = time()-t1 - -# warmup ---------------------------------------------------------------------- -for i in range(5): - event = kernel(queue, h_c.shape, (block_size, block_size), - d_c_buf, d_a_buf, d_b_buf) - event.wait() - -queue.finish() - -# actual benchmark ------------------------------------------------------------ -t1 = time() - -count = 20 -for i in range(count): - event = kernel(queue, h_c.shape, (block_size, block_size), - d_c_buf, d_a_buf, d_b_buf) - -event.wait() - -gpu_time = (time()-t1)/count - -# transfer device -> host ----------------------------------------------------- -t1 = time() -cl.enqueue_read_buffer(queue, d_c_buf, h_c).wait() -pull_time = time()-t1 - -# timing output --------------------------------------------------------------- -gpu_total_time = gpu_time+push_time+pull_time - -print "GPU push+compute+pull total [s]:", gpu_total_time -print "GPU push [s]:", push_time -print "GPU pull [s]:", pull_time -print "GPU compute (host-timed) [s]:", gpu_time -print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9 - -gflop = h_c.size * (a_width * 2.) / (1000**3.) -gflops = gflop / gpu_time - -print -print "GFlops/s:", gflops - -# cpu comparison -------------------------------------------------------------- -t1 = time() -h_c_cpu = numpy.dot(h_a,h_b) -cpu_time = time()-t1 - -print -print "GPU==CPU:",numpy.allclose(h_c, h_c_cpu) -print -print "CPU time (s)", cpu_time -print - -print "GPU speedup (with transfer): ", cpu_time/gpu_total_time -print "GPU speedup (without transfer): ", cpu_time/gpu_time -