Coming back to my original problem, I wrote a simplified version of it,
which is about 10x slower than a vectorized MATLAB version. Have I missed
anything here?
# Benchmark inputs: constant-filled arrays whose extents match what `testSum`
# iterates over (50 x 40 x 40 tensor, 100 columns in each of the 2-D factors).
A = ones(50, 40, 40);
B = ones(50, 100) / 2;
C = ones(40, 100) / 3;
D = ones(40, 100) / 4;
E = ones(100, 100) / 5;
# Materialise the descending range 100, 99, ..., 1 as a Vector{Int64}.
# `collect` is used because wrapping a range in `[...]` does not expand it on
# current Julia, and the `int` conversion function no longer exists.
idx = collect(Int64, 100:-1:1);
"""
    testSum(A, B, C, D, E, idx)

Return a `Vector{Float64}` `alpha` with one entry per column of `B`, where

    alpha[t] = E[t, idx[t]] *
               sum(A[x_1, x_2, x_3] * B[x_1, t] * C[x_2, t] * D[x_3, t])

and the sum runs over every index `(x_1, x_2, x_3)` of `A`.

# Arguments
- `A`: `n1 x n2 x n3` array.
- `B`: `n1 x nt` matrix.
- `C`: `n2 x nt` matrix.
- `D`: `n3 x nt` matrix.
- `E`: matrix with `nt` rows; column `idx[t]` is read in row `t`.
- `idx`: length-`nt` vector of column indices into `E`.

# Throws
- `DimensionMismatch` if the array extents listed above do not agree.
- `BoundsError` if some `idx[t]` is not a valid column of `E`.
"""
function testSum(A::Array{Float64,3}, B::Array{Float64,2}, C::Array{Float64,2},
                 D::Array{Float64,2}, E::Array{Float64,2}, idx::Array{Int64,1})
    n1, n2, n3 = size(A)
    nt = size(B, 2)

    # The loops below run under @inbounds, so every extent and every index
    # they rely on is checked here, once, before any unchecked access.
    size(B, 1) == n1 || throw(DimensionMismatch(
        "B has $(size(B, 1)) rows, expected size(A, 1) = $n1"))
    size(C) == (n2, nt) || throw(DimensionMismatch(
        "C has size $(size(C)), expected ($n2, $nt)"))
    size(D) == (n3, nt) || throw(DimensionMismatch(
        "D has size $(size(D)), expected ($n3, $nt)"))
    size(E, 1) == nt || throw(DimensionMismatch(
        "E has $(size(E, 1)) rows, expected size(B, 2) = $nt"))
    length(idx) == nt || throw(DimensionMismatch(
        "idx has length $(length(idx)), expected size(B, 2) = $nt"))
    ncols = size(E, 2)
    for t = 1:nt
        1 <= idx[t] <= ncols || throw(BoundsError(E, (t, idx[t])))
    end

    alpha = zeros(nt)
    @inbounds for t = 1:nt
        tmp_3 = 0.0
        for x_3 = 1:n3
            tmp_2 = 0.0
            for x_2 = 1:n2
                # The innermost loop runs over the first index of A and B,
                # i.e. down a column, which is the contiguous direction.
                tmp_1 = 0.0
                @simd for x_1 = 1:n1
                    tmp_1 += A[x_1, x_2, x_3] * B[x_1, t]
                end
                tmp_2 += tmp_1 * C[x_2, t]
            end
            tmp_3 += tmp_2 * D[x_3, t]
        end
        alpha[t] = E[t, idx[t]] * tmp_3
    end
    return alpha
end