Let's assume that we have a function that multiplies two arrays of 1000000 doubles each. In C/C++ the function looks like this:
/* Element-wise in-place product: a[i] *= b[i] over 1,000,000 doubles. */
void mul_c(double* a, double* b)
{
    for (int i = 0; i < 1000000; i++)
    {
        a[i] *= b[i];
    }
}
The compiler produces the following assembly with -O2:
mul_c(double*, double*):
xor eax, eax
.L2:
movsd xmm0, QWORD PTR [rdi+rax]
mulsd xmm0, QWORD PTR [rsi+rax]
movsd QWORD PTR [rdi+rax], xmm0
add rax, 8
cmp rax, 8000000
jne .L2
rep ret
From the above assembly it seems that the compiler uses the SIMD instructions, but it only multiplies one double each iteration. So I decided to write the same function in inline assembly instead, where I make full use of the xmm0
register and multiply two doubles in one go:
/*
 * In-place packed multiply: a[i] *= b[i] for 1,000,000 doubles using
 * SSE2 (mulpd processes two doubles per iteration; rax advances 16
 * bytes until 8,000,000 = 1,000,000 * sizeof(double)).
 *
 * Fixes vs. the original:
 *  - string literals use "\n\t" escapes; a raw newline inside a C
 *    string literal is a constraint violation and does not compile,
 *  - ".att_syntax prefix" (not "noprefix") restores the %-prefixed
 *    AT&T syntax used by the compiler-generated asm around the block,
 *  - rax and xmm0 are declared as clobbers so the compiler knows the
 *    template overwrites them.
 */
void mul_asm(double* a, double* b)
{
    asm volatile
    (
        ".intel_syntax noprefix             \n\t"
        "xor rax, rax                       \n\t"
        "0:                                 \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax] \n\t"
        "mulpd  xmm0, xmmword ptr [rsi+rax] \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0 \n\t"
        "add rax, 16                        \n\t"
        "cmp rax, 8000000                   \n\t"
        "jne 0b                             \n\t"
        ".att_syntax prefix                 \n\t"
        :
        : "D" (a), "S" (b)
        : "rax", "xmm0", "memory", "cc"
    );
}
After measuring the execution time individually for both of these functions, it seems that both of them take 1 ms to complete:
> gcc -O2 main.cpp
> ./a.out < input
mul_c: 1 ms
mul_asm: 1 ms
[a lot of doubles...]
I expected the SIMD implementation to be at least twice as fast (0 ms) as there is only half the number of multiplications/memory instructions.
So my question is: Why isn't the SIMD implementation faster than the ordinary C/C++ implementation when the SIMD implementation only does half the amount of multiplications/memory instructions?
Here's the full program:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/* Multiplies each of the 1,000,000 elements of a by the matching
 * element of b, storing the result back into a. */
void mul_c(double* a, double* b)
{
    double* stop = a + 1000000;
    double* src = b;
    for (double* dst = a; dst != stop; ++dst, ++src)
    {
        *dst = *dst * *src;
    }
}
/*
 * In-place packed multiply: a[i] *= b[i] for 1,000,000 doubles using
 * SSE2 (mulpd processes two doubles per iteration; rax advances 16
 * bytes until 8,000,000 = 1,000,000 * sizeof(double)).
 *
 * Fixes vs. the original:
 *  - string literals use "\n\t" escapes; a raw newline inside a C
 *    string literal is a constraint violation and does not compile,
 *  - ".att_syntax prefix" (not "noprefix") restores the %-prefixed
 *    AT&T syntax used by the compiler-generated asm around the block,
 *  - rax and xmm0 are declared as clobbers so the compiler knows the
 *    template overwrites them.
 */
void mul_asm(double* a, double* b)
{
    asm volatile
    (
        ".intel_syntax noprefix             \n\t"
        "xor rax, rax                       \n\t"
        "0:                                 \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax] \n\t"
        "mulpd  xmm0, xmmword ptr [rsi+rax] \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0 \n\t"
        "add rax, 16                        \n\t"
        "cmp rax, 8000000                   \n\t"
        "jne 0b                             \n\t"
        ".att_syntax prefix                 \n\t"
        :
        : "D" (a), "S" (b)
        : "rax", "xmm0", "memory", "cc"
    );
}
/*
 * Reads 1,000,000 doubles from stdin, times mul_c and mul_asm with
 * gettimeofday (millisecond resolution), and prints the timings plus
 * the resulting array contents.
 *
 * Fixes vs. the original: printf format strings use "\n" escapes
 * instead of raw newlines (a constraint violation in C string
 * literals), allocation and scanf results are checked, and all three
 * buffers are freed before exit.
 */
int main()
{
    struct timeval t1;
    struct timeval t2;
    unsigned long long time;
    double* a = (double*)malloc(sizeof(double) * 1000000);
    double* b = (double*)malloc(sizeof(double) * 1000000);
    double* c = (double*)malloc(sizeof(double) * 1000000);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for (int i = 0; i != 1000000; ++i)
    {
        double v;
        if (scanf("%lf", &v) != 1)
        {
            fprintf(stderr, "invalid input at element %d\n", i);
            return 1;
        }
        a[i] = v;
        b[i] = v;
        c[i] = v;
    }
    gettimeofday(&t1, NULL);
    mul_c(a, b);
    gettimeofday(&t2, NULL);
    time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
    printf("mul_c: %llu ms\n", time);
    gettimeofday(&t1, NULL);
    mul_asm(b, c);
    gettimeofday(&t2, NULL);
    time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
    printf("mul_asm: %llu ms\n", time);
    for (int i = 0; i != 1000000; ++i)
    {
        printf("%lf%lf\n", a[i], b[i]);
    }
    free(a);
    free(b);
    free(c);
    return 0;
}
I also tried to make use of all the xmm
registers (0-7) and remove instruction dependencies to get better parallel computing:
/*
 * Same job as the simple SSE2 version (a[i] *= b[i] over 1,000,000
 * doubles), but unrolled 8x across xmm0-xmm7 to break the dependency
 * chain: 16 doubles (128 bytes of each array) are loaded, multiplied
 * and stored per loop iteration.
 *
 * Fixes vs. the original:
 *  - string literals use "\n\t" escapes; a raw newline inside a C
 *    string literal is a constraint violation and does not compile,
 *  - ".att_syntax prefix" (not "noprefix") restores the %-prefixed
 *    AT&T syntax used by the compiler-generated asm around the block,
 *  - rax and xmm0-xmm7 are declared as clobbers.
 */
void mul_asm(double* a, double* b)
{
    asm volatile
    (
        ".intel_syntax noprefix                     \n\t"
        "xor rax, rax                               \n\t"
        "0:                                         \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]         \n\t"
        "movupd xmm1, xmmword ptr [rdi+rax+16]      \n\t"
        "movupd xmm2, xmmword ptr [rdi+rax+32]      \n\t"
        "movupd xmm3, xmmword ptr [rdi+rax+48]      \n\t"
        "movupd xmm4, xmmword ptr [rdi+rax+64]      \n\t"
        "movupd xmm5, xmmword ptr [rdi+rax+80]      \n\t"
        "movupd xmm6, xmmword ptr [rdi+rax+96]      \n\t"
        "movupd xmm7, xmmword ptr [rdi+rax+112]     \n\t"
        "mulpd  xmm0, xmmword ptr [rsi+rax]         \n\t"
        "mulpd  xmm1, xmmword ptr [rsi+rax+16]      \n\t"
        "mulpd  xmm2, xmmword ptr [rsi+rax+32]      \n\t"
        "mulpd  xmm3, xmmword ptr [rsi+rax+48]      \n\t"
        "mulpd  xmm4, xmmword ptr [rsi+rax+64]      \n\t"
        "mulpd  xmm5, xmmword ptr [rsi+rax+80]      \n\t"
        "mulpd  xmm6, xmmword ptr [rsi+rax+96]      \n\t"
        "mulpd  xmm7, xmmword ptr [rsi+rax+112]     \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0         \n\t"
        "movupd xmmword ptr [rdi+rax+16], xmm1      \n\t"
        "movupd xmmword ptr [rdi+rax+32], xmm2      \n\t"
        "movupd xmmword ptr [rdi+rax+48], xmm3      \n\t"
        "movupd xmmword ptr [rdi+rax+64], xmm4      \n\t"
        "movupd xmmword ptr [rdi+rax+80], xmm5      \n\t"
        "movupd xmmword ptr [rdi+rax+96], xmm6      \n\t"
        "movupd xmmword ptr [rdi+rax+112], xmm7     \n\t"
        "add rax, 128                               \n\t"
        "cmp rax, 8000000                           \n\t"
        "jne 0b                                     \n\t"
        ".att_syntax prefix                         \n\t"
        :
        : "D" (a), "S" (b)
        : "rax", "xmm0", "xmm1", "xmm2", "xmm3",
          "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
    );
}
But it still runs at 1 ms, the same speed as the ordinary C/C++ implementation.
UPDATES
As suggested by answers/comments, I've implemented another way of measuring the execution time:
#include <stdio.h>
#include <stdlib.h>
/* Scalar baseline: multiplies a by b element-wise, in place, for all
 * 1,000,000 entries. */
void mul_c(double* a, double* b)
{
    int idx = 0;
    while (idx != 1000000)
    {
        a[idx] = a[idx] * b[idx];
        ++idx;
    }
}
/*
 * In-place packed multiply: a[i] *= b[i] for 1,000,000 doubles using
 * SSE2 (mulpd processes two doubles per iteration; rax advances 16
 * bytes until 8,000,000 = 1,000,000 * sizeof(double)).
 *
 * Fixes vs. the original:
 *  - string literals use "\n\t" escapes; a raw newline inside a C
 *    string literal is a constraint violation and does not compile,
 *  - ".att_syntax prefix" (not "noprefix") restores the %-prefixed
 *    AT&T syntax used by the compiler-generated asm around the block,
 *  - rax and xmm0 are declared as clobbers so the compiler knows the
 *    template overwrites them.
 */
void mul_asm(double* a, double* b)
{
    asm volatile
    (
        ".intel_syntax noprefix             \n\t"
        "xor rax, rax                       \n\t"
        "0:                                 \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax] \n\t"
        "mulpd  xmm0, xmmword ptr [rsi+rax] \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0 \n\t"
        "add rax, 16                        \n\t"
        "cmp rax, 8000000                   \n\t"
        "jne 0b                             \n\t"
        ".att_syntax prefix                 \n\t"
        :
        : "D" (a), "S" (b)
        : "rax", "xmm0", "memory", "cc"
    );
}
/*
 * 8x-unrolled SSE2 variant of mul_asm: processes 16 doubles (128
 * bytes of each array) per iteration across xmm0-xmm7 to remove the
 * single-register dependency chain.
 *
 * Fixes vs. the original:
 *  - string literals use "\n\t" escapes; a raw newline inside a C
 *    string literal is a constraint violation and does not compile,
 *  - ".att_syntax prefix" (not "noprefix") restores the %-prefixed
 *    AT&T syntax used by the compiler-generated asm around the block,
 *  - rax and xmm0-xmm7 are declared as clobbers.
 */
void mul_asm2(double* a, double* b)
{
    asm volatile
    (
        ".intel_syntax noprefix                     \n\t"
        "xor rax, rax                               \n\t"
        "0:                                         \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]         \n\t"
        "movupd xmm1, xmmword ptr [rdi+rax+16]      \n\t"
        "movupd xmm2, xmmword ptr [rdi+rax+32]      \n\t"
        "movupd xmm3, xmmword ptr [rdi+rax+48]      \n\t"
        "movupd xmm4, xmmword ptr [rdi+rax+64]      \n\t"
        "movupd xmm5, xmmword ptr [rdi+rax+80]      \n\t"
        "movupd xmm6, xmmword ptr [rdi+rax+96]      \n\t"
        "movupd xmm7, xmmword ptr [rdi+rax+112]     \n\t"
        "mulpd  xmm0, xmmword ptr [rsi+rax]         \n\t"
        "mulpd  xmm1, xmmword ptr [rsi+rax+16]      \n\t"
        "mulpd  xmm2, xmmword ptr [rsi+rax+32]      \n\t"
        "mulpd  xmm3, xmmword ptr [rsi+rax+48]      \n\t"
        "mulpd  xmm4, xmmword ptr [rsi+rax+64]      \n\t"
        "mulpd  xmm5, xmmword ptr [rsi+rax+80]      \n\t"
        "mulpd  xmm6, xmmword ptr [rsi+rax+96]      \n\t"
        "mulpd  xmm7, xmmword ptr [rsi+rax+112]     \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0         \n\t"
        "movupd xmmword ptr [rdi+rax+16], xmm1      \n\t"
        "movupd xmmword ptr [rdi+rax+32], xmm2      \n\t"
        "movupd xmmword ptr [rdi+rax+48], xmm3      \n\t"
        "movupd xmmword ptr [rdi+rax+64], xmm4      \n\t"
        "movupd xmmword ptr [rdi+rax+80], xmm5      \n\t"
        "movupd xmmword ptr [rdi+rax+96], xmm6      \n\t"
        "movupd xmmword ptr [rdi+rax+112], xmm7     \n\t"
        "add rax, 128                               \n\t"
        "cmp rax, 8000000                           \n\t"
        "jne 0b                                     \n\t"
        ".att_syntax prefix                         \n\t"
        :
        : "D" (a), "S" (b)
        : "rax", "xmm0", "xmm1", "xmm2", "xmm3",
          "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
    );
}
/*
 * Returns the current time-stamp counter read via RDTSCP.
 *
 * RDTSCP writes the 64-bit TSC into EDX:EAX and IA32_TSC_AUX into ECX,
 * so the template combines rdx:rax into one value and must declare rdx
 * and rcx as clobbers (the original omitted them, which is undefined
 * behavior). The original's "xor rax/rdx" pre-clearing was removed:
 * RDTSCP overwrites both registers unconditionally. String literals
 * are restored with "\n\t" escapes and the syntax is switched back
 * with ".att_syntax prefix".
 */
unsigned long timestamp()
{
    unsigned long a;
    asm volatile
    (
        ".intel_syntax noprefix \n\t"
        "rdtscp                 \n\t"
        "shl rdx, 32            \n\t"
        "or rax, rdx            \n\t"
        ".att_syntax prefix     \n\t"
        : "=a" (a)
        :
        : "rdx", "rcx", "memory", "cc"
    );
    return a;
}
/*
 * Reads 1,000,000 doubles from stdin, times one of the multiply
 * implementations in TSC cycles via timestamp(), and prints the
 * cycle count plus the resulting arrays.
 *
 * Fixes vs. the original: printf format strings use "\n" escapes
 * instead of raw newlines (a constraint violation in C string
 * literals), allocation and scanf results are checked, and the
 * buffers are freed before exit.
 */
int main()
{
    unsigned long t1;
    unsigned long t2;
    double* a;
    double* b;
    a = (double*)malloc(sizeof(double) * 1000000);
    b = (double*)malloc(sizeof(double) * 1000000);
    if (a == NULL || b == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for (int i = 0; i != 1000000; ++i)
    {
        double v;
        if (scanf("%lf", &v) != 1)
        {
            fprintf(stderr, "invalid input at element %d\n", i);
            return 1;
        }
        a[i] = v;
        b[i] = v;
    }
    t1 = timestamp();
    mul_c(a, b);
    //mul_asm(a, b);
    //mul_asm2(a, b);
    t2 = timestamp();
    printf("mul_c: %lu cycles\n", t2 - t1);
    for (int i = 0; i != 1000000; ++i)
    {
        printf("%lf%lf\n", a[i], b[i]);
    }
    free(a);
    free(b);
    return 0;
}
When I run the program with this measurement, I get this result:
mul_c: ~2163971628 cycles
mul_asm: ~2532045184 cycles
mul_asm2: ~5230488 cycles <-- what???
Two things are worth a notice here: first of all, the cycle counts vary a LOT, and I assume that's because of the operating system allowing other processes to run in between. Is there any way to prevent that, or to only count the cycles while my program is executing? Also, mul_asm2
produces identical output compared to the other two, but it is so much faster — how?
I tried Z boson's program on my system together with my 2 implementations and got the following result:
> g++ -O2 -fopenmp main.cpp
> ./a.out
mul time 1.33, 18.08 GB/s
mul_SSE time 1.13, 21.24 GB/s
mul_SSE_NT time 1.51, 15.88 GB/s
mul_SSE_OMP time 0.79, 30.28 GB/s
mul_SSE_v2 time 1.12, 21.49 GB/s
mul_v2 time 1.26, 18.99 GB/s
mul_asm time 1.12, 21.50 GB/s
mul_asm2 time 1.09, 22.08 GB/s
See Question & Answers for more detail.