RVV的性能测试


进行数组上的数据相加合并

代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <riscv_vector.h>
#include <time.h>

/*
 * Scalar reference implementation of the element-wise array sum.
 *
 * For every index i in [0, n) stores a[i] + b[i] into c[i].
 * Elements are processed in ascending index order.
 */
void array_add_normal(size_t n, const float *a, const float *b, float *c) {
    const float *lhs = a;
    const float *rhs = b;
    float *out = c;

    for (size_t remaining = n; remaining != 0; --remaining) {
        *out++ = *lhs++ + *rhs++;
    }
}

/*
 * RVV implementation of the element-wise array sum: c[i] = a[i] + b[i]
 * for i in [0, n).
 *
 * The arrays are processed in strips. On every pass vsetvl (32-bit elements,
 * LMUL = 8) returns how many elements may be handled, and all three pointers
 * advance by that amount.
 */
void array_add_rvv(size_t n, const float *a, const float *b, float *c) {
    size_t remaining = n;

    while (remaining > 0) {
        // Element count for this strip.
        size_t vl = __riscv_vsetvl_e32m8(remaining);

        vfloat32m8_t lhs = __riscv_vle32_v_f32m8(a, vl);        // load strip of a
        vfloat32m8_t rhs = __riscv_vle32_v_f32m8(b, vl);        // load strip of b
        vfloat32m8_t sum = __riscv_vfadd_vv_f32m8(lhs, rhs, vl); // vector add
        __riscv_vse32_v_f32m8(c, sum, vl);                      // store strip to c

        a += vl;
        b += vl;
        c += vl;
        remaining -= vl;
    }
}

/*
 * Benchmark driver for the array addition test.
 *
 * Allocates four 32-byte aligned float arrays, runs the scalar and the RVV
 * addition on the same inputs, prints the elapsed time of each call and then
 * compares the two result arrays element by element.
 *
 * Returns 0 on success, 1 if an allocation fails or the results differ.
 */
int main() {
    const size_t N = 1024; // number of float elements per array
    float *a = NULL, *b = NULL, *c = NULL, *d = NULL;
    float **slots[] = {&a, &b, &c, &d};
    struct timespec start, end;
    long seconds, nanoseconds;
    int status = 0;

    // posix_memalign reports failure only through its return value, so each
    // call is checked. A void* temporary avoids casting float** to void**.
    for (size_t i = 0; i < sizeof slots / sizeof slots[0]; i++) {
        void *mem = NULL;
        if (posix_memalign(&mem, 32, N * sizeof(float)) != 0) {
            fprintf(stderr, "posix_memalign failed\n");
            status = 1;
            goto cleanup;
        }
        *slots[i] = mem;
    }

    // Initialize the inputs.
    for (size_t i = 0; i < N; i++) {
        a[i] = i * 1.0f;
        b[i] = i * 2.0f;
    }

    // Scalar addition. CLOCK_MONOTONIC is used because an interval is being
    // measured; CLOCK_REALTIME may be stepped while the benchmark runs.
    clock_gettime(CLOCK_MONOTONIC, &start);
    array_add_normal(N, a, b, c);
    clock_gettime(CLOCK_MONOTONIC, &end);
    seconds = end.tv_sec - start.tv_sec;
    nanoseconds = end.tv_nsec - start.tv_nsec;
    if (nanoseconds < 0) {
        // Borrow one second so the nanosecond part stays non-negative.
        --seconds;
        nanoseconds += 1000000000;
    }
    printf("Elapsed time for normal add: %ld.%09ld seconds\n", seconds, nanoseconds);

    // RVV addition.
    clock_gettime(CLOCK_MONOTONIC, &start);
    array_add_rvv(N, a, b, d);
    clock_gettime(CLOCK_MONOTONIC, &end);
    seconds = end.tv_sec - start.tv_sec;
    nanoseconds = end.tv_nsec - start.tv_nsec;
    if (nanoseconds < 0) {
        --seconds;
        nanoseconds += 1000000000;
    }
    printf("Elapsed time for RVV add: %ld.%09ld seconds\n", seconds, nanoseconds);

    // Spot check of the first element.
    printf("c[0]: %f, d[0]: %f\n", c[0], d[0]);

    // Full comparison. Both versions perform one single-precision addition
    // per element, so the results are expected to match exactly.
    for (size_t i = 0; i < N; i++) {
        if (c[i] != d[i]) {
            printf("mismatch at %zu: %f != %f\n", i, c[i], d[i]);
            status = 1;
            break;
        }
    }

cleanup:
    // free(NULL) is a no-op, so buffers that were never allocated are fine.
    free(a);
    free(b);
    free(c);
    free(d);

    return status;
}

qemu测试结果

| 函数 | -O | -O1 | -O2 | -O3 |
| --- | --- | --- | --- | --- |
| normal add | 0.000056136 seconds | 0.000057337 seconds | 0.000139442 seconds | 0.000212970 seconds |
| RVV add | 0.000126027 seconds | 0.000147346 seconds | 0.000139222 seconds | 0.000095841 seconds |

实机测试结果

| 函数 | -O | -O1 | -O2 | -O3 |
| --- | --- | --- | --- | --- |
| normal add | 0.000000500 seconds | 0.000000500 seconds | 0.000012375 | 0.000008125 |
| RVV add | 0.000007459 seconds | 0.0006959 seconds | 0.000000833 | 0.000000667 |

总结

在简单的数组加法上,QEMU 中提高编译器优化级别(-O2/-O3)后,常规函数的运行时间反而变长:优化后的常规函数很可能被编译器自动向量化,而 QEMU 模拟向量指令的开销很大。数据量不大时,未经优化的常规标量版本甚至比优化后的 RVV 版本还要快。

线性运算

代码如下:
#include <riscv_vector.h>
#include <stddef.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
// Number of elements in each of the test vectors below.
#define N 31

// x operand of the saxpy test: main passes this array as `x` to both
// saxpy_golden and saxpy_vec. The literals are written with more digits than
// a float can hold, so they are rounded when the array is initialized.
float input[N] = {-0.4325648115282207, -1.6655843782380970, 0.1253323064748307,
0.2876764203585489, -1.1464713506814637, 1.1909154656429988,
1.1891642016521031, -0.0376332765933176, 0.3272923614086541,
0.1746391428209245, -0.1867085776814394, 0.7257905482933027,
-0.5883165430141887, 2.1831858181971011, -0.1363958830865957,
0.1139313135208096, 1.0667682113591888, 0.0592814605236053,
-0.0956484054836690, -0.8323494636500225, 0.2944108163926404,
-1.3361818579378040, 0.7143245518189522, 1.6235620644462707,
-0.6917757017022868, 0.8579966728282626, 1.2540014216025324,
-1.5937295764474768, -1.4409644319010200, 0.5711476236581780,
-0.3998855777153632};

// y operand for the scalar reference: main passes this array to saxpy_golden,
// which updates it in place. Its initial contents are the same values as the
// initial contents of `output`.
float output_golden[N] = {
1.7491401329284098, 0.1325982188803279, 0.3252281811989881,
-0.7938091410349637, 0.3149236145048914, -0.5272704888029532,
0.9322666565031119, 1.1646643544607362, -2.0456694357357357,
-0.6443728590041911, 1.7410657940825480, 0.4867684246821860,
1.0488288293660140, 1.4885752747099299, 1.2705014969484090,
-1.8561241921210170, 2.1343209047321410, 1.4358467535865909,
-0.9173023332875400, -1.1060770780029008, 0.8105708062681296,
0.6985430696369063, -0.4015827425012831, 1.2687512030669628,
-0.7836083053674872, 0.2132664971465569, 0.7878984786088954,
0.8966819356782295, -0.1869172943544062, 1.0131816724341454,
0.2484350696132857};

// y operand for the vector version: main passes this array to saxpy_vec,
// which updates it in place. It starts with the same values as
// `output_golden`, so the two results can be compared afterwards.
float output[N] = {
1.7491401329284098, 0.1325982188803279, 0.3252281811989881,
-0.7938091410349637, 0.3149236145048914, -0.5272704888029532,
0.9322666565031119, 1.1646643544607362, -2.0456694357357357,
-0.6443728590041911, 1.7410657940825480, 0.4867684246821860,
1.0488288293660140, 1.4885752747099299, 1.2705014969484090,
-1.8561241921210170, 2.1343209047321410, 1.4358467535865909,
-0.9173023332875400, -1.1060770780029008, 0.8105708062681296,
0.6985430696369063, -0.4015827425012831, 1.2687512030669628,
-0.7836083053674872, 0.2132664971465569, 0.7878984786088954,
0.8966819356782295, -0.1869172943544062, 1.0131816724341454,
0.2484350696132857};

/*
 * Scalar reference saxpy.
 *
 * For every index i in [0, n) replaces y[i] with a * x[i] + y[i].
 * y is both an input and the output; x is only read.
 */
void saxpy_golden(size_t n, const float a, const float *x, float *y) {
    size_t idx = 0;

    while (idx < n) {
        y[idx] = a * x[idx] + y[idx];
        ++idx;
    }
}

// reference https://github.com/riscv/riscv-v-spec/blob/master/example/saxpy.s
/*
 * RVV saxpy: y[i] = a * x[i] + y[i] for i in [0, n), updated in place.
 *
 * The vectors are processed in strips whose length is chosen by vsetvl
 * (32-bit elements, LMUL = 8). Each strip loads x and y, applies a
 * vector-scalar multiply-accumulate into the y strip and stores it back.
 */
void saxpy_vec(size_t n, const float a, const float *x, float *y) {
    size_t remaining = n;

    while (remaining > 0) {
        size_t vl = __riscv_vsetvl_e32m8(remaining);

        vfloat32m8_t x_strip = __riscv_vle32_v_f32m8(x, vl);
        vfloat32m8_t y_strip = __riscv_vle32_v_f32m8(y, vl);

        // y_strip accumulates a * x_strip.
        y_strip = __riscv_vfmacc_vf_f32m8(y_strip, a, x_strip, vl);
        __riscv_vse32_v_f32m8(y, y_strip, vl);

        x += vl;
        y += vl;
        remaining -= vl;
    }
}

/*
 * Approximate float comparison.
 *
 * The allowed error is relErr * |reference|. When |reference| is not greater
 * than relErr, the tolerance falls back to relErr * relErr so that values
 * near zero are compared with an absolute bound.
 *
 * Returns 1 when |actual - reference| is strictly below the tolerance,
 * otherwise 0.
 */
int fp_eq(float reference, float actual, float relErr)
{
    const float magnitude = fabsf(reference);
    float scale = relErr;

    if (magnitude > relErr) {
        scale = magnitude;
    }

    const float absErr = relErr * scale;
    const float delta = fabsf(actual - reference);

    return delta < absErr;
}


/*
 * Benchmark driver for the saxpy test.
 *
 * Runs the scalar reference on output_golden and the RVV version on output
 * (both arrays start with the same contents), prints the elapsed time of each
 * call and then compares the results element by element with fp_eq.
 *
 * Returns 0 when every element matches, 1 otherwise.
 */
int main() {
    struct timespec start, end;
    long seconds, nanoseconds;

    // CLOCK_MONOTONIC is used because an interval is being measured;
    // CLOCK_REALTIME may be stepped while the benchmark runs.
    clock_gettime(CLOCK_MONOTONIC, &start);
    saxpy_golden(N, 55.66, input, output_golden);
    clock_gettime(CLOCK_MONOTONIC, &end);
    seconds = end.tv_sec - start.tv_sec;
    nanoseconds = end.tv_nsec - start.tv_nsec;
    if (nanoseconds < 0) {
        // Borrow one second so the nanosecond part stays non-negative.
        --seconds;
        nanoseconds += 1000000000;
    }
    printf("Elapsed time for golden saxpy: %ld.%09ld seconds\n", seconds, nanoseconds);

    clock_gettime(CLOCK_MONOTONIC, &start);
    saxpy_vec(N, 55.66, input, output);
    clock_gettime(CLOCK_MONOTONIC, &end);
    seconds = end.tv_sec - start.tv_sec;
    nanoseconds = end.tv_nsec - start.tv_nsec;
    if (nanoseconds < 0) {
        --seconds;
        nanoseconds += 1000000000;
    }
    printf("Elapsed time for RVV saxpy: %ld.%09ld seconds\n", seconds, nanoseconds);

    // Compare all elements; keep going after a mismatch so every failing
    // element is reported.
    int pass = 1;
    for (int i = 0; i < N; i++) {
        if (!fp_eq(output_golden[i], output[i], 1e-6)) {
            printf("failed at %d: %f != %f\n", i, output_golden[i], output[i]);
            pass = 0;
        }
    }
    if (pass) {
        printf("passed\n");
    }
    return (pass == 0);
}

qemu测试结果

| RVV(第 n 次运行) | -O | -O1 | -O2 | -O3 |
| --- | --- | --- | --- | --- |
| 1 | 0.000066337 seconds | 0.000063587 seconds | 0.000053899 seconds | 0.000044271 seconds |
| 2 | 0.000063456 seconds | 0.000076642 seconds | 0.000063870 seconds | 0.000059106 seconds |
| 3 | 0.000071483 seconds | 0.000076432 seconds | 0.000051195 seconds | 0.000044282 seconds |
| 4 | 0.000065239 seconds | 0.000072144 seconds | 0.000046892 seconds | 0.000049881 seconds |

| normal(第 n 次运行) | -O | -O1 | -O2 | -O3 |
| --- | --- | --- | --- | --- |
| 1 | 0.000061757 seconds | 0.000055580 seconds | 0.000141593 seconds | 0.000122429 seconds |
| 2 | 0.000055796 seconds | 0.000055420 seconds | 0.000138452 seconds | 0.000122429 seconds |

总结

在线性运算(saxpy)上,开启编译器优化后,RVV 版本的运行时间有所缩短;优化后的 RVV 版本比常规函数更快,运行时间少 20~28%。

矩阵运算

代码如下:
#include "common.h"
#include <riscv_vector.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Releases a matrix stored as an array of `rows` separately allocated rows:
 * every row buffer is freed first, then the array of row pointers itself.
 *
 * A NULL `array` is accepted and ignored, mirroring free(NULL), so the
 * function can be called on a matrix that was never allocated.
 */
void free_array_2d(double **array, int rows) {
    if (array == NULL) {
        return;
    }
    for (int i = 0; i < rows; ++i) {
        free(array[i]); // release one row; free(NULL) is a no-op
    }
    free(array); // release the array of row pointers
}

/*
 * Scalar reference ("golden") matrix product.
 *
 * Each output element is the dot product of row i of a with row j of b:
 *     c[i][j] = sum over k in [0, o) of a[i][k] * b[j][k]
 * for i in [0, n) and j in [0, m). The element is zeroed first and then
 * accumulated in place.
 */
void matmul_golden(double **a, double **b, double **c, int n, int m, int o) {
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < m; ++col) {
            double *cell = &c[row][col];
            const double *lhs = a[row];
            const double *rhs = b[col];

            *cell = 0;
            for (int k = 0; k < o; ++k) {
                *cell += lhs[k] * rhs[k];
            }
        }
    }
}

/*
 * RVV implementation of the same product as matmul_golden:
 *     c[i][j] = sum over k in [0, o) of a[i][k] * b[j][k]
 *
 * For every output element the two rows are walked in strips. Partial
 * products are accumulated lane-wise in vec_s, and after the last strip the
 * lanes are summed with a reduction over vlmax lanes.
 */
void matmul(double **a, double **b, double **c, int n, int m, int o) {
    size_t vlmax = __riscv_vsetvlmax_e64m1();
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            double *ptr_a = &a[i][0];
            double *ptr_b = &b[j][0];
            int k = o;
            // Lane-wise accumulator and the scalar seed of the reduction.
            vfloat64m1_t vec_s = __riscv_vfmv_v_f_f64m1(0, vlmax);
            vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
            for (size_t vl; k > 0; k -= vl, ptr_a += vl, ptr_b += vl) {
                vl = __riscv_vsetvl_e64m1(k);

                vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(ptr_a, vl);
                vfloat64m1_t vec_b = __riscv_vle64_v_f64m1(ptr_b, vl);

                // The last strip may use vl < vlmax while the reduction below
                // reads all vlmax lanes. The tail-undisturbed (_tu) variant
                // keeps the lanes beyond vl intact; the non-policy intrinsic
                // is tail-agnostic and does not guarantee that.
                vec_s = __riscv_vfmacc_vv_f64m1_tu(vec_s, vec_a, vec_b, vl);
            }

            // Sum all accumulator lanes and read the scalar result.
            vfloat64m1_t vec_sum =
                __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax);
            c[i][j] = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
        }
    }
}

/*
 * Prints the interval between two timestamps.
 *
 * Output format: "<label> elapsed time: <sec>.<nsec, 9 digits> seconds".
 * When the nanosecond difference is negative, one second is borrowed so the
 * fractional part is printed as a non-negative value.
 */
void calc_elapsed_time(const char *label, struct timespec start, struct timespec end) {
    long sec_diff = end.tv_sec - start.tv_sec;
    long nsec_diff = end.tv_nsec - start.tv_nsec;

    if (nsec_diff < 0) {
        sec_diff -= 1;
        nsec_diff += 1000000000;
    }

    printf("%s elapsed time: %ld.%09ld seconds\n", label, sec_diff, nsec_diff);
}

/*
 * Benchmark driver for the matrix product test.
 *
 * Fills two random matrices, runs the scalar and the RVV product, prints the
 * elapsed time of each phase and compares the two results.
 * alloc_array_2d, gen_rand_2d and compare_2d are not defined in this file;
 * presumably they come from common.h.
 *
 * Returns 0 when the results match ("pass"), 1 otherwise ("fail").
 */
int main() {
    const int N = 512; // rows of matrix A
    const int M = 512; // rows of matrix B (equals the columns of C)
    const int O = 512; // columns of matrix A (equals the columns of B)
    uint32_t seed = 0xdeadbeef;
    srand(seed);

    // Allocate the inputs and fill them with random data.
    double **A = alloc_array_2d(N, O);
    double **B = alloc_array_2d(M, O);

    // CLOCK_MONOTONIC is used for all measurements because intervals are
    // being timed; CLOCK_REALTIME may be stepped while the benchmark runs.
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    gen_rand_2d(A, N, O);
    gen_rand_2d(B, M, O);
    clock_gettime(CLOCK_MONOTONIC, &end);

    calc_elapsed_time("Data generation", start, end);

    // Result buffers.
    double **golden = alloc_array_2d(N, M);
    double **actual = alloc_array_2d(N, M);

    // Scalar reference.
    struct timespec start1, end1;
    clock_gettime(CLOCK_MONOTONIC, &start1);
    matmul_golden(A, B, golden, N, M, O);
    clock_gettime(CLOCK_MONOTONIC, &end1);
    calc_elapsed_time("Golden implementation", start1, end1);

    // RVV implementation.
    struct timespec start2, end2;
    clock_gettime(CLOCK_MONOTONIC, &start2);
    matmul(A, B, actual, N, M, O);
    clock_gettime(CLOCK_MONOTONIC, &end2);
    calc_elapsed_time("Vector implementation", start2, end2);

    // Compare the results and remember the verdict for the exit status.
    int ok = compare_2d(golden, actual, N, M) ? 1 : 0;
    puts(ok ? "pass" : "fail");

    // Release all matrices.
    free_array_2d(A, N);
    free_array_2d(B, M);
    free_array_2d(golden, N);
    free_array_2d(actual, N);

    return ok ? 0 : 1;
}

实际机器测试结果

| 实现 | -O | -O1 | -O2 | -O3 |
| --- | --- | --- | --- | --- |
| normal | 0.58 | 0.66 | 0.46 | 0.45 |
| rvv | 0.33 | 0.34 | 0.34 | 0.33 |

qemu可以执行RVV的指令,但是开销很大,实机的运行效率更高。

目前在矩阵运算上,编译器自动向量化的效率慢于手动编写的向量化 Intrinsic;不过编译器的自动向量化对程序的运行效率仍然是有提升的。

| -O2 | strcpy | strncpy | strlen |
| --- | --- | --- | --- |
| 常规(标量) | 0.000002333 | 0.000003750 | 0.000000959 |
| 向量 | 0.000010833 | 0.000001125 | 0.000007666 |

总结:编译器GCC或者LLVM可以对程序进行自动向量化的优化,但是因为目前的编译器版本对自动向量化的支持还不完善,通过测试结果可看到实际机器上跑Intrinsic的向量化优化的效率要比编译器的优化更高。

性能影响因素

内存对齐:

代码如下:
#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 10 // matrix dimension: every matrix in this program is N x N

/*
 * RVV matrix addition, "unaligned" variant: matC = matA + matB.
 *
 * All three matrices are N x N, stored row-major in flat int32_t arrays.
 * Every row is processed in strips sized by vsetvl (32-bit elements,
 * LMUL = 1). The loads and the store are plain vle32/vse32 accesses; no
 * alignment hint is given to the compiler.
 */
void matrix_add_unaligned(const int32_t* matA, const int32_t* matB, int32_t* matC) {
    for (size_t row = 0; row < N; row++) {
        size_t col = 0;

        while (col < N) {
            size_t vl = __riscv_vsetvl_e32m1(N - col);
            size_t offset = row * N + col; // flat index of the strip start

            // Load one strip of each operand.
            vint32m1_t stripA = __riscv_vle32_v_i32m1(matA + offset, vl);
            vint32m1_t stripB = __riscv_vle32_v_i32m1(matB + offset, vl);

            // Element-wise add, then store the strip.
            vint32m1_t stripC = __riscv_vadd_vv_i32m1(stripA, stripB, vl);
            __riscv_vse32_v_i32m1(matC + offset, stripC, vl);

            col += vl;
        }
    }
}

/*
 * RVV matrix addition, "aligned" variant: matC = matA + matB.
 *
 * Same algorithm as matrix_add_unaligned, except that every strip pointer is
 * passed through __builtin_assume_aligned(ptr, 4) before the load or store.
 * NOTE(review): a 4-byte hint looks identical to the natural alignment of
 * int32_t, so it may not give the compiler any extra information; confirm
 * whether a larger alignment was intended.
 */
void matrix_add_aligned(const int32_t* matA, const int32_t* matB, int32_t* matC) {
    for (size_t row = 0; row < N; row++) {
        size_t col = 0;

        while (col < N) {
            size_t vl = __riscv_vsetvl_e32m1(N - col);
            size_t offset = row * N + col; // flat index of the strip start

            // Loads through pointers carrying the alignment hint.
            vint32m1_t stripA = __riscv_vle32_v_i32m1(
                (int32_t*)__builtin_assume_aligned(matA + offset, 4), vl);
            vint32m1_t stripB = __riscv_vle32_v_i32m1(
                (int32_t*)__builtin_assume_aligned(matB + offset, 4), vl);

            // Element-wise add.
            vint32m1_t stripC = __riscv_vadd_vv_i32m1(stripA, stripB, vl);

            // Store through a pointer carrying the alignment hint.
            __riscv_vse32_v_i32m1(
                (int32_t*)__builtin_assume_aligned(matC + offset, 4), stripC, vl);

            col += vl;
        }
    }
}

// Prints an N x N matrix stored row-major at `flat`: each element is followed
// by a space and each row ends with a newline.
static void print_matrix_rows(const int32_t *flat) {
    for (size_t idx = 0; idx < (size_t)N * N; idx++) {
        printf("%d ", flat[idx]);
        if ((idx + 1) % N == 0) {
            printf("\n");
        }
    }
}

/*
 * Driver for the alignment experiment.
 *
 * Fills matA with i + j and matB with i - j, runs both addition variants,
 * prints both result matrices and the time of each call measured with
 * clock().
 * NOTE(review): each variant is timed over a single call on a 10 x 10 matrix;
 * that may be below the resolution of clock() - confirm before relying on
 * the printed times.
 */
int main() {
    int32_t matA[N][N] __attribute__((aligned(4))) = {0};
    int32_t matB[N][N] __attribute__((aligned(4))) = {0};
    int32_t matC_unaligned[N][N] = {0};
    int32_t matC_aligned[N][N] = {0};

    // Initialize the operands.
    for (size_t i = 0; i < N; i++) {
        for (size_t j = 0; j < N; j++) {
            matA[i][j] = i + j;
            matB[i][j] = i - j;
        }
    }

    // Time the variant without alignment hints.
    clock_t unaligned_begin = clock();
    matrix_add_unaligned(&matA[0][0], &matB[0][0], &matC_unaligned[0][0]);
    clock_t unaligned_end = clock();

    // Time the variant with alignment hints.
    clock_t aligned_begin = clock();
    matrix_add_aligned(&matA[0][0], &matB[0][0], &matC_aligned[0][0]);
    clock_t aligned_end = clock();

    double time_unaligned = (double)(unaligned_end - unaligned_begin) / CLOCKS_PER_SEC;
    double time_aligned = (double)(aligned_end - aligned_begin) / CLOCKS_PER_SEC;

    // Result of the variant without alignment hints.
    printf("未对齐计算结果:\n");
    print_matrix_rows(&matC_unaligned[0][0]);

    // Result of the variant with alignment hints.
    printf("\n对齐计算结果:\n");
    print_matrix_rows(&matC_aligned[0][0]);

    // Timing comparison.
    printf("\n未对齐计算时间: %.6f 秒\n", time_unaligned);
    printf("对齐计算时间: %.6f 秒\n", time_aligned);

    return 0;
}

屏幕截图 2024 11 25 151948