MKL + CUDA 编译 MAGMA 以及使用 MAGMA

久闻 MAGMA 大名若干年以后,最近终于要用到它来做测试。记录一下安装和使用的一些步骤。

硬件环境:Xeon E5-1620v2, GTX 1060 3GB
软件环境:Kubuntu 18.04 LTS, Intel Parallel Studio 2018 update 4, CUDA 10.0

下载 MAGMA-2.5.0 的源代码并解压:

1
2
3
4
wget http://icl.utk.edu/projectsfiles/magma/downloads/magma-2.5.0.tar.gz
tar xf magma-2.5.0.tar.gz
mv magma-2.5.0 magma-2.5.0-src
cd magma-2.5.0-src

MAGMA 可以使用下列数学库中的一个 MKL, OpenBLAS, ATLAS, ACML (AMD 数学库), ESSL (IBM 数学库)+ CUDA,需要自己在 make.inc 里设置。我让 MAGMA 使用 MKL 和 CUDA 并且使用 ICC 来编译,需要检测系统环境变量的 $MKLROOT$CUDADIR$MKLROOT 在 module file 里有了,$CUDADIR 我之前没有写在 CUDA 的 module file 里,因此先在 CUDA 的 module file 里加两行:

1
2
prepend-path CUDADIR /usr/local/cuda-10.0
prepend-path CUDA_PATH /usr/local/cuda-10.0

加载 Intel Parallel Studio 2018 和 CUDA 10:

1
module load intel/2018.4 cuda/10.0

从模板 make.inc 里拷贝一个最接近的出来,按需修改后编译:

1
2
3
4
5
6
7
8
9
cp make.inc-examples/make.inc.mkl-icc ./make.inc
# 我的显卡是 GTX 1060,因此修改第20行为:
# GPU_TARGET ?= Pascal
make -j8
make test -j8
# 编译好以后安装到 ~/magma-2.5.0
make install prefix=~/magma-2.5.0

装好以后创建一个 MAGMA 的 module file (不用 environment module 的可以跳过),以后直接 module load magma/2.5.0

1
2
3
4
5
6
7
#%Module 1.0
conflict magma
prereq intel/2018.04 cuda/10.0
prepend-path MAGMA_PATH /home/enigma/magma-2.5.0
prepend-path LD_LIBRARY_PATH /home/enigma/magma-2.5.0/lib

附上一个简单的 makefile 和测试代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
EXE = MAGMA_sparse_test
HOSTCC = icc
CUDA_PATH = /usr/local/cuda-10.0
CUDA_INC_FLAGS = -I$(CUDA_PATH)/include
CUDA_LD_FLAGS = -L$(CUDA_PATH)/lib64 -lcublas -lcudart -lcusparse
MAGMA_PATH = $(HOME)/magma-2.5.0
MAGMA_INC_FLAGS = -I$(MAGMA_PATH)/include -I$(MAGMA_PATH)/sparse/include
MAGMA_LD_FLAGS = -L$(MAGMA_PATH)/lib -lmagma -lmagma_sparse
CFLAGS = -O3 -xHost -qopenmp -g -Wall $(MAGMA_INC_FLAGS) $(CUDA_INC_FLAGS)
LDFLAGS = -O3 -xHost -qopenmp -g -Wall $(MAGMA_LD_FLAGS) $(CUDA_LD_FLAGS)
OBJS = magma_csr_cg_example.o
EXE: $(OBJS)
$(HOSTCC) $(OBJS) -o $(EXE) $(LDFLAGS)
magma_csr_cg_example.o: magma_csr_cg_example.c
$(HOSTCC) $(CFLAGS) magma_csr_cg_example.c -c
clean:
rm *.o $(EXE)

这里顺带吐槽一下 MAGMA 的文档,opts.solver_par.solver 有哪些选项居然没有在文档里列出来,还要我去翻头文件……

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
// Modified from magma-2.5.0/example/example_sparse.c
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "magma_v2.h"
#include "magmasparse.h"
int main(int argc, char** argv)
{
int m = 200, n = 1;
int *row = (int*) calloc(m+1, sizeof(int));
int *col = (int*) calloc(m, sizeof(int));
double *val = (double*) calloc(m, sizeof(double));
double *rhs = (double*) calloc(m, sizeof(double));
double *sol = (double*) calloc(m, sizeof(double));
// Create a simple diagonal matrix in CSR format and the right-hand side
for (int i = 0; i < m; ++i)
{
col[i] = i;
row[i] = i;
val[i] = 3.0;
rhs[i] = 3.0;
sol[i] = 0.0;
}
row[m] = m;
// Initialize MAGMA
magma_init();
magma_dopts opts;
magma_queue_t queue;
magma_queue_create(0, &queue);
magma_d_matrix A={Magma_CSR}, dA={Magma_CSR};
magma_d_matrix b={Magma_CSR}, db={Magma_CSR};
magma_d_matrix x={Magma_CSR}, dx={Magma_CSR};
// Pass the system to MAGMA
magma_dcsrset(m, m, row, col, val, &A, queue);
magma_dvset(m, 1, rhs, &b, queue);
magma_dvset(m, 1, sol, &x, queue);
// Setup MAGMA solver
opts.solver_par.solver = Magma_CG;
opts.solver_par.maxiter = 1000;
opts.solver_par.rtol = 1e-10;
// Initialize MAGMA solver
magma_dsolverinfo_init(&opts.solver_par, &opts.precond_par, queue);
// Copy the system to the device (optional, only necessary if using the GPU)
magma_dmtransfer(A, &dA, Magma_CPU, Magma_DEV, queue);
magma_dmtransfer(b, &db, Magma_CPU, Magma_DEV, queue);
magma_dmtransfer(x, &dx, Magma_CPU, Magma_DEV, queue);
// Solve the linear system
magma_d_solver(dA, db, &dx, &opts, queue);
// Copy the solution back to the host
magma_dmfree(&x, queue);
magma_dmtransfer(dx, &x, Magma_CPU, Magma_DEV, queue);
// Copy the solution in MAGMA host structure to the application code
magma_dvget(x, &m, &n, &sol, queue);
// Check the results
int correct = 1;
for (int i = 0; i < m; i++)
if (fabs(sol[i] - 1.0) / 1.0 > 1e-10) correct = 0;
if (correct) printf("MAGMA solution is correct\n");
else printf("MAGMA solution is wrong\n");
// Free the allocated memory and finalize MAGMA
magma_dmfree(&dx, queue);
magma_dmfree(&db, queue);
magma_dmfree(&dA, queue);
magma_queue_destroy(queue);
magma_finalize();
return 0;
}