MKL + CUDA 编译 MAGMA 以及使用 MAGMA

久闻 MAGMA 大名若干年以后,最近终于要用到它来做测试。记录一下安装和使用的一些步骤。

硬件环境:Xeon E5-1620v2, GTX 1060 3GB
软件环境:Kubuntu 18.04 LTS, Intel Parallel Studio 2018 update 4, CUDA 10.0

下载 MAGMA-2.5.0 的源代码并解压:

1
2
3
4
wget http://icl.utk.edu/projectsfiles/magma/downloads/magma-2.5.0.tar.gz
tar xf magma-2.5.0.tar.gz
mv magma-2.5.0 magma-2.5.0-src
cd magma-2.5.0-src

MAGMA 可以使用下列数学库中的一个 MKL, OpenBLAS, ATLAS, ACML (AMD 数学库), ESSL (IBM 数学库)+ CUDA,需要自己在 make.inc 里设置。我让 MAGMA 使用 MKL 和 CUDA 并且使用 ICC 来编译,需要检测系统环境变量的 $MKLROOT$CUDADIR$MKLROOT 在 module file 里有了,$CUDADIR 我之前没有写在 CUDA 的 module file 里,因此先在 CUDA 的 module file 里加两行:

1
2
prepend-path    CUDADIR          /usr/local/cuda-10.0
prepend-path    CUDA_PATH        /usr/local/cuda-10.0

加载 Intel Parallel Studio 2018 和 CUDA 10:

1
module load intel/2018.4 cuda/10.0

从模板 make.inc 里拷贝一个最接近的出来,按需修改后编译:

1
2
3
4
5
6
7
8
9
cp make.inc-examples/make.inc.mkl-icc ./make.inc
# 我的显卡是 GTX 1060,因此修改第20行为:
# GPU_TARGET ?= Pascal

make -j8
make test -j8

# 编译好以后安装到 ~/magma-2.5.0
make install prefix=~/magma-2.5.0

装好以后创建一个 MAGMA 的 module file (不用 environment module 的可以跳过),以后直接 module load magma/2.5.0

1
2
3
4
5
6
7
#%Module 1.0

conflict	magma
prereq      intel/2018.04 cuda/10.0

prepend-path    MAGMA_PATH       /home/enigma/magma-2.5.0
prepend-path    LD_LIBRARY_PATH  /home/enigma/magma-2.5.0/lib

附上一个简单的 makefile 和测试代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
EXE = MAGMA_sparse_test

HOSTCC = icc

CUDA_PATH       = /usr/local/cuda-10.0
CUDA_INC_FLAGS  = -I$(CUDA_PATH)/include
CUDA_LD_FLAGS   = -L$(CUDA_PATH)/lib64 -lcublas -lcudart -lcusparse

MAGMA_PATH      = $(HOME)/magma-2.5.0
MAGMA_INC_FLAGS = -I$(MAGMA_PATH)/include -I$(MAGMA_PATH)/sparse/include 
MAGMA_LD_FLAGS  = -L$(MAGMA_PATH)/lib -lmagma -lmagma_sparse 

CFLAGS  = -O3 -xHost -qopenmp -g -Wall $(MAGMA_INC_FLAGS) $(CUDA_INC_FLAGS)
LDFLAGS = -O3 -xHost -qopenmp -g -Wall $(MAGMA_LD_FLAGS)  $(CUDA_LD_FLAGS)

OBJS = magma_csr_cg_example.o

EXE: $(OBJS)
	$(HOSTCC) $(OBJS) -o $(EXE) $(LDFLAGS) 

magma_csr_cg_example.o: magma_csr_cg_example.c
	$(HOSTCC) $(CFLAGS) magma_csr_cg_example.c -c

clean:
	rm *.o $(EXE)

这里顺带吐槽一下 MAGMA 的文档,opts.solver_par.solver 有哪些选项居然没有在文档里列出来,还要我去翻头文件……

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
// Modified from magma-2.5.0/example/example_sparse.c
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "magma_v2.h"
#include "magmasparse.h"

int main(int argc, char** argv)
{
    int m = 200, n = 1;
    
    int *row = (int*) calloc(m+1, sizeof(int));
    int *col = (int*) calloc(m,   sizeof(int));
    
    double *val = (double*) calloc(m, sizeof(double));
    double *rhs = (double*) calloc(m, sizeof(double));
    double *sol = (double*) calloc(m, sizeof(double));
    
    // Create a simple diagonal matrix in CSR format and the right-hand side
    for (int i = 0; i < m; ++i) 
    {
        col[i] = i;
        row[i] = i;
        val[i] = 3.0;
        rhs[i] = 3.0;
        sol[i] = 0.0;
    }
    row[m] = m;
    
    // Initialize MAGMA
    magma_init();
    magma_dopts   opts;
    magma_queue_t queue;
    magma_queue_create(0, &queue);
    
    magma_d_matrix A={Magma_CSR}, dA={Magma_CSR};
    magma_d_matrix b={Magma_CSR}, db={Magma_CSR};
    magma_d_matrix x={Magma_CSR}, dx={Magma_CSR};
    
    // Pass the system to MAGMA
    magma_dcsrset(m, m, row, col, val, &A, queue);
    magma_dvset(m, 1, rhs, &b, queue);
    magma_dvset(m, 1, sol, &x, queue);
    
    // Setup MAGMA solver
    opts.solver_par.solver  = Magma_CG;
    opts.solver_par.maxiter = 1000;
    opts.solver_par.rtol    = 1e-10;
    
    // Initialize MAGMA solver
    magma_dsolverinfo_init(&opts.solver_par, &opts.precond_par, queue);
    
    // Copy the system to the device (optional, only necessary if using the GPU)
    magma_dmtransfer(A, &dA, Magma_CPU, Magma_DEV, queue);
    magma_dmtransfer(b, &db, Magma_CPU, Magma_DEV, queue);
    magma_dmtransfer(x, &dx, Magma_CPU, Magma_DEV, queue);

    // Solve the linear system
    magma_d_solver(dA, db, &dx, &opts, queue);

    // Copy the solution back to the host
    magma_dmfree(&x, queue);
    magma_dmtransfer(dx, &x, Magma_CPU, Magma_DEV, queue);
    
    // Copy the solution in MAGMA host structure to the application code
    magma_dvget(x, &m, &n, &sol, queue);
    
    // Check the results
    int correct = 1;
    for (int i = 0; i < m; i++)
        if (fabs(sol[i] - 1.0) / 1.0 > 1e-10) correct = 0;
    if (correct) printf("MAGMA solution is correct\n");
    else printf("MAGMA solution is wrong\n");
    
    // Free the allocated memory and finalize MAGMA
    magma_dmfree(&dx, queue);
    magma_dmfree(&db, queue);
    magma_dmfree(&dA, queue);
    magma_queue_destroy(queue);
    magma_finalize();
    
    return 0;
}