0
使用Armadillo
我寫了一個矩陣向量乘法程式和一個線性系統求解程式。Armadillo 是從源代碼編譯的,並使用同樣從源代碼編譯的 OpenBLAS。不幸的是,我在單線程和多線程運行時得到了不一致的結果:矩陣向量乘法在單線程上運行得更快,而線性系統求解在多線程時運行得更快。希望有人能指出我哪裡做錯了。
使用 Armadillo 和 OpenBLAS 進行多線程時性能不一致
見下文:
- 源碼
- 編譯&運行bash腳本
- 結果
- 系統信息
matmul_armadillo.cpp
#include <armadillo>
using namespace arma;
int main(int argc, char *argv[])
{
const int n = atoi(argv[1]);
mat A = randu<mat>(n, n);
vec x = randu<vec>(n);
A*x;
return 0;
}
solve_armadillo.cpp
#include <armadillo>
using namespace arma;
int main(int argc, char *argv[])
{
const int n = atoi(argv[1]);
mat A = randu<mat>(n, n);
vec b = randu<vec>(n);
vec x;
x = solve(A, b);
return 0;
}
benchmark.sh
#!/bin/bash
# Build both Armadillo benchmarks, then time each of them with
# OPENBLAS_NUM_THREADS set to 1, 2, ..., MAX_THREADS.
#
# Usage: ./benchmark.sh [N] [MAX_THREADS]
#   N            problem size passed to each benchmark (default: 7500)
#   MAX_THREADS  highest thread count to test (default: 8)

PROGRAMS=(matmul_armadillo solve_armadillo)

# Stop if a build fails; otherwise the loop below would time a stale or
# missing binary.
for prog in "${PROGRAMS[@]}"; do
    g++ "$prog.cpp" -o "$prog" -O3 -march=native -std=c++11 -larmadillo || exit 1
done

N=${1:-7500}
MAX_THREADS=${2:-8}

first_run=1
for ((threads = 1; threads <= MAX_THREADS; threads++)); do
    export OPENBLAS_NUM_THREADS=$threads
    for prog in "${PROGRAMS[@]}"; do
        # Blank line between consecutive runs, none after the last one.
        if [ "$first_run" -eq 0 ]; then
            echo ''
        fi
        first_run=0
        echo "Running $prog on $OPENBLAS_NUM_THREADS threads"
        time "./$prog" "$N"
    done
done
結果
$ ./benchmark.sh
Running matmul_armadillo on 1 threads
real 0m0.943s
user 0m0.628s
sys 0m0.159s
Running solve_armadillo on 1 threads
real 0m13.910s
user 0m13.553s
sys 0m0.300s
Running matmul_armadillo on 2 threads
real 0m1.528s
user 0m1.361s
sys 0m0.402s
Running solve_armadillo on 2 threads
real 0m15.815s
user 0m29.097s
sys 0m1.083s
Running matmul_armadillo on 3 threads
real 0m1.534s
user 0m1.480s
sys 0m0.533s
Running solve_armadillo on 3 threads
real 0m11.729s
user 0m31.022s
sys 0m1.290s
Running matmul_armadillo on 4 threads
real 0m1.543s
user 0m1.619s
sys 0m0.674s
Running solve_armadillo on 4 threads
real 0m10.013s
user 0m34.055s
sys 0m1.696s
Running matmul_armadillo on 5 threads
real 0m1.545s
user 0m1.620s
sys 0m0.664s
Running solve_armadillo on 5 threads
real 0m9.945s
user 0m33.803s
sys 0m1.669s
Running matmul_armadillo on 6 threads
real 0m1.543s
user 0m1.607s
sys 0m0.684s
Running solve_armadillo on 6 threads
real 0m10.069s
user 0m34.283s
sys 0m1.699s
Running matmul_armadillo on 7 threads
real 0m1.542s
user 0m1.622s
sys 0m0.661s
Running solve_armadillo on 7 threads
real 0m10.041s
user 0m34.154s
sys 0m1.704s
Running matmul_armadillo on 8 threads
real 0m1.546s
user 0m1.576s
sys 0m0.712s
Running solve_armadillo on 8 threads
real 0m10.123s
user 0m34.492s
sys 0m1.697s
系統信息
- openSUSE 13.1 64 位
- Armadillo 4.100.2(從源代碼編譯)
- OpenBLAS 0.2.8(從源代碼編譯)
您可能想訪問[OpenBLAS wiki](https://github.com/xianyi/OpenBLAS/issues),因爲您更有可能在那裏獲得回覆 – mtall