Eigen: slow matrix-vector product

Date: 2017-10-02 08:47:59

Tags: eigen

I tried to run a very simple (sparse) matrix-vector product (V0) on my laptop, but it is very slow, while a naive implementation (V1) is very fast:

>> g++ -march=native -O3 -ftree-vectorize -funroll-loops -ffast-math -fstrict-aliasing -o matVecProdV0.exe matVecProdV0.cpp -I /path/to/eigen-eigen-5a0156e40feb/local/include/eigen3 -mavx -fopenmp
>> g++ -march=native -O3 -ftree-vectorize -funroll-loops -ffast-math -fstrict-aliasing -o matVecProdV1.exe matVecProdV1.cpp

>>  ./matVecProdV0.exe 10000 100
    134536 ms
>>  ./matVecProdV1.exe 10000 100
    498 ms 

What am I missing?

My laptop has 4 logical processors (2 cores + hyperthreading). I am using an up-to-date Debian testing with g++-7.2.

>> cat /proc/cpuinfo
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm cpuid_fault epb tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt dtherm ida arat pln pts

The code is:

>> more *.cpp
::::::::::::::
matVecProdV0.cpp
::::::::::::::
#include <iostream>
#include <string>
#include <sstream>
#include <chrono>
#include <cmath>
#include <cstdlib> // rand.
#include <Eigen/Sparse>
#include <vector>

using namespace std;

#define AVE 75000

int main(int argc, char ** argv) {
  if (argc != 3 || !argv) return 1;

  size_t n = 0; stringstream sn(argv[1]); sn >> n; if (n <= 0)          return 1;
  size_t p = 0; stringstream sp(argv[2]); sp >> p; if (p <= 0 || p > n) return 1;

  vector<Eigen::Triplet<double>> ijAij;
  ijAij.reserve(n*p); // We have p values per row.
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < p; j++) { // We have p values per row.
      ijAij.push_back(Eigen::Triplet<double> (i, rand()%n, 1.)); // Get column in the range 0 to n-1.
    }
  }
  Eigen::SparseMatrix<double> mat(n, n);
  mat.reserve(n*p); // We have p values per row.
  mat.setFromTriplets(ijAij.begin(), ijAij.end());

  Eigen::VectorXd vec(n);
  for (size_t i = 0; i < n; i++) vec(i) = 1.;

  Eigen::VectorXd res(n);
  for (size_t i = 0; i < n; i++) res(i) = 0.;

  auto start = chrono::high_resolution_clock::now();
  for (size_t a = 0; a < AVE; a++) { // Average.
    res += mat*vec;
  }
  auto end = chrono::high_resolution_clock::now();
  cout << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << flush;

  return 0;
}
::::::::::::::
matVecProdV1.cpp
::::::::::::::
#include <iostream>
#include <string>
#include <sstream>
#include <chrono>
#include <cmath>
#include <cstdlib> // rand.

using namespace std;

#define AVE 75000

int main(int argc, char ** argv) {
  if (argc != 3 || !argv) return 1;

  size_t n = 0; stringstream sn(argv[1]); sn >> n; if (n <= 0)          return 1;
  size_t p = 0; stringstream sp(argv[2]); sp >> p; if (p <= 0 || p > n) return 1;

  int * pMatIr = new int[n+1]; pMatIr[0] = 0;
  int nnz = n*p; // Number of non-zero values: p values per row * n rows.
  int * pMatJc = new int[nnz];
  double * pMatVal = new double[nnz];
  size_t s = 0; // Scan.
  for (size_t i = 0; i < n; i++) {
    pMatIr[i+1] = p; // We have p values per row.
    for (size_t j = 0; j < p; j++) {
      pMatJc[s] = rand()%n; // Get column in the range 0 to n-1.
      pMatVal[s] = 1.;
      s++;
    }
  }

  double * pVec = new double[n];
  for (size_t i = 0; i < n; i++) pVec[i] = 1.;

  double * pRes = new double[n];
  for (size_t i = 0; i < n; i++) pRes[i] = 0.;

  auto start = chrono::high_resolution_clock::now();
  for (size_t a = 0; a < AVE; a++) { // Average.
    for (size_t i = 0; i < n; i++) {
      int startJc = pMatIr[i];
      size_t nbJc = pMatIr[i+1] - startJc;
      for (size_t j = 0; j < nbJc; j++) {
        pRes[i] += pMatVal[pMatJc[startJc+j]]*pVec[i];
      }
    }
  }
  auto end = chrono::high_resolution_clock::now();
  cout << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << flush;

  if (pMatIr)  {delete [] pMatIr;  pMatIr  = NULL;}
  if (pMatJc)  {delete [] pMatJc;  pMatJc  = NULL;}
  if (pMatVal) {delete [] pMatVal; pMatVal = NULL;}
  if (pVec)    {delete [] pVec;    pVec    = NULL;}
  if (pRes)    {delete [] pRes;    pRes    = NULL;}

  return 0;
}

Why is Eigen slower than the naive implementation?

Franck

1 answer:

Answer 0 (score: 2)

If I am reading your naive implementation correctly, the line pMatIr[i+1] = p; should be pMatIr[i+1] = pMatIr[i] + p;, and the pVec[i] in the inner loop should run over j. Once that is fixed, the Eigen version actually runs faster on my system (by more than ~40%).

Note that the buggy code is fast because the for(j) loop does almost no work (pMatIr[i+1] - startJc is zero for all i > 0) ...
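
To make the fix concrete, here is a minimal sketch of how the corrected CSR-style setup and inner loop could look. This is my own reading of the two fixes above (not the answer's code), keeping the same n = 10000, p = 100 and AVE = 75000 as in V1:

#include <chrono>
#include <cstdlib> // rand.
#include <iostream>
#include <vector>

using namespace std;

#define AVE 75000

int main() {
  const size_t n = 10000, p = 100;

  // CSR storage: cumulative row pointers (fix 1: pMatIr[i+1] = pMatIr[i] + p).
  vector<int> pMatIr(n+1, 0);
  vector<int> pMatJc(n*p);
  vector<double> pMatVal(n*p);
  size_t s = 0; // Scan.
  for (size_t i = 0; i < n; i++) {
    pMatIr[i+1] = pMatIr[i] + p; // We have p values per row.
    for (size_t j = 0; j < p; j++) {
      pMatJc[s] = rand()%n; // Get column in the range 0 to n-1.
      pMatVal[s] = 1.;
      s++;
    }
  }

  vector<double> pVec(n, 1.);
  vector<double> pRes(n, 0.);

  auto start = chrono::high_resolution_clock::now();
  for (size_t a = 0; a < AVE; a++) { // Average.
    for (size_t i = 0; i < n; i++) {
      const int startJc = pMatIr[i];
      const int endJc   = pMatIr[i+1];
      for (int j = startJc; j < endJc; j++) {
        // Fix 2: the vector is indexed by the stored column index, not by i.
        // The value is also read at nonzero position j itself (the original
        // indexed pMatVal by the column, which only works because every value is 1).
        pRes[i] += pMatVal[j]*pVec[pMatJc[j]];
      }
    }
  }
  auto end = chrono::high_resolution_clock::now();
  cout << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << flush;

  return 0;
}

With cumulative row pointers the inner loop really runs over the p entries of each row, so the corrected naive version does the same amount of work per iteration as the Eigen product instead of skipping it.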