并行读取多个大型csv文件到数组中

时间:2017-12-11 15:47:30

标签: c++ arrays csv parallel-processing

借助一些教程以及 Stack Overflow 上一些人的帮助,我设法拼凑出了这个基本的并行(多文件)CSV -> 数组读取器。我能让它运行得更快吗?我在各处读到过一些可能的做法:把文件预先加载到内存中、以某种方式优化线程,或者把部分工作放到 CUDA 中去做(我在这方面有一点经验),但我不知道下一步应该做什么。有什么建议可以加快速度吗?

// parallel-matrix-multiply.cpp
// compile with: /EHsc
#include <windows.h>
#include <ppl.h>

#include <cstdio>
#include <ctime>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <vector>

using namespace concurrency;    
using namespace std;

#include <fstream>
#include <sstream>

// Reads a fixed set of comma-separated integer files in parallel into one
// table and prints the elapsed time in milliseconds.
//
// File k (1-based) is read from C:/codeoutput/output_<k>.txt and is expected
// to hold numRows lines of numCols comma-separated integers. Its rows are
// stored at data[(k - 1) * numRows + row].
//
// Returns 0. Files that cannot be opened or parsed are reported on stderr and
// leave their (zero-initialized) rows untouched from the point of failure.
int main()
{
    const int numRows = 360;    // rows per file
    const int numCols = 4096;   // values per row
    const int fileCount = 120;  // 120 files of same format

    // Vectors own the storage, so nothing has to be freed by hand.
    std::vector<std::vector<int>> data(
        static_cast<std::size_t>(numRows) * fileCount,
        std::vector<int>(numCols));

    const std::clock_t startTimeTotal = std::clock();

    // Each task writes only to its own file's rows, so `data` needs no
    // locking. parallel_for returns after all tasks finish, which keeps the
    // by-reference capture of `data` valid.
    Concurrency::parallel_for(0, fileCount,
        [numCols, numRows, &data](int i) {

        const std::string path =
            "C:/codeoutput/output_" + std::to_string(i + 1) + ".txt";
        std::ifstream in(path);
        if (!in) {
            std::fprintf(stderr, "Could not open %s\n", path.c_str());
            return;
        }

        // Scratch for the separators. It must be local to the task: sharing
        // one variable between concurrently running tasks is a data race.
        char comma = 0;

        for (int row = 0; row < numRows; row++) {
            std::vector<int>& cells =
                data[static_cast<std::size_t>(i) * numRows + row];

            for (int col = 0; col < numCols; col++) {
                // Grab data for the cell in (row, col)
                in >> cells[col];
                // If this is not the last column grab the comma between the values
                if (col < numCols - 1) {
                    in >> comma;
                }
            }

            if (!in) {
                std::fprintf(stderr, "Parse error in %s at row %d\n",
                             path.c_str(), row);
                return;
            }

            // Drop the rest of the line (newline, or a trailing comma plus
            // newline). Reading a char with >> would skip the newline and
            // swallow the first character of the next row instead.
            // The parentheses around max stop <windows.h>'s max macro from
            // expanding here.
            in.ignore((std::numeric_limits<std::streamsize>::max)(), '\n');
        }
        // `in` closes itself when it goes out of scope.
    });

    const std::clock_t stopTotal = std::clock();
    const double elapsed =
        static_cast<double>(stopTotal - startTimeTotal) * 1000.0 / CLOCKS_PER_SEC;
    std::printf("Time elapsed in ms: %f\n", elapsed);

    return 0;
}

1 个答案:

答案 0 :(得分:0)

你的内层循环里计算了一些可以移到外层循环的东西。话虽如此,瓶颈很可能在于读取文件本身,所以我不太确定这样做会带来很大的差别。

    // Reads numRows rows of numCols comma-separated integers from `in` into
    // the rows of `data` that belong to file index `i`.
    for (int row = 0; row < numRows; row++) {
        // Hoisted out of the column loop: the destination row does not change
        // while a row is being read. Uses numRows instead of a literal 360 so
        // the offset stays in step with the row count.
        const int rowIdx = i * numRows + row;
        for (int col = 0; col < numCols - 1; col++)
        {
            // Grab Data for the cell in (row,col), then the comma after it
            in >> data[rowIdx][col];
            in >> comma;
        }

        // Last column: no separator is read after it
        in >> data[rowIdx][numCols - 1];
        // Drop the rest of the line. Reading a char with >> would skip the
        // newline and consume the first character of the next row instead.
        // The parentheses around max stop <windows.h>'s max macro from
        // expanding here.
        in.ignore((std::numeric_limits<std::streamsize>::max)(), '\n');
    }