HDFql写入非常慢

时间:2020-04-29 11:54:23

标签: hdf5 hdfql

我有一段代码,它会循环地接收数据并将其转储到HDF5文件中。下面是一个简化的示例,用来说明我想要实现的目标:

#include <HDFql.hpp>

/**
 * Creates the HDF5 file at `filepath` (truncating any existing file), puts it
 * in use, and creates the groups and event datasets used by the writer.
 *
 * Layout created:
 *   - group `events` with chunked, one-dimensional UNLIMITED datasets
 *     `xs` (SMALLINT), `ys` (SMALLINT), `ts` (DOUBLE) and `ps` (TINYINT);
 *   - groups `frames` and `optic_flow`.
 *
 * No explicit chunk size is given in the CREATE statements. The results of
 * HDFql::execute are not checked.
 *
 * @param filepath Path of the HDF5 file to create.
 */
void createHDF(const std::string & filepath)
{
    // Build the path-dependent statements as std::string so that a long path
    // cannot overflow a fixed-size character buffer.
    const std::string create_file = "CREATE TRUNCATE FILE " + filepath;
    HDFql::execute(create_file.c_str());
    const std::string use_file = "USE FILE " + filepath;
    HDFql::execute(use_file.c_str());

    HDFql::execute("CREATE GROUP events");
    HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");

    HDFql::execute("CREATE GROUP frames");

    HDFql::execute("CREATE GROUP optic_flow");
}

/**
 * Appends one batch of events to the `events/xs`, `events/ys`, `events/ts`
 * and `events/ps` datasets.
 *
 * On every call the function puts the file named by `HDF5_path_` in use,
 * extends each of the four datasets by the batch size, and then writes each
 * input buffer with an INSERT ... VALUES FROM MEMORY statement. Each buffer
 * is registered with HDFql just before its INSERT and unregistered right
 * after it. Finally `total_events_added_` is increased by the batch size and
 * `events_idx_` by one.
 *
 * The batch size is taken from `ts_v`; the other vectors are not checked
 * against it. The results of HDFql::execute are not checked.
 *
 * @param ts_v Values written to `events/ts`.
 * @param xs_v Values written to `events/xs`.
 * @param ys_v Values written to `events/ys`.
 * @param ps_v Values written to `events/ps`.
 */
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, 
    std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
    // All input vectors are expected to have the same length.
    const int data_size = ts_v.size();

    // Put the output file in use.
    sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
    HDFql::execute(script_);

    // Extend every event dataset by one batch, one statement per dataset.
    static const char* const grow_formats[] = {
        "ALTER DIMENSION events/xs TO +%d",
        "ALTER DIMENSION events/ys TO +%d",
        "ALTER DIMENSION events/ts TO +%d",
        "ALTER DIMENSION events/ps TO +%d",
    };
    for (const char* grow_format : grow_formats)
    {
        sprintf(script_, grow_format, data_size);
        HDFql::execute(script_);
    }

    // NOTE(review): the selection "(-N:1:1:N)" presumably addresses the N
    // elements just added at the end of the dataset - confirm against the
    // HDFql hyperslab documentation.

    // xs
    int16_t* const xs_buffer = xs_v.data();
    HDFql::variableRegister(xs_buffer);
    const int xs_number = HDFql::variableGetNumber(xs_buffer);
    sprintf(script_, "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY %d",
            data_size, data_size, xs_number);
    HDFql::execute(script_);
    HDFql::variableUnregister(xs_buffer);

    // ys
    int16_t* const ys_buffer = ys_v.data();
    HDFql::variableRegister(ys_buffer);
    const int ys_number = HDFql::variableGetNumber(ys_buffer);
    sprintf(script_, "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY %d",
            data_size, data_size, ys_number);
    HDFql::execute(script_);
    HDFql::variableUnregister(ys_buffer);

    // ts
    const double* const ts_buffer = ts_v.data();
    HDFql::variableRegister(ts_buffer);
    const int ts_number = HDFql::variableGetNumber(ts_buffer);
    sprintf(script_, "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY %d",
            data_size, data_size, ts_number);
    HDFql::execute(script_);
    HDFql::variableUnregister(ts_buffer);

    // ps
    int8_t* const ps_buffer = ps_v.data();
    HDFql::variableRegister(ps_buffer);
    const int ps_number = HDFql::variableGetNumber(ps_buffer);
    sprintf(script_, "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY %d",
            data_size, data_size, ps_number);
    HDFql::execute(script_);
    HDFql::variableUnregister(ps_buffer);

    // Bookkeeping: events written so far and number of batches handled.
    total_events_added_ += data_size;
    ++events_idx_;
}

/**
 * Toy driver: creates /tmp/test.h5, builds one batch of `data_size` synthetic
 * events and appends that same batch to the file `iterations` times.
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 (implicit).
 */
int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);

    const int data_size = 1000;
    const int iterations = 10000;
    std::vector<double> ts(data_size);
    std::vector<int16_t> xs(data_size);
    std::vector<int16_t> ys(data_size);
    std::vector<int8_t> ps(data_size);
    // The vectors already hold data_size elements, so fill them by index;
    // push_back would append after the zero-initialised elements and double
    // the batch size.
    for(int i=0; i<data_size; i++)
    {
        ts[i] = i;
        xs[i] = static_cast<int16_t>(i);
        ys[i] = static_cast<int16_t>(i);
        ps[i] = 1;
    }
    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }
}

这段代码运行得极其缓慢。换用cnpy之类的其他二进制库,同样的写入眨眼间就能完成,因此问题不在于写入的数据量。我想知道HDFql本身就是这么慢,还是我的代码在某个地方有错误。

非常感谢!

2 个答案:

答案 0 :(得分:1)

您用cnpy执行的操作是否与在HDFql中执行的相同(例如,扩展数据集events/xs、events/ys、events/ts和events/ps,并使用等于1的块大小)?

查看您的代码,您可能希望显式地将数据集的块大小指定为ts_v.size(),因为这很可能会大大提高性能。按照现在的写法,HDFql会为了方便而自动计算块大小(采用最佳猜测的方法),这不一定能带来最佳性能。您需要明确指定块大小,例如CREATE CHUNKED(1000) DATASET events/xs AS SMALLINT(UNLIMITED)(这里的1000对应示例中的ts_v.size())。

答案 1 :(得分:1)

下面是您的代码经过进一步优化后的版本:

#include <HDFql.hpp>

/**
 * Creates the HDF5 file at `filepath` (truncating any existing file), puts it
 * in use, and creates the groups and event datasets used by the writer.
 *
 * Layout created:
 *   - groups `events`, `frames` and `optic_flow` (single CREATE GROUP
 *     statement);
 *   - chunked, one-dimensional UNLIMITED datasets `events/xs` (SMALLINT),
 *     `events/ys` (SMALLINT), `events/ts` (DOUBLE) and `events/ps` (TINYINT).
 *
 * No explicit chunk size is given in the CREATE statements. The results of
 * HDFql::execute are not checked.
 *
 * @param filepath Path of the HDF5 file to create.
 */
void createHDF(const std::string & filepath)
{
    // Build the path-dependent statements as std::string so that a long path
    // cannot overflow a fixed-size character buffer.
    const std::string create_file = "CREATE TRUNCATE FILE " + filepath;
    HDFql::execute(create_file.c_str());

    const std::string use_file = "USE FILE " + filepath;
    HDFql::execute(use_file.c_str());

    HDFql::execute("CREATE GROUP events, frames, optic_flow");

    HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");

}


/**
 * Appends one batch of events to the `events/xs`, `events/ys`, `events/ts`
 * and `events/ps` datasets.
 *
 * On every call the function puts the file named by `HDF5_path_` in use,
 * extends all four datasets with a single ALTER DIMENSION statement, and
 * issues one INSERT ... VALUES FROM MEMORY statement per dataset. Finally
 * `total_events_added_` is increased by the batch size and `events_idx_` by
 * one.
 *
 * The INSERT statements name the source buffers by hard-coded HDFql variable
 * numbers (xs = 0, ys = 1, ts = 2, ps = 3); nothing is registered here, so
 * the caller must have registered the buffers beforehand. Only the size of
 * `ts_v` is read from the arguments. The results of HDFql::execute are not
 * checked.
 *
 * @param ts_v Supplies the batch size.
 * @param xs_v Not read by this function.
 * @param ys_v Not read by this function.
 * @param ps_v Not read by this function.
 */
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
    // All input vectors are expected to have the same length.
    const int data_size = ts_v.size();

    // Put the output file in use.
    sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
    HDFql::execute(script_);

    // Extend all four event datasets by one batch in a single statement.
    sprintf(script_, "ALTER DIMENSION events/xs, events/ys, events/ts, events/ps TO +%d", data_size);
    HDFql::execute(script_);

    // One INSERT per dataset, each reading from a fixed variable number.
    // NOTE(review): the selection "(-N:1:1:N)" presumably addresses the N
    // elements just added at the end of the dataset - confirm against the
    // HDFql hyperslab documentation.
    static const char* const insert_formats[] = {
        "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY 0",
        "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY 1",
        "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY 2",
        "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY 3",
    };
    for (const char* insert_format : insert_formats)
    {
        sprintf(script_, insert_format, data_size, data_size);
        HDFql::execute(script_);
    }

    // Bookkeeping: events written so far and number of batches handled.
    total_events_added_ += data_size;
    ++events_idx_;
}


/**
 * Toy driver: creates /tmp/test.h5, builds one batch of `data_size` synthetic
 * events, registers the four buffers with HDFql once, and appends that same
 * batch to the file `iterations` times.
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 (implicit).
 */
int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);

    const int data_size = 1000;
    const int iterations = 10000;
    std::vector<double> ts(data_size);
    std::vector<int16_t> xs(data_size);
    std::vector<int16_t> ys(data_size);
    std::vector<int8_t> ps(data_size);

    // The vectors already hold data_size elements, so fill them by index;
    // push_back would append after the zero-initialised elements and double
    // the batch size.
    for(int i=0; i<data_size; i++)
    {
        ts[i] = i;
        xs[i] = static_cast<int16_t>(i);
        ys[i] = static_cast<int16_t>(i);
        ps[i] = 1;
    }

    // Register the element buffers (not the vector objects) once, up front.
    // The order xs, ys, ts, ps matters: writeData refers to the buffers as
    // MEMORY 0, 1, 2 and 3.
    // NOTE(review): this presumes HDFql numbers variables from 0 in
    // registration order - confirm against the HDFql reference manual.
    HDFql::variableRegister(xs.data());
    HDFql::variableRegister(ys.data());
    HDFql::variableRegister(ts.data());
    HDFql::variableRegister(ps.data());

    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }

    // Release the registrations before the buffers go out of scope.
    HDFql::variableUnregister(xs.data());
    HDFql::variableUnregister(ys.data());
    HDFql::variableUnregister(ts.data());
    HDFql::variableUnregister(ps.data());

}

此外,能否把sprintf(script_, "USE FILE %s", HDF5_path_.c_str()); HDFql::execute(script_);这两行连续的代码移到writeData函数之外,使文件只被打开一次?这样做肯定会让程序运行得更快。

相关问题