Question

我想转换收入与项目参与调查（SIPP）核心波次数据中的 .dat 文件。共有 15 个波次，每个波次约有 300,000–400,000 条观测记录。每个波次的布局相同，都包含 1023 个变量。.dat 文件不含任何分隔符，因此每条观测记录就是一串字符（大多是数字，但也有一些“-”符号）。我用 Python 解析了数据字典，提取出每个变量的名称、起始位置和宽度。我还编写了一个脚本，把这些文件转换为 DataFrame，以便用 pandas 处理。问题是，这个脚本已经运行了 36 个多小时，我需要大幅提升速度。

于是我想到了 C。我是新手，但目前我只想把这些 .dat 文件转换为 .csv。是的，我可以使用 Anaconda 开发的几个包来更好地优化 Python，但这看起来是一个范围明确的小任务，很适合用来让我入门 C。我写了几个小程序来测试这种转换，下面是最相关的一个：

/*This script tests a conversion from .dat to .csv*/
#include <stdio.h>

/*
 * Convert the fixed-width test file test.dat into test.csv.
 *
 * Each record is read as three integers of at most 3, 4 and 3 characters and
 * written as one comma-separated row; a progress line is printed to stdout
 * for every row written.
 *
 * Returns 0 on success, and 1 if a file cannot be opened, the source holds a
 * record that cannot be read as three integers, or the destination cannot be
 * closed cleanly.
 */
int main(void)
{
  int var1;
  int var2;
  int var3;
  int i = 1;      /* 1-based number of the next row to write */
  int status = 0;

  FILE *test;     /* source: test.dat */
  FILE *test_out; /* destination: test.csv */

  if ((test = fopen("O:\\Analyst\\Marvin\\scrap\\test.dat", "r")) == NULL) {
    printf("Source file could not be opened\n");
    return 1;
  }
  if ((test_out = fopen("O:\\Analyst\\Marvin\\scrap\\test.csv", "w")) == NULL) {
    printf("Destination file could not be opened\n");
    fclose(test); /* do not leak the already-open source stream */
    return 1;
  }

  /* Write the headings to the destination file */
  fprintf(test_out, "%10s%10s%10s\n", "Var 1,", "Var 2,", "Var 3");

  /*
   * Loop on the fscanf result rather than feof(): a record that cannot be
   * converted leaves the stream position unchanged and never sets the EOF
   * flag, so testing feof() alone would spin forever on bad input.
   */
  while (fscanf(test, "%3d%4d%3d", &var1, &var2, &var3) == 3) {
    fprintf(test_out, "%9d%1s%9d%1s%10d\n", var1, ",", var2, ",", var3);
    printf("%20s %d\n", "Writing line #", i++);
  }

  /* Stopping anywhere other than end-of-file means the input was bad */
  if (!feof(test)) {
    fprintf(stderr, "Stopped before record %d: input could not be read as three integers\n", i);
    status = 1;
  }

  fclose(test);
  /* fclose flushes buffered output, so a failure here means lost data */
  if (fclose(test_out) != 0) {
    fprintf(stderr, "Destination file could not be closed cleanly\n");
    status = 1;
  }

  return status;
}

源文件包含以下值：

它将以下内容输出到目的地：

Var 1,    Var 2,     Var 3
   12,     3456,       789
   12,     3456,       789
   12,     3456,       789
   12,     3456,       789
   12,     3456,       789

所以，这一切都很好，但我要处理的变量超过 1000 个。手写一个冗长得令人作呕的格式字符串，在我看来不仅是糟糕的做法，而且要敲的键也实在太多。既然布局已经保存在一个易于解析的文件中，我认为无论是输入还是输出，都应该有办法通过编程来解决这个问题。

在SO上只有一些与C相关的输入问题。我已经回顾过的样本似乎从未解决过使用已知布局解析如此大量变量的问题。有人请赐教。

Answer 1

要读取 1023 个宽度各异的整数，建议创建一个数组，记录每个整数的固定宽度，并用该宽度作为索引来选取对应的格式字符串。使用 long long 可以处理最多约 19 位的数字。

#define MAX_WIDTH 4
/*
 * Read one fixed-width record of n integer fields from inf into data[0..n-1].
 *
 * width[i] is the maximum number of characters converted for field i and
 * selects the scanf format from a table indexed by width. A width of 0
 * selects "%lld", which places no limit on the field length.
 *
 * Returns 0 on success, 1 if a field could not be converted, 2 if the
 * character following the last field is neither '\n' nor end-of-file, and
 * 3 if a width exceeds MAX_WIDTH.
 *
 * Note: fscanf skips leading whitespace (newlines included) before each
 * %lld conversion, and that whitespace does not count toward the width.
 */
int ReadLine(FILE *inf, const unsigned char *width, long long *data, size_t n) {
  static const char *const format[MAX_WIDTH + 1] = {
      "%lld", "%1lld", "%2lld", "%3lld", "%4lld" };
  for (size_t i = 0; i < n; i++) {
    // Reject the width before it is used as an index into format[].
    if (width[i] > MAX_WIDTH) {
      return 3;  // FAIL: unsupported width
    }
    if (fscanf(inf, format[width[i]], &data[i]) != 1) {
      return 1;  // FAIL
    }
  }
  // The record must be terminated by a newline or by the end of the file.
  int eol = fgetc(inf);
  if (eol != '\n' && eol != EOF) {
    return 2;  // FAIL
  }
  return 0;
}

// Sample use: load the field widths, then read a single record.
#define N (3)

// FormWidths is defined later in the file, so declare it before first use.
int FormWidths(const char *fname, unsigned char *width, size_t n);

/*
 * Reads N field widths from input.csv, then one record from test.dat.
 * Returns ReadLine's result, or 1 if the widths or the data file cannot be
 * obtained.
 */
int main(void) {
  unsigned char width[N];
  // Without valid widths the contents of width[] would be indeterminate.
  if (FormWidths("input.csv", width, N) != 0) {
    fprintf(stderr, "could not read %d field widths from input.csv\n", N);
    return 1;
  }

  FILE *inf = fopen("test.dat", "r");
  if (inf == NULL) {
    perror("test.dat");
    return 1;
  }

  long long data[N];
  int rc = ReadLine(inf, width, data, N);
  fclose(inf);
  return rc;
}

OP解释说存在一个.CSV文件，宽度可以以“VarN，width”的形式读取

/*
 * Load n field widths from the layout file fname into width[0..n-1].
 *
 * The first line of the file is consumed unparsed as a table header. Each
 * following line must match "Var<index> ,<width>", with indices running
 * 0, 1, 2, ... in order and every width in the range 1..MAX_WIDTH.
 *
 * Returns 0 when all n widths were read, 1 if the file cannot be opened or
 * fewer than n valid lines were found.
 */
int FormWidths(const char *fname, unsigned char *width, size_t n) {
  FILE *layout = fopen(fname, "r");
  if (layout == NULL) {
    return 1;  // error
  }

  char line[100];
  fgets(line, sizeof line, layout);  // Use this line if file has table header

  size_t count = 0;
  while (count < n && fgets(line, sizeof line, layout) != NULL) {
    int var_no, field_width;
    if (sscanf(line, "Var%d ,%d", &var_no, &field_width) != 2) {
      break;
    }
    // Entries must appear in order, one per expected index.
    if ((size_t)var_no != count) {
      break;
    }
    // Adjust MAX_WIDTH to larger values like 19 and then update the format table
    if (field_width < 1 || field_width > MAX_WIDTH) {
      break;
    }
    width[count] = (unsigned char)field_width;
    count++;
  }

  fclose(layout);
  // Anything short of n entries means the .csv was not as expected.
  return (count != n) ? 1 : 0;
}

在C中读取包含许多变量的.dat文件：格式化字符串的动态构建？

1 个答案: