Question

我正在阅读大量的.csv文件。它们中的每一个都有四组数据，由三个空白列分隔。读入整个文件并将其分配给数据框后，空白列的内容将被解释为NA。有超过一百个这样的文件，我无法保证每组数据的列数不变。

我需要从初始数据框出发，创建四个独立的数据框。但在这种情况下，我不知道该如何利用标签。四个数据集各有一个描述性标签，标在其第一列上，而其余所有列的名称都是自动生成的（V2、V3、V4等）。找到标签所在位置的列表后，我可以算出第二个数据集位于第6:25列，但R似乎不接受用数字来标识列，只接受列名。

我能想到的另一个办法是找出全为NA的列，然后提取它们之间的数据。不幸的是，我还没有找到实现这一点的方法，而且只能依赖列名这一点依然是个问题。

我很感激你能给我的任何帮助。谢谢。

Answer 1

使用列索引（按列的位置编号）来提取应该是可行的。

我测试了这个假文件：

# Build the fake data set: four groups of named columns (A, B, C, D)
# separated by runs of unnamed, all-NA columns, then save it as a CSV file.
test = local({
  # Empty names mark the blank separator columns.
  colNames = c("testA1", "testA2", "testA3", "", "", "",
               "testB1", "testB2", "", "",
               "testC1", "testC2", "testC3", "testC4", "",
               "testD1", "testD2")
  # Named columns hold 1:5; separator columns hold five NAs.
  cols = lapply(colNames, function(nm) if (nzchar(nm)) 1:5 else rep(NA, 5))
  names(cols) = colNames
  # Turn the plain list into a 5-row data frame with automatic row names.
  attr(cols, "row.names") = c(NA, -5L)
  class(cols) = "data.frame"
  cols
})

# NA cells are written as empty fields, so blank columns stay blank on disk.
write.csv(test, file = "test.csv", quote = FALSE, row.names = FALSE, na = "")

然后，您可以尝试下面这个函数：

# Read a delimited file whose header row contains groups of named columns
# separated by one or more columns with an empty header, and split it into
# one data frame per group.
#
# Arguments:
#   file  path of the file to read
#   sep   field separator, matched literally (not as a regular expression)
#   ...   further arguments forwarded to read.csv()
#
# Returns an unnamed list with one data frame per run of consecutive
# non-blank header cells, in file order. The list is empty when the header
# has no non-blank cell.
readAndSkipBlanks = function(file, sep=",", ...) {
  # Split the header line on the literal separator; fixed = TRUE keeps
  # separators such as "|" or "." from being interpreted as a regex.
  headerLine = readLines(file, n = 1)
  headers = unlist(strsplit(x = headerLine, split = sep, fixed = TRUE))
  # Drop surrounding whitespace and double quotes so that quoted empty
  # cells ("") are recognised as blank as well.
  headers = gsub('^"|"$', "", trimws(headers))
  isBlank = headers == ""

  # Run-length encode the blank flags: each run is either a block of data
  # columns or a block of blank separator columns.
  runs = rle(isBlank)
  ends = cumsum(runs$lengths)
  starts = ends - runs$lengths + 1L
  dataRuns = which(!runs$values)

  # Read the whole table once, then slice it by column position.
  data = read.csv(file = file, sep = sep, ...)

  # drop = FALSE keeps a single-column group as a data frame, not a vector.
  lapply(dataRuns, function(k) data[, starts[k]:ends[k], drop = FALSE])
}

然后调用它，结果如下：

# Split test.csv into a list with one data frame per group of named columns.
readAndSkipBlanks("test.csv")

[[1]]
  testA1 testA2 testA3
1      1      1      1
2      2      2      2
3      3      3      3
4      4      4      4
5      5      5      5

[[2]]
  testB1 testB2
1      1      1
2      2      2
3      3      3
4      4      4
5      5      5

[[3]]
  testC1 testC2 testC3 testC4
1      1      1      1      1
2      2      2      2      2
3      3      3      3      3
4      4      4      4      4
5      5      5      5      5

[[4]]
  testD1 testD2
1      1      1
2      2      2
3      3      3
4      4      4
5      5      5

从单个csv文件中提取多个数据文件

1 个答案: