合并(绑定)几个行数/列数不同的数据框

时间:2014-05-22 13:39:03

标签: r

我正在尝试合并(绑定)几个数据框,问题是它们的行数和列数各不相同。

让我们从显示数据开始:

第一张表:

> dput(head(tbl_gel1))
structure(list(X = 1:6, X.1 = structure(1:6, .Label = c("AT1G01050", 
"AT1G01080", "AT1G01090", "AT1G01220", "AT1G01320", "AT1G01420", 
"ATCG01120", "ATCG01240", "ATCG01300", "ATCG01310", "ATMG01190"
), class = "factor"), x1.1 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.10 = c(NA, NA, 0.97940004406824, 
NA, NA, NA), x1.11 = c(NA, 0.715595925164684, 1.12076888461521, 
NA, 1, NA), x1.12 = c(NA, NA, 1, NA, 1, NA), x1.13 = c(NA, NA, 
1.27620944815459, NA, 1.10617482362388, NA), x1.14 = c(NA, NA, 
0.970143924518673, NA, 0.897284652612375, NA), x1.15 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.16 = c(NA, 
NA, 0.855292180180481, NA, 0.678275003166569, NA), x1.17 = c(NA, 
NA, NA, NA, 1.31361646343431, NA), x1.18 = c(NA, NA, 1.01824439729952, 
NA, 0.731395183389585, NA), x1.19 = c(NA, NA, 2.13871102449867, 
NA, 1.26860481661042, NA), x1.2 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.20 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.21 = c(NA, NA, 1.5546960313129, 
NA, 2.12826383499469, NA), x1.22 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.23 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.24 = c(NA, 0.553801084127354, 
1.68155174378018, NA, 1, NA), x1.3 = c(NA, 1.91253217984776, 
NA, NA, NA, NA), x1.4 = c(NA, 1.2635979388975, NA, NA, NA, NA
), x1.5 = c(NA, 0.997262468935362, NA, NA, NA, 1), x1.6 = c(NA, 
0.836333481838468, 0.186450525168714, NA, NA, 1), x1.7 = c(0.713761294385108, 
0.998433283631924, NA, NA, NA, NA), x1.8 = c(NA, 1.00273753106464, 
0.105799532964898, NA, NA, NA), x1.9 = c(1.14311935280745, 0.720766625421293, 
0.763452683687036, 1, NA, NA)), .Names = c("X", "X.1", "x1.1", 
"x1.10", "x1.11", "x1.12", "x1.13", "x1.14", "x1.15", "x1.16", 
"x1.17", "x1.18", "x1.19", "x1.2", "x1.20", "x1.21", "x1.22", 
"x1.23", "x1.24", "x1.3", "x1.4", "x1.5", "x1.6", "x1.7", "x1.8", 
"x1.9"), row.names = c(NA, 6L), class = "data.frame")

第二张表:

> dput(head(tbl_gel3))
structure(list(X = 1:6, X.1 = structure(1:6, .Label = c("AT1G01050", 
"AT1G01080", "AT1G01090", "AT1G01220", "AT1G01320", "AT1G01420", 
"ATCG00820", "ATCG01090", "ATCG01110", "ATCG01240", "ATCG01310", 
"ATMG00510", "ATMG01190"), class = "factor"), x1.49 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.50 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.51 = c(NA, 
1, NA, NA, NA, NA), x1.52 = c(NA, 1.7994810956534, NA, NA, NA, 
NA), x1.53 = c(NA, 1, NA, NA, NA, 1), x1.54 = c(NA, 7.89402612997038, 
NA, NA, NA, NA), x1.55 = c(0.920776942793063, 0.996320522101043, 
0.254584439603907, NA, NA, NA), x1.56 = c(1, 0.729758385900956, 
0.300151773873743, NA, NA, NA), x1.57 = c(1, 0.947723222879912, 
0.948619033067299, 1, NA, NA), x1.58 = c(1, 0.928854762925871, 
1.3235617264432, 0.785944656498542, 0.675641973487141, NA), x1.59 = c(1.06908415906789, 
0.634382162824105, 1.04395304578544, 1, 0.650651881343625, NA
), x1.60 = c(1.80853320689787, NA, 0.880820179658551, NA, NA, 
NA), x1.61 = c(1, NA, 1.6718152409295, 1.09278053029295, 1.01060798973004, 
NA), x1.62 = c(0.704459686809266, NA, 1, NA, 1.08123985492291, 
NA), x1.63 = c(0.629128718440608, 0.445252633504756, 0.675960340502994, 
NA, 1, NA), x1.64 = c(0.171185393355124, 0.884594994748168, 1, 
NA, 1.08954220349952, NA), x1.65 = c(NA, NA, 1.11460636151774, 
NA, NA, NA), x1.66 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), x1.67 = c(NA, NA, NA, NA, 10.2238567979379, 
NA), x1.68 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), x1.69 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), x1.70 = c(NA, NA, 2.0577136925345, NA, 3.60392205648014, 
NA), x1.71 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), x1.72 = c(NA, NA, 10.9845898205719, NA, NA, NA)), .Names = c("X", 
"X.1", "x1.49", "x1.50", "x1.51", "x1.52", "x1.53", "x1.54", 
"x1.55", "x1.56", "x1.57", "x1.58", "x1.59", "x1.60", "x1.61", 
"x1.62", "x1.63", "x1.64", "x1.65", "x1.66", "x1.67", "x1.68", 
"x1.69", "x1.70", "x1.71", "x1.72"), row.names = c(NA, 6L), class = "data.frame")

第三张表:

> dput(head(tbl_sec))
structure(list(X = structure(c(1782L, 1230L, 1526L, 2041L, 102L, 
698L), .Label = c("AT1G01050", "AT1G01090", "AT1G01100", "AT1G01750", 
"AT1G01800", "AT1G01960", "AT1G02090", "AT1G02305", "AT1G02360", 
"ATMG00070", "ATMG00090", "ATMG00160", "ATMG00510", "ATMG00640", 
"ATMG01190"), class = "factor"), X59 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X57 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X55 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X53 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X51 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X49 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X47 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X45 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), X43 = c(NA, NA, NA, 
NA, 550460.850765662, NA), X41 = c(NA, NA, NA, NA, 1517453.1780485, 
NA), X39 = c(NA, NA, NA, NA, 1315894.53103572, NA), X37 = c(NA, 
NA, NA, NA, 763309.836661819, NA), X35 = c(NA, NA, NA, NA, 634821.777772181, 
NA), X34 = c(NA, NA, NA, NA, 564880.262630396, NA), X33 = c(NA, 
NA, NA, NA, 560540.154923967, NA), X32 = c(NA, NA, NA, NA, 408375.360849701, 
NA), X31 = c(NA, NA, NA, NA, 621546.292486539, NA), X30 = c(NA, 
NA, NA, NA, 485668.141052665, NA), X29 = c(NA, NA, NA, NA, 458257.613478579, 
NA), X28 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), X27 = c(NA, 2440.7661278223, NA, NA, NA, NA), X26 = c(NA, 
4313.08630371071, NA, NA, NA, NA), X25 = c(NA, 4647.39624023317, 
NA, NA, NA, NA), X24 = c(NA, 58501.7612593881, NA, NA, NA, NA
), X23 = c(NA, 4190.23148752641, NA, NA, NA, NA), X22 = c(NA, 
107865.93517639, NA, NA, NA, 56406.6255492295), X21 = c(NA, 103791.018692176, 
NA, NA, NA, 29801.44939896), X20 = c(NA, 101731.297303329, NA, 
NA, NA, 23752.8636664153), X19 = c(NA, 187370.754307142, NA, 
NA, NA, 59617.3689626876), X18 = c(NA, 193243.175254632, NA, 
NA, NA, 64925.9111057751), X17 = c(10853.13960828, 353170.97273086, 
383.972051991499, NA, 108320.413111105, 113709.733982341), X16 = c(13615.4187989797, 
454979.058426937, 126.61821720077, 2609.01425958656, 83367.047760359, 
152865.002861717), X15 = c(10152.0103407231, 787963.439771831, 
1122.40393680747, 377.483551025716, 115923.894416264, 153545.693711685
), X14 = c(23143.5651841651, 567417.145667799, 8666.35434242051, 
8418.96021663202, 160773.696496814, 24686.6650727505), X13 = c(12252.4621330072, 
120524.618492739, 6464.31782562405, 7114.35092074038, 73034.4486062994, 
4764.64061832276), X12 = c(39825.7855187457, 124982.084491202, 
15628.5786829252, 22855.2072395256, 201680.161282015, 905.953254853388
), X11 = c(173427.818954414, 125561.599047516, 57808.038487185, 
169845.738327073, 689074.78337782, 3291.62336724373), X10 = c(207578.716342028, 
99187.3238695562, 101921.969426591, 354982.817312907, 1154900.17933545, 
1919.86709330743), X9 = c(646197.416842161, 56845.6145899293, 
539905.113076328, 1928345.23451746, 4433849.5933255, 3701.9695777427
), X8 = c(436777.078182472, 66253.5865570073, 432025.383808184, 
1417796.33906413, 4936232.13860428, 4819.95526893262), X7 = c(192459.924633376, 
65577.6207320551, 460879.516636071, 1412729.33407886, 5269775.52541647, 
8113.6218931867), X6 = c(16693.5102795581, NA, 25824.1240764504, 
67651.5468133664, 1954673.90534866, NA), X5 = c(866.390222098434, 
NA, 22845.9576326078, 9493.05851485606, 209609.445853025, NA), 
    X4 = c(NA, NA, 35203.4876056065, 11471.9290105946, 24567.3997200956, 
    NA), X3 = c(NA, NA, 36936.1513095815, 5385.28997570043, 29947.5560730183, 
    NA)), .Names = c("X", "X59", "X57", "X55", "X53", "X51", 
"X49", "X47", "X45", "X43", "X41", "X39", "X37", "X35", "X34", 
"X33", "X32", "X31", "X30", "X29", "X28", "X27", "X26", "X25", 
"X24", "X23", "X22", "X21", "X20", "X19", "X18", "X17", "X16", 
"X15", "X14", "X13", "X12", "X11", "X10", "X9", "X8", "X7", "X6", 
"X5", "X4", "X3"), row.names = c(NA, 6L), class = "data.frame")

一种方法是先获取所有名称的列表,然后写一个循环,把各个数据框中的数据逐一填入一个总的数据框。这就是我到目前为止所做的:

## Loading files
# `pattern` is a regular expression, not a shell glob: "\\.csv$" keeps only
# file names ending in ".csv". The previous "*.csv" was unanchored and also
# accepted names such as "data.csv.bak".
list_of_data <- list.files(pattern = "\\.csv$")
tbl <- lapply(list_of_data, read.csv)

## Table - getting the list of all accessions
# rbind.fill() is provided by the plyr package, which must be attached. It
# stacks the tables and fills columns absent from a table with NA.
all_data <- do.call(rbind.fill, tbl)

# Remove everything from the first "." onwards in X, then keep only the first
# row for each resulting X value.
tbl_list <- subset(
  transform(all_data, X = sub("\\..*", "", X)),
  !duplicated(X)
)

# Drop the first column; drop = FALSE keeps a data frame even if only one
# column remains, so the column lookup below cannot fail on a bare vector.
tbl_list <- tbl_list[, -1, drop = FALSE]

#### Creating a vector of the genes accessions
gnames <- unique(tbl_list[, 1])
# Skip the first unique value. Negative indexing is used because
# 2:length(gnames) counts backwards (yielding NA entries) when fewer than two
# values are present.
gnames <- gnames[-1]

问题出在循环上:我之前用于其他数据的那个循环写得并不好,用这样的循环来合并现在这些数据肯定会花很长时间。无论如何我还是把它贴出来,向您展示我之前是如何合并其他数据的。两批数据的区别在于:旧数据中每个数据框只需要 4 列,而现在我需要全部 94 列(24 + 24 + 46)。

循环:

# Build one wide matrix with a row per gene accession: for every accession in
# `gnames`, take its row from each table in `tbl` and lay those rows end to
# end. Expects `tbl` (a list of data frames) and `gnames` to exist already.

# NOTE(review): the original sliced the data from row 94 but took the row
# names from row 3, so assigning the row names always failed on a length
# mismatch. One start row is now used for both; 3 is assumed to be the
# intended value -- confirm against the input files.
first_row <- 3L

# Keep columns 2:95 (94 value columns) and label each kept row with the
# accession stored in column 1 of the same table.
# NOTE(review): this requires every table to have at least 95 columns.
gdata <- lapply(tbl, function(x) {
  keep <- which(seq_len(nrow(x)) >= first_row)
  out <- x[keep, 2:95]
  rownames(out) <- x[keep, 1]
  out
})

# The former `tmp = lapply(gdata, function(x) matrix(x), ncol=94)` step was
# removed: `ncol` was forwarded to a one-argument function (an "unused
# argument" error) and its result was overwritten before being read.

# Collect one flattened vector per gene in a preallocated list instead of
# growing a matrix with rbind() on every iteration.
gene_rows <- vector("list", length(gnames))
for (i in seq_along(gnames)) {
  print(i)  # progress indicator
  # Look rows up by name: indexing with a factor would use its integer codes
  # rather than its labels.
  gene <- as.character(gnames[i])
  per_table <- lapply(gdata, function(x) x[gene, ])
  # Stack this gene's rows (one per table), then flatten them row by row.
  gene_rows[[i]] <- as.vector(t(do.call(rbind, per_table)))
}

final.table1 <- do.call(rbind, gene_rows)
rownames(final.table1) <- gnames

1 个答案:

答案 0 :(得分:2)

您只是想按基因名称列来合并这些数据框吗?第一步是将基因名称列转换为字符型:

# Merge the three tables on the gene-name column "X.1".
# Convert the key column from factor to character in each table first.
tbl_gel1$X.1 <- as.character(tbl_gel1$X.1)
tbl_gel3$X.1 <- as.character(tbl_gel3$X.1)
# tbl_sec stores its gene names in X, so copy them into X.1 to give all three
# tables the same key column. You might want to remove tbl_sec$X afterwards.
tbl_sec$X.1 <- as.character(tbl_sec$X)
mylist <- list(tbl_gel1, tbl_gel3, tbl_sec)
# Fold merge() over the list. all = TRUE keeps rows from every table,
# sort = FALSE leaves the result unsorted on the key, and accumulate = FALSE
# returns only the final merged data frame.
merge.df <- Reduce(
  function(x, y) merge(x, y, all = TRUE, by = "X.1", sort = FALSE),
  mylist,
  accumulate = FALSE
)

由于 all 参数设置为真(TRUE),您将得到所有数据框中的全部条目;如果设置为 FALSE,则只会得到各数据框共有的条目。

注意:在我这里 tbl_sec 的数据是乱的,基因名称一列全是 NA,而不是基因名称。