Question

我有一个 70 万行、5 列的 data.frame。我先按第一列取子集，然后想根据第二列把该子集切分成 100 个区间。对于每个区间，我想计算 dat.percent（第五列）的平均值，并按区间把这些平均值绘制出来。

> head(data)
  X1    X2 X3 X4 dat.percent
1  1 69270 NA NA    57.32338
2  1 69351 NA NA    61.68868
3  1 69428 NA NA    57.03619
4  1 69511 NA NA    52.78576
5  1 69552 NA NA    57.66801
6  1 69590 NA NA    44.39977
> dput(head(data)) 
structure(list(X1 = c(1L, 1L, 1L, 1L, 1L, 1L), X2 = c(69270, 
69351, 69428, 69511, 69552, 69590), X3 = c(NA, NA, NA, NA, NA, 
NA), X4 = c(NA, NA, NA, NA, NA, NA), dat.percent = c(57.323377369328, 
61.6886846639862, 57.0361860999426, 52.7857553130385, 57.6680068925905, 
44.3997702469845)), .Names = c("X1", "X2", "X3", "X4", "dat.percent"
), row.names = c(NA, 6L), class = "data.frame")

我还没能让它正常运行，下面是我目前写的代码：

# For every feature listed in `chr`: take the rows of `data` whose first
# column matches that feature, split them into 100 equal-width intervals of
# the second column, average the fifth column (dat.percent) inside each
# interval, and write one PNG per feature showing interval mean vs. position.
for (i in seq_along(chr)) {
  data.subset <- data[data[, 1] %in% chr[i], ]
  if (nrow(data.subset) == 0) {
    next  # no rows for this feature, so there is nothing to cut or plot
  }

  # cut() returns a factor with one level per interval. It can be passed
  # directly as the grouping vector, so it does not need to be stored in
  # the data.frame first.
  data.cuts <- cut(data.subset[, 2], 100)

  # One value per interval. Intervals that contain no rows yield NA and are
  # skipped by plot().
  x.mean <- tapply(data.subset[, 2], data.cuts, mean)
  y.mean <- tapply(data.subset[, 5], data.cuts, mean, na.rm = TRUE)

  png(paste0("./plots/Feature", i, ".png"))
  # A single plot() call per feature: calling plot() once per interval would
  # start a new page each time and keep only one point per page.
  plot(x.mean, y.mean,
       xlim = range(data.subset[, 2], na.rm = TRUE),
       xlab = "Genome",
       ylab = "Percent",
       main = paste0("Feature ", i))
  dev.off()  # close the device inside the loop so each file is flushed
}

Answer 1

用 ggplot2 怎么样：

# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(plyr)
library(ggplot2)

# Simulated data with the same column layout as in the question.
data <- data.frame(
  X1 = rep(1:3, each = 10000),
  X2 = sample(600000:700000, 30000),
  X3 = NA,
  X4 = NA,
  dat.percent = runif(30000) * 100
)

# Group by feature (X1) and by interval of X2, then average dat.percent in
# each group. This demo uses 10 intervals; pass 100 to cut() for 100.
data.plot <- ddply(data, .(X1, cut = cut(X2, 10)), summarise,
                   mean = mean(dat.percent))
# Inspect the summary only after it has been created.
head(data.plot)

ggplot(data.plot) +
  geom_point(aes(cut, mean, color = factor(X1)), size = 10, alpha = 0.5) +
  geom_line(aes(cut, mean, group = factor(X1)), alpha = 0.5) +
  theme(axis.text.x = element_text(angle = -90))

enter image description here

如何用 R 来实现这一点？

1 个答案: