从数据框中提取信息

时间:2016-03-01 15:30:02

标签: r dataframe

我有一个如下数据框

df<- structure(list(s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"
    ), class = "factor"), s2 = structure(1:3, .Label = c("2-4", "3-15", 
    "7-16"), class = "factor")), .Names = c("s1", "s2"), row.names = c(NA, 
    -3L), class = "data.frame")

如下所示

> df
#   s1   s2
#1 3-4  2-4
#2 4-1 3-15
#3 5-4 7-16

我想要做的是首先搜索并找到之后类似的值 - 例如,这里4位于s1的第一行,s2的第一行和s1的第三行

  

- 第二列表示找到这些值的次数

     

- 第三列显示其中有多少来自df的第一列

     

- 第四列显示了其中有多少来自df的第二列

     

- 第五个是来自第一列的字符串

     

- 第六个是来自第二列的字符串

输出看起来像这样

Value    repeated     s1N   s1N   ss1    ss2
4           3         2      1    3,5     2
1           1         1      -     4      -
15          1         -      1     -      3
16          1         -      1     -      7

4 个答案:

答案 0 :(得分:1)

您需要做的第一件事是从字符串中提取数字。运行:

newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)

在第一行中分割字符串,并在第二行中将它们转换为数值。结果是一个三维矩阵,您可以使用它来提取您的值。

首先创建一个新的数据框:

#length of the columns in the new frame = number of unique values
dflength <- length(unique(array(newdf[2,,]))) 
dfout <- data.frame(Value=rep(0,dflength),repeated=rep(0,dflength),s1N=rep(0,dflength),s2N=rep(0,dflength),ss1=rep(0,dflength),ss2=rep(0,dflength))

最明显的方式(但可能不是最有效的方法)就是循环匹配你需要的任何东西:

dfout$Value <- unique(array(newdf[2,,]))
for(i in 1:dflength){
  getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout$Value[i])
  dfout$repeated[i] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
  dfout$s1N[i] <- as.data.frame(table(newdf[2,,1]))$Freq[getID]
  if(is.na(dfout$s1N[i])){
    dfout$s1N[i] <- 0
  }
  dfout$s2N[i] <- as.data.frame(table(newdf[2,,2]))$Freq[getID]
  if(is.na(dfout$s2N[i])){
    dfout$s2N[i] <- 0
  }
  getID <- which(newdf[2,,1]==dfout$Value[i])
  if(length(getID)>0){
    dfout$ss1[i] <- toString(newdf[1,,1][getID])
  } else {
    dfout$ss1[i] <- 0
  }
  getID <- which(newdf[2,,2]==dfout$Value[i])
  if(length(getID)>0){
    dfout$ss2[i] <- toString(newdf[1,,2][getID])
  } else {
    dfout$ss2[i] <- 0
  }
}
dfout
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   1    4   0
#3    15        1   0   1    0   3
#4    16        1   0   0    0   7

编辑以循环n个值的值

newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
dflength <- length(unique(array(newdf[2,,])))
#find the number of s variables
slength <- length(newdf[1,1,])
#create a matrix of appropriate size
dfout <- matrix(data=NA,nrow=dflength,ncol=(2+2*slength))
#create a (near)-empty names array, we will fill it in later
names <- c("Value","repeated",rep("",2*slength))
#fill in the Values column
dfout[,1] <- unique(array(newdf[2,,]))
#loop for every s variable
for(j in 1:slength){
  #get their names, paste N or s and add them to the names array
  names[2+j] <- paste(names(df)[j],"N",sep="")
  names[2+j+slength] <- paste("s",names(df)[j],sep="")
  #loop to get the other values
  for(i in 1:dflength){
    getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout[i,1])
    dfout[i,2] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
    dfout[i,2+j] <- as.data.frame(table(newdf[2,,j]))$Freq[getID]
    if(is.na(dfout[i,2+j])){
      dfout[i,2+j] <- 0
    }
    getID <- which(newdf[2,,j]==dfout[i,1])
    if(length(getID)>0){
      dfout[i,2+j+slength] <- toString(newdf[1,,j][getID])
    } else {
      dfout[i,2+j+slength] <- 0
    }
  }
}
colnames(dfout)<-names
as.data.frame(dfout)
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   1    4   0
#3    15        1   0   1    0   3
#4    16        1   0   0    0   7

答案 1 :(得分:1)

出乎意料的难题。将其分解为几个合乎逻辑的步骤是很好的:

## 1: split into (val,ss) pairs, and capture ci (column index) association
res <- setNames(do.call(rbind,lapply(seq_along(df),function(ci)
    do.call(rbind,lapply(strsplit(as.character(df[[ci]]),'-'),function(x)
        data.frame(x[2L],x[1L],ci,stringsAsFactors=F)
    ))
)),c('val','ss','ci'));
res;
##   val ss ci
## 1   4  3  1
## 2   1  4  1
## 3   4  5  1
## 4   4  2  2
## 5  15  3  2
## 6  16  7  2

## 2: aggregate ss (joining on comma) by (val,ci), and capture record count as n
res <- do.call(rbind,by(res,res[c('val','ci')],function(x)
    data.frame(val=x$val[1L],ci=x$ci[1L],n=nrow(x),ss=paste(x$ss,collapse=','),stringsAsFactors=F)
));
res;
##   val ci n  ss
## 1   1  1 1   4
## 2   4  1 2 3,5
## 3  15  2 1   3
## 4  16  2 1   7
## 5   4  2 1   2

## 3: reshape to wide format
res <- reshape(res,idvar='val',timevar='ci',dir='w');
res;
##   val n.1 ss.1 n.2 ss.2
## 1   1   1    4  NA <NA>
## 2   4   2  3,5   1    2
## 3  15  NA <NA>   1    3
## 4  16  NA <NA>   1    7

## 4: add repeated column; can be calculated by summing all n.* columns
## note: leveraging psum() from <http://stackoverflow.com/questions/12139431/add-variables-whilst-ignoring-nas-using-transform-function>
psum <- function(...,na.rm=F) { x <- list(...); rowSums(matrix(unlist(x),ncol=length(x)),na.rm=na.rm); };
res$repeated <- do.call(psum,c(res[grep('^n\\.[0-9]+$',names(res))],na.rm=T));
res;
##   val n.1 ss.1 n.2 ss.2 repeated
## 1   1   1    4  NA <NA>        1
## 2   4   2  3,5   1    2        3
## 3  15  NA <NA>   1    3        1
## 4  16  NA <NA>   1    7        1

关于NA,如果需要,可以在以后修复它们。但是,我建议n.*列的正确类型是整数,因为它们代表计数,因此使用'-'(如示例输出中)来表示空单元格是不合适的。我会建议零。破折号适用于ss.*列,因为它们是字符串。以下是如何做到这一点:

n.cis <- grep('^n\\.[0-9]+$',names(res));
ss.cis <- grep('^ss\\.[0-9]+$',names(res));
res[n.cis][is.na(res[n.cis])] <- 0L;
res[ss.cis][is.na(res[ss.cis])] <- '-';
res;
##   val n.1 ss.1 n.2 ss.2 repeated
## 1   1   1    4   0    -        1
## 2   4   2  3,5   1    2        3
## 3  15   0    -   1    3        1
## 4  16   0    -   1    7        1

答案 2 :(得分:1)

我已根据您的实际数据完全重写了所有代码,并且已在我的机器上对其进行了测试。因为它是一个非常大的数据帧,所以运行需要一些时间,在我看来,循环是不可避免的。

# function to split the strings
myfun<-function(x){
  x<-strsplit(as.character(x), '-')
  x1<-unlist(x)
  x.new<-as.data.frame(matrix(x1, byrow = T, length(x)))
  return(x.new)
}

# this returns a list of dataframes 
list.v<-lapply(df[1:dim(df)[2]], myfun)
# like this
head(list.v[[17]])

# try to combine all the dfs, produced an error of mismatching # of columns
df2<-do.call(rbind, list.v) 

# some of the dfs in list.v are all NA's, they should be dropped
sum<-summary(list.v)        
list.v<-list.v[-which(sum[,1] != "2")]  # this excludes those all-NA datafrmes in list.v

# now combine all dfs for indexing purposes
df2<-do.call(rbind, list.v)

# create "value", "repeated" column in the desired result df. 
# These codes are same as my previous answer
value<-names(table(df2[,2]))
repeated<-as.vector(table(df2[,2]))

# create an empty list to store the counts columns
list.count<-vector("list", length = length(list.v))

# every df in list.v has same number of rows, get the row number
rownum<-nrow(list.v[[1]])

# use a for loop to fill out list.count
for(i in 0:(length(list.count)-1)){
  row.start<-i*rownum+1   # it is kind of tricky here
  row.end<-(i+1)*rownum   # same as above
  list.count[[i + 1]]<-as.vector(table(df2[,2][row.start:row.end]))
}

# combine the vectors in list.count and assing names
count.df<-do.call(cbind, list.count)
count.df<-as.data.frame(count.df)

# create & assign colum names in the format of "s_n", and "_" is filled with corresponding original column name
names.cnt<-character()
for(i in 1:length(names(list.v))){
  names.cnt[i]<-paste("s", names(list.v)[i], "n", sep="")
}          
names(count.df)<-names.cnt

# this is a very long loop to concatenate the strings and store them into a matrix, but it gets the job done here.
ss.store<-matrix(,nrow = length(value), ncol = length(list.v), byrow = FALSE)
for(i in 1:length(list.v)){
  for(j in 1:length(value)){
  ss.store[j,i]<-paste(list.v[[i]][,1][which(list.v[[i]][,2] == value[j])], collapse =",")
  }
}

# create a df for strings
string.df<-as.data.frame(ss.store, stringsAsFactors = FALSE)

# create & assign names to the df
names.str<-character()
for(i in 1:length(names(list.v))){
  names.str[i]<-paste("s", "s", names(list.v)[i], sep="")
} 
names(string.df)<-names.str

# combine everything and form the new data frame
new.df<-cbind(value, repeated, count.df, string.df, stringAsFactors = FALSE)

new.df[1:10, 1:15]
   value repeated sAn sF1n sF2n sF3n sF4n sF5n sF6n sF7n sF8n sF9n sF10n sF11n sF12n
1    100      155   3    0    0    0    0    0    0    0    0    0     0     0     0
2   1005       14   1    0    0    0    0    0    0    0    0    0     0     0     0
3   1006       50   1    0    0    0    0    0    0    0    0    0     0     0     0
4   1023        1   1    0    0    0    0    0    0    0    0    0     0     0     0
5   1025       38   1    0    0    0    0    0    0    0    0    0     0     0     0
6   1030      624   1    0    1    2    0    0    0    0    0    0     1     0     0
7   1035        1   1    0    0    0    0    0    0    0    0    0     0     0     0
8    104      165   2    0    0    0    0    0    0    0    0    0     0     0     0
9   1076      186   2    0    0    0    0    0    0    0    0    0     0     0     0
10  1078      333   3    0    0    0    0    0    0    0    0    0     0     0     0

答案 3 :(得分:1)

df <-
  structure(
    list(
      s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"), class = "factor"), 
      s2 = structure(1:3, .Label = c("2-4", "3-15", "7-16"), class = "factor"
      )
    ), .Names = c("s1", "s2"), row.names = c(NA,-3L), class = "data.frame"
  )


library(tidyr)
library(dplyr)

# Split columns at "-" and add to data.frame
splitCols <- function(df) {
  new_headers <- paste("s1", c("1st", "2nd"), sep = "_")
  split_1 <- (separate(df, s1, into = new_headers, sep = "-"))[,new_headers]
  split_1$s1_1st <- as.integer(split_1$s1_1st)
  split_1$s1_2nd <- as.integer(split_1$s1_2nd)

  new_headers <- paste("s2", c("1st", "2nd"), sep = "_")
  split_2 <- (separate(df, s2, into = new_headers, sep = "-"))[,new_headers]
  split_2$s2_1st <- as.integer(split_2$s2_1st)
  split_2$s2_2nd <- as.integer(split_2$s2_2nd)
  cbind(df, split_1, split_2)
}

# given a df outputted from splitCols return final df
analyzeDF <- function(df) {
  target_vals <- unique(c(df$s1_2nd, df$s2_2nd)) # for each uniq val compute stuff
  out_df <- data.frame(Value = integer(0), 
                       repeated = integer(0), 
                       s1N = integer(0), 
                       s2N = integer(0), 
                       ss1 = character(0), 
                       ss2 = character(0))

  # iterate through target_vals, create a row of output,
  # and append to out_df
  for (val in target_vals) {
    s1_match <- val == df$s1_2nd
    s2_match <- val == df$s2_2nd
    total_cnt <- sum(s1_match, s2_match)
    s1_firstcol <- paste(df$s1_1st[s1_match], collapse = ",")
    s2_firstcol <- paste(df$s2_1st[s2_match], collapse = ",")
    # coerce empty string to "-"
    if (s1_firstcol == "") s1_firstcol <- "-"
    if (s2_firstcol == "") s2_firstcol <- "-"

    row_df <- data.frame(Value = val, 
                         repeated = total_cnt, 
                         s1N = sum(s1_match), 
                         s2N = sum(s2_match), 
                         ss1 = s1_firstcol,
                         ss2 = s2_firstcol)
    out_df <- rbind(out_df, row_df)
  }
  return(out_df)
}


(df_split <- splitCols(df))
analyzeDF(df_split)

## Value repeated s1N s2N ss1 ss2
## 1     4        3   2   1 3,5   2
## 2     1        1   1   0   4   -
## 3    15        1   0   1   -   3
## 4    16        1   0   1   -   7