计算R中的多组行方式比例

时间:2018-06-07 20:24:23

标签: r dplyr data.table tidyr

我有一个data.frame,按folder和z_stack_id分组,并包含每个binary_layer的计数。“主要”图层为FITC, TRITC, and Cy5。我已经在其他地方计算过交叉点。我的目标是计算文件夹内的z_stack内的比例(如果需要,还可以计算其他分组变量)。我希望使用dplyr::group_by(...) %>% summarise(my_custom_fancy_function)。但我不确定如何去做这样的功能。

该函数的预期输出将是每个主图层的比例,按folder / z_stack_id /(...)分组。例如,对于Cy5:

FITC_Cy5/Cy5
TRITC_Cy5/Cy5
Triple/Cy5

请注意Triple并不总是有计数,所以我需要先填写这些组(目前正在处理它)。

更新

我手工制作了Cy5计算的例子。请注意 my_df # A tibble: 13 x 4 folder z_stack_id binary_layer n_blobs <chr> <dbl> <chr> <int> 1 20180601_122650_896 1.00 Cy5 959 2 20180601_122650_896 1.00 FITC 16 3 20180601_122650_896 1.00 TRITC 499 4 20180601_122650_896 2.00 Cy5 225 5 20180601_122650_896 2.00 FITC 157 6 20180601_122650_896 2.00 TRITC 19 7 20180601_122650_896 1.00 FITC_Cy5 5 8 20180601_122650_896 1.00 FITC_TRITC 2 9 20180601_122650_896 1.00 TRITC_Cy5 301 10 20180601_122650_896 2.00 FITC_Cy5 34 11 20180601_122650_896 2.00 FITC_TRITC 8 12 20180601_122650_896 2.00 Triple 4 13 20180601_122650_896 2.00 TRITC_Cy5 8 dput(my_df) structure(list(folder = c("20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896", "20180601_122650_896"), z_stack_id = c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2), binary_layer = c("Cy5", "FITC", "TRITC", "Cy5", "FITC", "TRITC", "FITC_Cy5", "FITC_TRITC", "TRITC_Cy5", "FITC_Cy5", "FITC_TRITC", "Triple", "TRITC_Cy5"), n_blobs = c(959L, 16L, 499L, 225L, 157L, 19L, 5L, 2L, 301L, 34L, 8L, 4L, 8L)), class = c("tbl_df", "tbl", "data.frame" ), row.names = c(NA, -13L), .Names = c("folder", "z_stack_id", "binary_layer", "n_blobs")) 列上的大多数结果都是虚假的。唯一有意义的是其中prop_main_Cy5的Cy5值是总数(例如,FITC_TRITC / Cy5没有意义)

z_stack_id

2 个答案:

答案 0 :(得分:0)

也许是这样的,使用prop.table(),然后使用ungroup()和group_by()来提升汇总级别?


library(tidyverse)

# Share of each count within its (folder, z_stack_id) group, followed by the
# share of the same count within the whole folder.
# group_by() replaces whatever grouping is already in place, so the second
# call alone is enough to widen the grouping from z-stack level to folder
# level; the result is left grouped by folder.
my_df %>%
  group_by(folder, z_stack_id) %>%
  mutate(prop_binary_layer = prop.table(n_blobs)) %>%
  group_by(folder) %>%
  mutate(prop_z_stack_id = prop.table(n_blobs))


#> # A tibble: 13 x 6
#> # Groups: folder [1]
#>    folder              z_stack_id binary_layer n_blobs prop_bina~ prop_z_~
#>    <chr>                    <dbl> <chr>          <int>      <dbl>    <dbl>
#>  1 20180601_122650_896       1.00 Cy5              959    0.538   0.429   
#>  2 20180601_122650_896       1.00 FITC              16    0.00898 0.00715 
#>  3 20180601_122650_896       1.00 TRITC            499    0.280   0.223   
#>  4 20180601_122650_896       2.00 Cy5              225    0.495   0.101   
#>  5 20180601_122650_896       2.00 FITC             157    0.345   0.0702  
#>  6 20180601_122650_896       2.00 TRITC             19    0.0418  0.00849 
#>  7 20180601_122650_896       1.00 FITC_Cy5           5    0.00281 0.00224 
#>  8 20180601_122650_896       1.00 FITC_TRITC         2    0.00112 0.000894
#>  9 20180601_122650_896       1.00 TRITC_Cy5        301    0.169   0.135   
#> 10 20180601_122650_896       2.00 FITC_Cy5          34    0.0747  0.0152  
#> 11 20180601_122650_896       2.00 FITC_TRITC         8    0.0176  0.00358 
#> 12 20180601_122650_896       2.00 Triple             4    0.00879 0.00179 
#> 13 20180601_122650_896       2.00 TRITC_Cy5          8    0.0176  0.00358

答案 1 :(得分:0)

这就是我最终这样做的方式。它是分割数据的组合,在每个folder级别内进行一些过滤,稍微更改名称以便以后重新加入。 一旦每个z_stack_id具有每个通道的正确值(FITC_blobs,TRITC_blobs,Cy5_blobs),我们可以bind_rows并执行比例。这种方法仍然存在虚假的比例,但它们可以在某种程度上被过滤掉。

我不得不进行一些列重命名,因为我的实际数据与简化问题中发布的列不同。我将它浓缩成一个函数。

#' Attach main-channel blob counts to every layer and compute proportions
#'
#' For each folder, every row of `dataframe` is paired with the `n_blobs`
#' value of the "FITC", "TRITC" and "Cy5" rows that share its grouping
#' columns, and `n_blobs` is then divided by each of those three counts.
#'
#' The grouping columns are all columns other than `binary_layer`,
#' `n_blobs` and the file name columns, i.e. `RatID`, `folder`,
#' `z_stack_id` plus any extra columns present in `dataframe`.
#'
#' The joins are nested left joins starting from the Cy5 rows, so:
#' * groups that have no "Cy5" row do not appear in the result;
#' * `TRITC_blobs` is `NA` for groups without a "TRITC" row;
#' * `FITC_blobs` is `NA` for groups lacking either a "TRITC" or a "FITC" row.
#'
#' @param dataframe A data frame (grouped or not) with the columns `RatID`,
#'   `folder`, `z_stack_id`, `binary_layer`, `n_blobs`, `filename_cells` and
#'   `filename_coloc`.
#' @return A data frame with the columns `RatID`, `folder`, `filename`,
#'   `z_stack_id`, `binary_layer`, `n_blobs`, `FITC_blobs`, `TRITC_blobs`,
#'   `Cy5_blobs`, `FITC_prop`, `TRITC_prop` and `Cy5_prop`. Rows are ordered
#'   folder by folder, in order of first appearance of each folder. An input
#'   without rows yields a zero-row result with the same columns.
calculate_blob_proportions <- function(dataframe) {
  stopifnot(is.data.frame(dataframe))

  needed <- c(
    "RatID", "folder", "z_stack_id", "binary_layer", "n_blobs",
    "filename_cells", "filename_coloc"
  )
  absent <- setdiff(needed, names(dataframe))
  if (length(absent) > 0) {
    stop(
      "`dataframe` is missing required column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # These names are created below; a pre-existing column would be clobbered.
  reserved <- intersect(
    c("FITC_blobs", "TRITC_blobs", "Cy5_blobs"),
    names(dataframe)
  )
  if (length(reserved) > 0) {
    stop(
      "`dataframe` already contains output column(s): ",
      paste(reserved, collapse = ", "),
      call. = FALSE
    )
  }

  # Collapse the two file name columns into a single `filename` column,
  # falling back to `filename_coloc` where `filename_cells` is missing.
  prepared <- dataframe %>%
    ungroup() %>%
    mutate(filename_cells = ifelse(is.na(filename_cells),
                                   filename_coloc,
                                   filename_cells)) %>%
    rename(filename = filename_cells) %>%
    select(-filename_coloc)

  # Pair every row of one folder with its FITC / TRITC / Cy5 counts.
  attach_channel_totals <- function(folder_df) {
    row_cols <- names(folder_df)
    # Explicit join keys: the same columns a natural join would pick, but
    # stated up front so no "Joining, by = ..." message is emitted.
    key_cols <- setdiff(row_cols, c("binary_layer", "filename", "n_blobs"))

    # All rows of the groups that contain `channel`, with that channel's
    # count stored in `<channel>_blobs`.
    rows_with_total <- function(channel) {
      totals <- folder_df %>%
        filter(binary_layer == !!channel) %>%
        select(-binary_layer, -filename)
      names(totals)[names(totals) == "n_blobs"] <- paste0(channel, "_blobs")
      left_join(totals, folder_df, by = key_cols)
    }

    tritc_fitc <- left_join(
      rows_with_total("TRITC"),
      rows_with_total("FITC"),
      by = row_cols
    )

    left_join(rows_with_total("Cy5"), tritc_fitc, by = row_cols) %>%
      select(RatID, folder, filename, z_stack_id,
             binary_layer, n_blobs,
             FITC_blobs, TRITC_blobs, Cy5_blobs)
  }

  # Folders are processed by position rather than by using the folder value
  # as a list index, so non-character folder ids are handled correctly.
  folder_ids <- unique(prepared$folder)
  per_folder <- lapply(seq_along(folder_ids), function(k) {
    in_folder <- which(prepared$folder == folder_ids[k])
    attach_channel_totals(prepared[in_folder, , drop = FALSE])
  })

  # No folders at all: run once on the empty frame so the result still
  # carries every expected column instead of failing in mutate() below.
  if (length(per_folder) == 0) {
    per_folder <- list(attach_channel_totals(prepared))
  }

  bind_rows(per_folder) %>%
    mutate(
      FITC_prop = n_blobs / FITC_blobs,
      TRITC_prop = n_blobs / TRITC_blobs,
      Cy5_prop = n_blobs / Cy5_blobs
    )
}
相关问题