我有一个数据框(subset_df
),如下所示:
structure(list(sequence = c("CSPPPPSPSPHPRPP", "GEGSPTSPTSPKQPG",
"EAGAPAGSGAPPPAD", "PAPPKPKESKEPENA", "AKPKQQDEDPDGAAE", "AYATMLKDVQWKVRKS",
"HEKLVQDIWKKLEAKG", "SCSVKLGLWKNAVNNC", "MAYVCELGPNQGWK", "LKDPKQYQSIVDAEWK",
"KEAPGATEKDRAKATP", "TAYIMRPLDHGADVTL", "CVTQEHFREAMAKTNP", "AGTGFPYREMMPMNAP",
"HKKSTEDNDDDAFCAP", "RPGGPPGYRTPYTAK", "TQGDRQKIQDAVSAA", "EVKSRYNVDVSQNKR",
"VIEMTRAFEDDDFDK", "GSADLTPSNLTRPAS"), group = c("BP", "BP",
"BP", "BP", "BP", "EpQ", "EpQ", "EpQ", "EpQ", "EpQ", "abc", "abc",
"abc", "abc", "abc", "LbT", "LbT", "LbT", "LbT", "LbT")), .Names = c("sequence",
"group"), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))
最后,我想创建一个新列(subset_df$ID
),其中的ID基于subset_df$sequence
下的每个条目,遵循以下模式:
subset_df$sequence
下的前四个字符。 为了说明,下面我会粘贴一些例子:
"BP_1_CSPP" "BP_2_GEGS" "BP_3_EAGA" "BP_4_PAPP" "BP_5_AKPK" "EpQ_1_AYAT"
我正在使用包str_sub
中的函数stringr
来生成输出(请参阅下面的循环)。
到目前为止,我所做的工作如下:
# define where the groups are (BP, abc, LbT, EpQ)
groups <- c("BP", "EpQ", "abc", "LbT")
# define the indexes of all groups using a loop
groups_indexes <- list()
for(i in groups) {
groups_indexes[[i]] <- grep(pattern = i, x = subset_df$group)
}
考虑我的列表(groups_indexes
)输出:
$BP
[1] 1 2 3 4 5
$EpQ
[1] 6 7 8 9 10
$abc
[1] 11 12 13 14 15
$LbT
[1] 16 17 18 19 20
我只为for loop
中的每个元素使用一个groups_indexes
设法完成了这项工作,如下所示(四个for loops
)
# BP
for(i in groups_indexes[1]) {
subset_df$IDs[i] <- paste0("BP_", i, "_", str_sub(string = subset_df$sequence[i], start = 1, end = 4))
}
# EpQ
for(i in groups_indexes[2]) {
subset_df$IDs[i] <- paste0("EpQ_", i-(groups_indexes$EpQ[1])+1, "_", str_sub(string = subset_df$sequence[i], start = 1, end = 4))
}
# abc
for(i in groups_indexes[3]) {
subset_df$IDs[i] <- paste0("abc_", i-(groups_indexes$abc[1])+1, "_", str_sub(string = subset_df$sequence[i], start = 1, end = 4))
}
# LbT
for(i in groups_indexes[4]) {
subset_df$IDs[i] <- paste0("LbT_", i-(groups_indexes$LbT[1])+1, "_", str_sub(string = subset_df$sequence[i], start = 1, end = 4))
}
分别运行这四个for loops
之后,我得到了以下输出:
> subset_df$IDs
[1] "BP_1_CSPP" "BP_2_GEGS" "BP_3_EAGA" "BP_4_PAPP" "BP_5_AKPK" "EpQ_1_AYAT" "EpQ_2_HEKL" "EpQ_3_SCSV" "EpQ_4_MAYV" "EpQ_5_LKDP"
[11] "abc_1_KEAP" "abc_2_TAYI" "abc_3_CVTQ" "abc_4_AGTG" "abc_5_HKKS" "LbT_1_RPGG" "LbT_2_TQGD" "LbT_3_EVKS" "LbT_4_VIEM" "LbT_5_GSAD"
但是,我正在尝试使用单个循环来完成这项工作。我虽然使用names(groups_indexes[i])
来调用for loops
中的引号中的字符(&#34; BP&#34;,&#34; EpQ&#34;,&#34; abc&#34;, &#34; LBT&#34;。)
答案 0 :(得分:1)
也许我们需要按“组”进行分组,然后按行(paste
)的序列和“序列(row_number()
)<的子串”substr
'组'进行分组/ p>
library(dplyr)
subset_df %>%
group_by(group) %>%
mutate(ID = paste(group, row_number(), substr(sequence, 1, 4), sep="_"))
# sequence group ID
# <chr> <chr> <chr>
#1 CSPPPPSPSPHPRPP BP BP_1_CSPP
#2 GEGSPTSPTSPKQPG BP BP_2_GEGS
#3 EAGAPAGSGAPPPAD BP BP_3_EAGA
#4 PAPPKPKESKEPENA BP BP_4_PAPP
#5 AKPKQQDEDPDGAAE BP BP_5_AKPK
#6 AYATMLKDVQWKVRKS EpQ EpQ_1_AYAT
#7 HEKLVQDIWKKLEAKG EpQ EpQ_2_HEKL
#8 SCSVKLGLWKNAVNNC EpQ EpQ_3_SCSV
#9 MAYVCELGPNQGWK EpQ EpQ_4_MAYV
#10 LKDPKQYQSIVDAEWK EpQ EpQ_5_LKDP
#11 KEAPGATEKDRAKATP abc abc_1_KEAP
#12 TAYIMRPLDHGADVTL abc abc_2_TAYI
#13 CVTQEHFREAMAKTNP abc abc_3_CVTQ
#14 AGTGFPYREMMPMNAP abc abc_4_AGTG
#15 HKKSTEDNDDDAFCAP abc abc_5_HKKS
#16 RPGGPPGYRTPYTAK LbT LbT_1_RPGG
#17 TQGDRQKIQDAVSAA LbT LbT_2_TQGD
#18 EVKSRYNVDVSQNKR LbT LbT_3_EVKS
#19 VIEMTRAFEDDDFDK LbT LbT_4_VIEM
#20 GSADLTPSNLTRPAS LbT LbT_5_GSAD
如果我们的数字不是按组分组而是基于整个列,那么请删除group_by
操作,然后执行
subset_df %>%
mutate(ID = paste(group, row_number(), substr(sequence, 1, 4), sep="_"))