我有一个包含10列的数据集,其中有10列有兴趣创建新的指标功能。功能是" pT"," pN",& " M"他们都采取不同的价值观。关闭这3个特征所采用的所有值,需要在新变量中捕获9个唯一组合。
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
例如,新变量将具有值" 1"当PATHOT = pT2时,PATHON = pN1& PATHOM = M0,依此类推至值9.我已完成任务但在花费了近20行代码涉及所有独特组合的矢量化操作之后。
diag3_bs$sfd[diag3_bs$pathot=="pT2" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot=="pT1" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot=="pT3" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 3... so on upto 9.
我想问一下是否有更好的更自动化的方法来获得相同的结果?
dput(data.frame)
在下面给出
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
感谢。
答案 0 :(得分:3)
我尝试编辑数据,因此它没有输入错误。还创建了可能组合的表格版本:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
制作类别的文本等效矢量:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
然后,使用这些级别创建的因子的as.numeric值将是所需的结果:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
您可以按照常规方式分配这些值:
dat$sfd <- as.numeric(dat$stg)
答案 1 :(得分:2)
我制作了一些新数据,这对您的问题非常有用。
k<-expand.grid(data.frame(a=letters[1:3],b=letters[4:6],c=letters[7:9]))
library(dplyr)
k %>% mutate(groups=paste0(a,b,c))->k2
k2$groups<-as.numeric(factor(k2$groups))
k2
这很粗糙,你不会选择哪个组合获得哪个数字,所以之后需要进行一些挖掘,但这很快。