在R中合并 - 一列改变值

时间:2014-05-05 10:06:50

标签: r merge

我正在将一个df合并到另一个df中。通常我使用下面的写法就可以正常工作:

df<-merge(x,y, by=c(variable1, variable2))

使用的实际代码

merge(mymaindf, mergingdf, by=c('hai_dispense_number','ID'))

我的问题是,对于我现在正在使用的数据,x中的一个变量没有保留它原来的值。它是一个二元变量,0和1大约各占一半。当我合并x和y时,这个变量的值全部变成了1。我尝试了各种办法,比如把该变量在数值型、字符型和因子型之间转换,但都没有成功。我也尝试过在y中创建这个变量,但合并时它同样会改变。有谁知道这是怎么回事吗?我在下面列出了两个df(x = mymaindf,y = mergingdf)的dput输出。

更改的变量称为scheme。

mymaindf:

structure(list(hai_dispense_number = c("Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059"
), ID = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), 
    variable.x = structure(1:10, .Label = c("month1", "month2", 
    "month3", "month4", "month5", "month6", "month7", "month8", 
    "month9", "month10", "month11", "month12", "month13", "month14", 
    "month15", "month16", "month17", "month18"), class = "factor"), 
    adherence = c(1, 1, 0.933333333333333, 0.966666666666667, 
    0.966666666666667, 0.966666666666667, 0.9, 0.966666666666667, 
    0.633333333333333, 0.866666666666667), time1 = c(-1, -2, 
    -3, -4, 1, 2, 3, 4, 5, 6), new_numbers = c(-4L, -3L, -2L, 
    -1L, 1L, 2L, 3L, 4L, 5L, 6L), variable.y = structure(1:10, .Label = c("t1", 
    "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10", "t11", 
    "t12", "t13", "t14", "t15", "t16", "t17", "t18"), class = "factor"), 
    age = c(72, 72.0833333333333, 72.1666666666667, 72.25, 72.3333333333333, 
    72.4166666666667, 72.5, 72.5833333333333, 72.6666666666667, 
    72.75), sex = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), post = c(0, 
    0, 0, 0, 1, 1, 1, 1, 1, 1), time_post = c(0, 0, 0, 0, 0, 
    1, 2, 3, 4, 5), base = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    scheme = c("1", "1", "1", "1", "1", "1", "1", "1", "1", "1"
    )), .Names = c("hai_dispense_number", "ID", "variable.x", 
"adherence", "time1", "new_numbers", "variable.y", "age", "sex", 
"post", "time_post", "base", "scheme"), row.names = c("1", "9", 
"10", "11", "12", "13", "14", "15", "16", "2"), class = "data.frame")

mergingdf:

structure(list(hai_dispense_number = c("Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059"
), aspT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), aspT2person = c(1, 
1, 1, 1, 1, 1, 1, 1, 1, 1), aspT3person = c(1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), aspbaseperson = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
    lipidT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lipidT2person = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), lipidT3person = c(1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1), lipidbaseperson = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1), hyptenT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1), hyptenT2person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), hyptenT3person = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), hyptenbaseperson = c(1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1), insulinT1person = c(0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0), insulinT2person = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), insulinT3person = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0), insulinbaseperson = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    aspirin = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7), aspirinbin = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), hypertension = c(7, 7, 7, 7, 
    7, 7, 7, 7, 7, 7), hypertensionbin = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1), lipids = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7), lipidsbin = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), insulin = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), insulinbin = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    scheme = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("0", "1"), class = "factor"), ID = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9", "10")), .Names = c("hai_dispense_number", 
"aspT1person", "aspT2person", "aspT3person", "aspbaseperson", 
"lipidT1person", "lipidT2person", "lipidT3person", "lipidbaseperson", 
"hyptenT1person", "hyptenT2person", "hyptenT3person", "hyptenbaseperson", 
"insulinT1person", "insulinT2person", "insulinT3person", "insulinbaseperson", 
"aspirin", "aspirinbin", "hypertension", "hypertensionbin", "lipids", 
"lipidsbin", "insulin", "insulinbin", "scheme", "ID"), row.names = c(NA, 
10L), class = "data.frame")

合并后的df的dput输出:

structure(list(hai_dispense_number = c("Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059", 
"Patient HAI0000059", "Patient HAI0000059", "Patient HAI0000059"
), ID = c("1", "10", "11", "12", "13", "14", "15", "16", "2", 
"3"), variable.x = structure(c(1L, 10L, 11L, 12L, 13L, 14L, 15L, 
16L, 2L, 3L), .Label = c("month1", "month2", "month3", "month4", 
"month5", "month6", "month7", "month8", "month9", "month10", 
"month11", "month12", "month13", "month14", "month15", "month16", 
"month17", "month18"), class = "factor"), adherence = c(1, 0.866666666666667, 
0.833333333333333, 0.833333333333333, 0.966666666666667, 0.6, 
0.833333333333333, 0.966666666666667, 1, 0.933333333333333), 
    time1 = c(-1, 6, 7, 8, 9, 10, 11, 12, -2, -3), new_numbers = c(-4L, 
    6L, 7L, 8L, 9L, 10L, 11L, 12L, -3L, -2L), variable.y = structure(c(1L, 
    10L, 11L, 12L, 13L, 14L, 15L, 16L, 2L, 3L), .Label = c("t1", 
    "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10", "t11", 
    "t12", "t13", "t14", "t15", "t16", "t17", "t18"), class = "factor"), 
    age = c(72, 72.75, 72.8333333333333, 72.9166666666667, 73, 
    73.0833333333333, 73.1666666666667, 73.25, 72.0833333333333, 
    72.1666666666667), sex = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    post = c(0, 1, 1, 1, 1, 1, 1, 1, 0, 0), time_post = c(0, 
    5, 6, 7, 8, 9, 10, 11, 0, 0), base = c(3, 12, 13, 14, 15, 
    16, 17, 18, 4, 5), scheme = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1), aspT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), aspT2person = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), aspT3person = c(1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1), aspbaseperson = c(1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1), lipidT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lipidT2person = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), lipidT3person = c(1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1), lipidbaseperson = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1), hyptenT1person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1), hyptenT2person = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), hyptenT3person = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), hyptenbaseperson = c(1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1), insulinT1person = c(0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0), insulinT2person = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), insulinT3person = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0), insulinbaseperson = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    aspirin = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7), aspirinbin = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), hypertension = c(7, 7, 7, 7, 
    7, 7, 7, 7, 7, 7), hypertensionbin = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1), lipids = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7), lipidsbin = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), insulin = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), insulinbin = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("hai_dispense_number", 
"ID", "variable.x", "adherence", "time1", "new_numbers", "variable.y", 
"age", "sex", "post", "time_post", "base", "scheme", "aspT1person", 
"aspT2person", "aspT3person", "aspbaseperson", "lipidT1person", 
"lipidT2person", "lipidT3person", "lipidbaseperson", "hyptenT1person", 
"hyptenT2person", "hyptenT3person", "hyptenbaseperson", "insulinT1person", 
"insulinT2person", "insulinT3person", "insulinbaseperson", "aspirin", 
"aspirinbin", "hypertension", "hypertensionbin", "lipids", "lipidsbin", 
"insulin", "insulinbin"), row.names = c(NA, 10L), class = "data.frame")

1 个答案:

答案 0 :(得分:0)

您的mergingdf$scheme是一个factor(因子),因此其值并不是您以为的那样。

这很容易通过简单地输入来确定,

mymaindf$scheme
mergingdf$scheme