对变量的所有组合求和

时间:2016-11-07 04:00:59

标签: r dplyr tidyr

我刚接触R编程,如果这个问题太基础,还请见谅。我有一份交易数据,记录了六种不同类型产品的收入,共涵盖三年。我的目标是针对所有不同的产品组合,分别求出各产品的收入总和;每年有 2^6 - 1 = 64 - 1 = 63 种组合,也就是说,三年总共会有 63*3 = 189 种组合。

为了简单起见,我只用三个变量创建了测试数据,并用while循环写了一个只处理一年的程序——我知道这种写法很糟糕,它只是为了展示我想要实现的效果。另外,我也在下面贴出了取自原始文件的随机样本。

以下是只包含三个变量 `Car`、`Tire` 和 `Services` 的测试数据,以及用来说明我想要的结果的 `while` 循环:

    dput(Sample_File)
structure(list(Order.ID = c(171, 173, 132, 174, 132, 174, 132, 
174, 174), Fiscal.Year = c(2017, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2018), Car = c(2, 2, 3, 1, 0, 0, 0, 0, 1), Tire = c(0, 
0, 0, 1, 0, 1, 0, 1, 1), Services = c(3, 1, 4, 0, 4, 1, 4, 0, 
0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", "Services"
), row.names = c(NA, 9L), class = "data.frame")

这是我的代码:

  i <- 1
    # Csum accumulates fiscal-year-2016 revenue: one row per on/off pattern
    # of the three products, one column per product (Car, Tire, Services).
    Csum <- matrix(0, nrow = 7, ncol = 3)
    # Row 1: Car ON,  Tire ON,  Services ON
    # Row 2: Car ON,  Tire ON,  Services OFF
    # Row 3: Car ON,  Tire OFF, Services ON
    # Row 4: Car OFF, Tire ON,  Services ON
    # Row 5: Car ON,  Tire OFF, Services OFF
    # Row 6: Car OFF, Tire ON,  Services OFF
    # Row 7: Car OFF, Tire OFF (Services ON, or nothing sold at all)

    while (i <= length(Sample_File$Order.ID)) {
      # Only transactions from fiscal year 2016 contribute.
      if (Sample_File$Fiscal.Year[i] == 2016) {
        car <- Sample_File$Car[i]
        tire <- Sample_File$Tire[i]
        services <- Sample_File$Services[i]

        # Pick the target row from which products have non-zero revenue.
        if (car != 0) {
          if (tire != 0) {
            target <- if (services != 0) 1 else 2
          } else {
            target <- if (services != 0) 3 else 5
          }
        } else if (tire != 0) {
          target <- if (services != 0) 4 else 6
        } else {
          target <- 7
        }

        # A product that is OFF has revenue 0, so adding the whole
        # (Car, Tire, Services) triple leaves its column unchanged.
        Csum[target, ] <- Csum[target, ] + c(car, tire, services)
      }
      i <- i + 1
    }

我编写的代码只处理了一年,因为把同样的代码再复制到另外两年实在太痛苦了。我正在寻找一种解决方案,可以生成一个包含3个数据框的列表,三年中每年对应一个数据框。

这是一个大小为10的随机样本,其中有六个来自原始文件的变量。

dput(Sample_File_Random)
structure(list(Order.ID = c(171, 173, 132, 174, 169, 175, 163, 
186, 178, 121), Fiscal.Year = c(2016, 2016, 2017, 2016, 2015, 
2016, 2015, 2015, 2015, 2017), Car = c(2, 0, 3, 0, 0, 0, 0, 5346.25, 
0, 0), Tire = c(0, 0, 0, 8691.55800460666, 3198, 5, 2, 0, 2, 
3282.18), Services = c(3, 0, 4, 0, 0, 0, 0, 0, 0, 0), Insurance = c(4, 
0, 0, 4, 0, 4, 0, 0, 0, 0), Accessories = c(94.3, 3749.8, 9308.65, 
0, 2, 0, 1, 633.75, 51.44, 0), Finance = c(0, 0, 0, 4, 0, 14800, 
0, 0, 0, 0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", 
"Services", "Insurance", "Accessories", "Finance"), row.names = c(NA, 
10L), class = "data.frame")

我真的被难住了,非常感谢任何关于向量化的帮助。

应 @Ronak Shah 的要求,这是 Sample_File_Random 的预期输出:
Output_File
  Fiscal.Year     Car     Tire Services Insurance Accessories Finance
1        2015    0.00 3202.000        0         0       54.44       0
2        2015 5346.25    0.000        0         0      633.75       0
3        2016    2.00    0.000        3         4       94.30       0
4        2016    0.00    0.000        0         0     3749.80       0
5        2016    0.00 8696.558        0         8        0.00   14804
6        2017    3.00    0.000        4         0     9308.65       0
7        2017    0.00 3282.180        0         0        0.00       0

2 个答案:

答案 0 :(得分:1)

这是一个紧凑且富有表现力的dplyr解决方案,分三步进行:

  1. 创建每个服务是否在篮子中的指标
  2. 按年份以及这些指标的组合进行分组
  3. 按分组变量对服务值求和

    以下是执行此操作的代码:

    # Sum each product's revenue per fiscal year and per combination of
    # products present in the order. Uses across() in place of the deprecated
    # mutate_each() / summarise_each() / funs() / group_by_() helpers.
    df_foo %>%
      # 1. create the indicators of whether each of the products is in the
      #    basket or not (revenue > 0); they are named <product>_In_Basket
      mutate(
        across(Car:Services, function(x) x > 0, .names = "{.col}_In_Basket")
      ) %>%
      # 2. group by the year and the basket service indicators
      group_by(Fiscal.Year, across(ends_with("_In_Basket"))) %>%
      # 3. sum the service values within each group; "drop_last" leaves the
      #    result grouped by every grouping column except the last indicator
      summarise(
        across(Car:Services, function(x) sum(x, na.rm = TRUE)),
        .groups = "drop_last"
      )
    

    这给出了输出:

    Source: local data frame [7 x 7]
    Groups: Fiscal.Year, Car_In_Basket, Tire_In_Basket [?]
    
      Fiscal.Year Car_In_Basket Tire_In_Basket Services_In_Basket   Car  Tire Services
            <dbl>         <lgl>          <lgl>              <lgl> <dbl> <dbl>    <dbl>
    1        2016         FALSE          FALSE               TRUE     0     0        8
    2        2016         FALSE           TRUE              FALSE     0     1        0
    3        2016         FALSE           TRUE               TRUE     0     1        1
    4        2016          TRUE          FALSE               TRUE     5     0        5
    5        2016          TRUE           TRUE              FALSE     1     1        0
    6        2017          TRUE          FALSE               TRUE     2     0        3
    7        2018          TRUE           TRUE              FALSE     1     1        0
    

答案 1 :(得分:0)

这里有一个很好的挑战....

Using your dataset that I called test.  I chose to approach this with matrices.
# Column names found in positions 3 to 8 of `test`
# (presumably the six product columns Car ... Finance — confirm against the data).
# NOTE(review): `names` shadows base::names(); a different identifier would be safer.
names<-colnames(test[3:8])
library(combinat)
# All k-element combinations of those names for k = 1..6, transposed so that
# each row holds one combination.
one<-t(combn(names,1))
two<-t(combn(names,2))
three<-t(combn(names,3))
four<-t(combn(names,4))
five<-t(combn(names,5))
six<-t(combn(names,6))
library(plyr)
# Stack the six matrices into one table; rows with fewer than six names are
# padded with NA (visible in the printed result), and unname() drops dimnames.
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six))
# Print the first and the last three combinations.
head(myset,3); tail(myset,3)

给出以下内容:

     [,1]       [,2] [,3] [,4] [,5] [,6]
[1,] "Car"      NA   NA   NA   NA   NA  
[2,] "Tire"     NA   NA   NA   NA   NA  
[3,] "Services" NA   NA   NA   NA   NA  
      [,1]   [,2]       [,3]        [,4]          [,5]          [,6]     
[61,] "Car"  "Services" "Insurance" "Accessories" "Finance"     NA       
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"     NA       
[63,] "Car"  "Tire"     "Services"  "Insurance"   "Accessories" "Finance"

使用dplyr按年获得总和:

library(dplyr)
# Drop the order id, then aggregate every remaining column per fiscal year.
# NOTE(review): the surrounding text talks about yearly sums, but funs(mean)
# computes per-year means — confirm whether sum or mean is intended.
testsums<- test %>% select(-Order.ID) %>% group_by(Fiscal.Year) %>% summarise_each(funs(mean))
# Print the per-year aggregate table.
testsums
A tibble: 3 × 7
  Fiscal.Year      Car    Tire Services Insurance Accessories Finance
        <dbl>    <dbl>   <dbl>    <dbl>     <dbl>       <dbl>   <dbl>
1        2015 1336.562  800.50     0.00         0    172.0475       0
2        2016    0.500 2174.14     0.75         3    961.0250    3701
3        2017    1.500 1641.09     2.00         0   4654.3250       0

创建一个由1和0组成的矩阵(matrix),用来与这六个变量的年度总和向量相乘。

# Turn the combination table into a 0/1 indicator matrix: a cell becomes 1
# where myset holds a product name and 0 where it holds NA.
# NOTE(review): the names in myset are left-aligned, so column j marks
# "this combination has at least j members" rather than "product j is part
# of this combination" — confirm this is what the later matrix product needs.
mult.matrix<-myset
mult.matrix[!is.na(mult.matrix)]<-1
mult.matrix[is.na(mult.matrix)]<-0
# myset is a character matrix, so the cells are still strings at this point;
# switch the storage mode to numeric.
class(mult.matrix) <- "numeric"
# Print the first and the last three rows.
head(mult.matrix,3);tail(mult.matrix,3)
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    0    0    0    0    0
[2,]    1    0    0    0    0    0
[3,]    1    0    0    0    0    0
      [,1] [,2] [,3] [,4] [,5] [,6]
[61,]    1    1    1    1    1    0
[62,]    1    1    1    1    1    0
[63,]    1    1    1    1    1    1

将年度总和转换为矩阵表示法。将它乘以mult.matrix。将3个新列绑定到原始组合数据集。

# Per-year aggregates as a plain matrix: rows 1-3 and columns 2-7 of testsums
# (the Fiscal.Year column is left out).
year_sums<-unname(as.matrix(testsums[1:3,2:7]))
# Indicator matrix times the transposed aggregates: one result column per
# row of year_sums, one result row per combination.
all_sums<-mult.matrix %*% t(year_sums)
# Rebuild the combination table and append the result columns to it.
# cbind() with a character matrix turns the numbers into strings, as the
# printed result shows.
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six))
myset<-cbind(myset,all_sums)
# Print the first and the last five rows.
head(myset,5); tail(myset,5)
     [,1]          [,2] [,3] [,4] [,5] [,6] [,7]        [,8]  [,9] 
[1,] "Car"         NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[2,] "Tire"        NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[3,] "Services"    NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[4,] "Insurance"   NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
[5,] "Accessories" NA   NA   NA   NA   NA   "1336.5625" "0.5" "1.5"
      [,1]   [,2]       [,3]        [,4]          [,5]          [,6]      [,7]      [,8]               [,9]      
[59,] "Car"  "Tire"     "Services"  "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[60,] "Car"  "Tire"     "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[61,] "Car"  "Services" "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"     NA        "2309.11" "3139.41450115167" "6298.915"
[63,] "Car"  "Tire"     "Services"  "Insurance"   "Accessories" "Finance" "2309.11" "6840.41450115166" "6298.915"

这段代码还可以大幅精简,我这里只是把完整的思考过程展示出来。现在您可以拿到最终的矩阵,将其转换为数据框、重命名列标题等等。