通过减法计算累计总数(running total)

时间:2017-12-06 00:00:10

标签: r if-statement cumsum

我有一个数据集,其中包含加州公立学校的开学日期和关闭日期。问题底部提供了dput()的输出。数据还列出了学校的类型和所在地。我正在尝试创建一个累计总数(running total)列,该列同时考虑学校的关闭情况以及学校类型。

这是我提出的解决方案,这基本上需要我使用ifelse根据条件编码许多不同的1和0:

# Indicator columns (1 = yes, 0 = no) for the four combinations of school type
# (Charter "Y" / "N") and status (a missing ClosedDate means still open).
# Charter schools that are still open
pubschls$open_chart <- as.numeric(
  pubschls$Charter == "Y" & is.na(pubschls$ClosedDate)
)
# Non-charter public schools that are still open
pubschls$open_pub <- as.numeric(
  pubschls$Charter == "N" & is.na(pubschls$ClosedDate)
)
# Charter schools that have a closing date
pubschls$closed_chart <- as.numeric(
  pubschls$Charter == "Y" & !is.na(pubschls$ClosedDate)
)
# Non-charter public schools that have a closing date
pubschls$closed_pub <- as.numeric(
  pubschls$Charter == "N" & !is.na(pubschls$ClosedDate)
)
# Keep only the rows of the district whose NCES id is "0622710"
lausd <- filter(pubschls, NCESDist == "0622710")
# count number open during each year

然后我将这些列两两相减得到净值,再累加以获得累计总数。

# Sum the four indicator columns per opening year. The `by` element is named
# so that aggregate() labels the grouping column `year` (an unnamed list would
# produce `Group.1`, which the later column selection cannot find).
la_schools_count <- aggregate(
  lausd[c("open_chart", "closed_chart", "open_pub", "closed_pub")],
  by = list(year = year(lausd$OpenDate)),
  FUN = sum
)

# NOTE(review): the grouping key is OpenDate, so a closed school is subtracted
# in the year it opened, not in the year it closed -- confirm this is intended.

# Net charters per year: still-open minus closed
la_schools_count$net_chart <-
  la_schools_count$open_chart - la_schools_count$closed_chart
# Net public schools per year: still-open minus closed
la_schools_count$net_pub <-
  la_schools_count$open_pub - la_schools_count$closed_pub
# Running totals; aggregate() returns rows ordered by the grouping value, so
# cumsum() accumulates in chronological order
la_schools_count$cum_chart <- cumsum(la_schools_count$net_chart)
la_schools_count$cum_pub <- cumsum(la_schools_count$net_pub)
# Running total across both school types
la_schools_count$total <-
  la_schools_count$cum_chart + la_schools_count$cum_pub
# Charter share of the running total, in percent (the selection that follows
# expects this column)
la_schools_count$pen_rate <-
  100 * la_schools_count$cum_chart / la_schools_count$total

我的输出如下:

# Keep only the reporting columns, in display order
report_cols <- c("year", "cum_chart", "cum_pub", "pen_rate", "total")
la_schools_count <- la_schools_count[report_cols]
     year cum_chart cum_pub  pen_rate total
1  1952         1       0 100.00000     1
2  1956         1       1  50.00000     2
3  1969         1       2  33.33333     3
4  1980        55     469  10.49618   524
5  1989        55     470  10.47619   525
6  1990        55     470  10.47619   525
7  1991        55     473  10.41667   528
8  1992        55     476  10.35782   531
9  1993        55     477  10.33835   532
10 1994        56     478  10.48689   534
11 1995        57     478  10.65421   535
12 1996        57     479  10.63433   536
13 1997        58     481  10.76067   539
14 1998        59     480  10.94620   539
15 1999        61     480  11.27542   541
16 2000        61     481  11.25461   542
17 2001        62     482  11.39706   544
18 2002        64     484  11.67883   548
19 2003        73     485  13.08244   558
20 2004        83     496  14.33506   579
21 2005        90     524  14.65798   614
22 2006        96     532  15.28662   628
23 2007        90     534  14.42308   624
24 2008        97     539  15.25157   636
25 2009       108     546  16.51376   654
26 2010       124     566  17.97101   690
27 2011       140     580  19.44444   720
28 2012       144     605  19.22563   749
29 2013       162     609  21.01167   771
30 2014       179     611  22.65823   790
31 2015       195     611  24.19355   806
32 2016       203     614  24.84700   817
33 2017       211     619  25.42169   830

我只是想知道这是否可以用更好的方式完成。比如,对所有行按条件使用类似apply的语句?

dput:
# Sample data in dput() form: a 6-row tibble with columns CDSCode (character),
# OpenDate and ClosedDate (class "Date", stored as day counts) and Charter
# (character). All six rows have Charter "Y"; only the second row has a
# non-missing ClosedDate.
structure(list(CDSCode = c("19647330100289", "19647330100297", 
"19647330100669", "19647330100677", "19647330100743", "19647330100750"
), OpenDate = structure(c(12324, 12297, 12240, 12299, 12634, 
12310), class = "Date"), ClosedDate = structure(c(NA, 15176, 
NA, NA, NA, NA), class = "Date"), Charter = c("Y", "Y", "Y", 
"Y", "Y", "Y")), .Names = c("CDSCode", "OpenDate", "ClosedDate", 
"Charter"), row.names = c(NA, -6L), class = c("tbl_df", "tbl", 
"data.frame"))

1 个答案:

答案 0 :(得分:0)

我按照你的代码了解了你在做什么,除了pen_rate。看起来pen_rate是用cum_chart除以total计算出来的。我下载了原始数据集并执行了以下操作。我把数据集命名为foo。在你创建四个组(即open_chart、closed_chart、open_pub和closed_pub)的地方,我把Charter和ClosedDate合并在了一起。我检查了ClosedDate是否为NA,并将逻辑输出转换为数字(1 = 开放,0 = 关闭)。这就是我创建四个组的方式(即open_chart、closed_chart、open_pub和closed_pub)。我想这样可以让你少打一些字。由于日期是字符型的,我使用substr()提取年份。如果你的日期是Date对象,则需要用其他方法。有了年份之后,你可以按年份对数据进行分组,并使用count()计算每种类型学校的数量。这一部分相当于你的aggregate()代码。然后,使用spread()将输出转换为宽格式数据,并完成你在代码中演示的其余计算。最终输出似乎与你在问题中的结果不同,但我的结果与我运行你的代码得到的结果相同。希望这会对你有所帮助。

library(dplyr)
library(tidyr)
library(readxl)

# Read the school list and keep only the columns used below, restricted to the
# district with NCESDist "0622710" and to rows whose Charter flag is not NA.
foo <- read_xls("pubschls.xls") %>%
  select(NCESDist, CDSCode, OpenDate, ClosedDate, Charter) %>%
  filter(NCESDist == "0622710", !is.na(Charter))

# Tally schools per opening year and group, then derive net counts, running
# totals and the charter share. The result is printed, not assigned.
foo %>%
  mutate(
    # Group label "<Charter>_<status>", where status is 1 when ClosedDate is
    # NA (still open) and 0 otherwise; this yields Y_1, Y_0, N_1 and N_0.
    group = paste(Charter, as.numeric(is.na(ClosedDate)), sep = "_"),
    # Year = last four characters of OpenDate.
    # NOTE(review): assumes OpenDate is a character string ending in a
    # four-digit year -- confirm against the spreadsheet.
    year = substr(
      OpenDate,
      start = nchar(OpenDate) - 3,
      stop = nchar(OpenDate)
    )
  ) %>%
  # One row per (year, group) with the number of schools in column n
  count(year, group) %>%
  # Wide format: one column per group, 0 where a year has no such schools
  spread(key = group, value = n, fill = 0) %>%
  mutate(
    net_chart = Y_1 - Y_0,
    net_pub = N_1 - N_0,
    cum_chart = cumsum(net_chart),
    cum_pub = cumsum(net_pub),
    total = cum_chart + cum_pub,
    # Charter share of the running total, as a fraction (not a percentage)
    pen_rate = cum_chart / total
  )

# A part of the outcome
#    year N_0 N_1 Y_0 Y_1 net_chart net_pub cum_chart cum_pub total   pen_rate
#1   1866   0   1   0   0         0       1         0       1     1 0.00000000
#2   1873   0   1   0   0         0       1         0       2     2 0.00000000
#3   1878   0   1   0   0         0       1         0       3     3 0.00000000
#4   1881   0   1   0   0         0       1         0       4     4 0.00000000
#5   1882   0   2   0   0         0       2         0       6     6 0.00000000
#110 2007   0   2  15   9        -6       2        87     393   480 0.18125000
#111 2008   2   8   9  15         6       6        93     399   492 0.18902439
#112 2009   1   9   4  15        11       8       104     407   511 0.20352250
#113 2010   5  26   5  21        16      21       120     428   548 0.21897810
#114 2011   2  16   2  18        16      14       136     442   578 0.23529412
#115 2012   2  27   3   7         4      25       140     467   607 0.23064250
#116 2013   1   5   1  19        18       4       158     471   629 0.25119237
#117 2014   1   3   1  18        17       2       175     473   648 0.27006173
#118 2015   0   0   2  18        16       0       191     473   664 0.28765060
#119 2016   0   3   0   8         8       3       199     476   675 0.29481481
#120 2017   0   5   0   9         9       5       208     481   689 0.30188679