检查数据结构并插补缺失值

时间:2018-04-26 07:09:11

标签: r dataframe

我有一些股票数据(如下所示),并在其中人为设置了一些缺失值。我想创建一个函数来检查 data.frame 的数据结构:如果数据是连续型的,则计算缺失值的百分比;如果缺失值的百分比大于40%,则删除该行;如果小于40%,则用其平均值插补缺失值。

如果数据是分类型的,则计算缺失值的百分比;如果缺失值的百分比大于40%,则删除该行;如果小于40%,则用该行的众数进行插补。

提前致谢。

# Label columns for the nine example records: company code, month name and
# sales level.
comp <- rep(c("F", "S", "T"), each = 3)
month <- c(
  "Jan", "Feb", "March",
  "Apr", "May", "June",
  "July", "Aug", "Sept"
)
Sales <- c(
  "Low", "Medium", "High",
  "High", "Low", "High",
  "Medium", "Low", "Low"
)

# Six columns of nine uniform random draws on [0, 100]; the draws are made in
# the order bq1, bq2, ..., bq6.
bq1 <- runif(9, 0, 100)
bq2 <- runif(9, 0, 100)
bq3 <- runif(9, 0, 100)
bq4 <- runif(9, 0, 100)
bq5 <- runif(9, 0, 100)
bq6 <- runif(9, 0, 100)

# Assemble the example data frame (9 rows x 9 columns).
df <- data.frame(
  Comp = comp,
  Month = month,
  Sales = Sales,
  Qtr1 = bq1,
  Qtr2 = bq2,
  Qtr3 = bq3,
  Qtr4 = bq4,
  Qtr5 = bq5,
  Qtr6 = bq6
)

# Blank out selected cells. Column positions 4..9 are Qtr1..Qtr6.
# Rows 3 and 5 lose Qtr1, Qtr2, Qtr4 and Qtr6.
df[c(3, 5), c(4, 5, 7, 9)] <- NA
# Row 9 loses Qtr1, Qtr2, Qtr3 and Qtr5.
df[9, c(4, 5, 6, 8)] <- NA
# Rows 1 and 7 lose Qtr2 only.
df[c(1, 7), 5] <- NA

1 个答案:

答案 0 :(得分:1)

希望这有帮助!

library(dplyr)
# Work on a copy of df in which every factor column is converted to character
# (impute() below distinguishes categorical columns with is.character()).
df1 <- df %>%
  mutate(across(where(is.factor), as.character))

# Imputation function: compute the single replacement value for one column.
#
# Arguments:
#   x         - a vector (one data-frame column), possibly containing NA.
#   threshold - highest tolerated percentage of missing values (default 40).
#
# Returns a length-one value:
#   * NA when x is empty or more than `threshold` percent of x is missing;
#   * for character or factor input, the most frequent non-missing value as a
#     string (ties go to the value that comes first in table() order);
#   * otherwise the mean of the non-missing values.
impute <- function(x, threshold = 40) {
  # An empty column has no defined missing percentage; treat it as
  # "nothing to impute" instead of comparing NaN against the threshold.
  if (length(x) == 0L) {
    return(NA)
  }

  missing_perc <- sum(is.na(x)) / length(x) * 100
  if (missing_perc > threshold) {
    return(NA)
  }

  if (is.character(x) || is.factor(x)) {
    # table() leaves NA out by default, so only observed values are counted.
    counts <- table(x)
    names(counts)[which.max(counts)]
  } else {
    mean(x, na.rm = TRUE)
  }
}
# One replacement value per column, kept in a list. lapply() is used instead of
# sapply() because sapply() would simplify the mixed character/numeric results
# into a single character vector, and replacing NAs with those strings would
# silently turn every numeric column into character.
impute_val <- lapply(df1, impute)

#impute missing values
df1[] <- Map(function(x, y) replace(x, is.na(x), y), df1, impute_val)
#drop rows where column has missing percentage > 40
# (such columns received NA as their replacement value, so their NAs remain)
df1 <- na.omit(df1)

#final data
df1

输出为:

  Comp Month  Sales             Qtr1             Qtr2             Qtr3             Qtr4             Qtr5
2    F   Feb Medium 65.4017299879342 66.0814035916701 13.8528823154047 21.5696093859151 18.2194353546947
4    S   Apr   High  89.403684460558 74.2279292317107 55.5751067353413  51.869766949676 9.31410894263536
6    S  June   High 11.7533272597939 11.6908136522397 12.5517533393577 95.4095394117758  36.061190161854
8    T   Aug    Low 7.48507694806904 77.5027731899172 42.0926807913929 11.0406906111166  17.137353355065
              Qtr6
2 82.1378237567842
4 27.7001850772649
6 88.5877252323553
8 23.5045042354614

示例数据:

# Sample data written as a literal data.frame with 9 rows: factor columns
# Comp, Month and Sales, plus numeric columns Qtr1..Qtr6, all of which contain
# at least one NA.
structure(list(Comp = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L, 3L), .Label = c("F", "S", "T"), class = "factor"), Month = structure(c(4L, 
3L, 7L, 1L, 8L, 6L, 5L, 2L, 9L), .Label = c("Apr", "Aug", "Feb", 
"Jan", "July", "June", "March", "May", "Sept"), class = "factor"), 
    Sales = structure(c(2L, 3L, 1L, 1L, 2L, 1L, 3L, 2L, 2L), .Label = c("High", 
    "Low", "Medium"), class = "factor"), Qtr1 = c(43.4887288603932, 
    65.4017299879342, NA, 89.403684460558, NA, 11.7533272597939, 
    50.5520776147023, 7.48507694806904, NA), Qtr2 = c(NA, 66.0814035916701, 
    NA, 74.2279292317107, NA, 11.6908136522397, NA, 77.5027731899172, 
    NA), Qtr3 = c(5.68129089660943, 13.8528823154047, 35.6186878867447, 
    55.5751067353413, 6.98710139840841, 12.5517533393577, 8.91167896334082, 
    42.0926807913929, NA), Qtr4 = c(22.5347936619073, 21.5696093859151, 
    NA, 51.869766949676, NA, 95.4095394117758, 16.6109931422397, 
    11.0406906111166, 56.1983718769625), Qtr5 = c(5.67050215322524, 
    18.2194353546947, 88.5992815019563, 9.31410894263536, 77.7505977777764, 
    36.061190161854, 51.1230558156967, 17.137353355065, NA), 
    Qtr6 = c(27.9433359391987, 82.1378237567842, NA, 27.7001850772649, 
    NA, 88.5877252323553, 50.3849557833746, 23.5045042354614, 
    74.2521224310622)), .Names = c("Comp", "Month", "Sales", 
"Qtr1", "Qtr2", "Qtr3", "Qtr4", "Qtr5", "Qtr6"), row.names = c(NA, 
-9L), class = "data.frame")