我有一些股票数据(示例如下),并在其中随机制造了一些缺失值。我想创建一个函数来检查 data.frame 的数据结构:如果数据是连续型的,则计算缺失值的百分比;如果缺失值的百分比大于 40%,则省略该行;如果小于 40%,则用均值填补缺失值。
如果数据是分类型的,同样计算缺失值的百分比;如果缺失值的百分比大于 40%,则省略该行;如果小于 40%,则用众数(出现次数最多的类别)填补缺失值。
提前致谢。
# Example data: three companies with three months each, a categorical sales
# level, and six numeric quarter columns drawn uniformly from [0, 100].
comp <- rep(c("F", "S", "T"), each = 3)
month <- c(
  "Jan", "Feb", "March", "Apr", "May", "June", "July", "Aug", "Sept"
)
Sales <- c(
  "Low", "Medium", "High", "High", "Low", "High", "Medium", "Low", "Low"
)

# The six draws stay in their original order so the random stream is consumed
# exactly as before.
bq1 <- runif(9, 0, 100)
bq2 <- runif(9, 0, 100)
bq3 <- runif(9, 0, 100)
bq4 <- runif(9, 0, 100)
bq5 <- runif(9, 0, 100)
bq6 <- runif(9, 0, 100)

df <- data.frame(
  Comp = comp,
  Month = month,
  Sales = Sales,
  Qtr1 = bq1,
  Qtr2 = bq2,
  Qtr3 = bq3,
  Qtr4 = bq4,
  Qtr5 = bq5,
  Qtr6 = bq6
)

# Blank out the same 14 cells as before, grouped by column: each right-hand
# side lists the row numbers that become NA in that column.
is.na(df$Qtr1) <- c(3, 5, 9)
is.na(df$Qtr2) <- c(1, 3, 5, 7, 9)
is.na(df$Qtr3) <- 9
is.na(df$Qtr4) <- c(3, 5)
is.na(df$Qtr5) <- 9
is.na(df$Qtr6) <- c(3, 5)
答案 0 :(得分:1)
希望这有帮助!
library(dplyr)
# Work on a copy in which factor columns are plain character vectors, so that
# categorical columns are picked up by the is.character() test during
# imputation. Non-factor columns pass through unchanged.
df1 <- df %>%
  mutate(across(where(is.factor), as.character))
#' Compute the fill-in value for the missing entries of one column
#'
#' Character and factor columns are treated as categorical and get their most
#' frequent observed value (ties go to the first value in `table()` order).
#' Every other column gets the mean of its observed values.
#'
#' @param x A vector (one data-frame column).
#' @param threshold Maximum percentage of missing values (0-100) for which a
#'   fill-in value is still computed. Defaults to 40.
#' @return A length-one value: `NA` when more than `threshold` percent of `x`
#'   is missing (or when `x` has no observed values at all), otherwise the
#'   mode as a character string for categorical input, or the mean.
impute <- function(x, threshold = 40) {
  observed <- x[!is.na(x)]

  # Nothing to base an imputation on (empty or all-NA column).
  if (length(observed) == 0L) {
    return(NA)
  }

  missing_perc <- sum(is.na(x)) / length(x) * 100
  if (missing_perc > threshold) {
    return(NA)
  }

  if (is.character(x) || is.factor(x)) {
    # which.max() returns the first maximum, i.e. the first level in table
    # order on ties.
    counts <- table(observed)
    names(counts)[[which.max(counts)]]
  } else {
    mean(observed)
  }
}
# One fill-in value per column. lapply() keeps each value in its own type;
# sapply() would simplify the mix of character modes and numeric means into a
# single character vector, and the replace() below would then silently turn
# every numeric column into character.
impute_val <- lapply(df1, impute)
#impute missing values
# Columns whose fill-in value is NA (too many missing values) keep their NAs.
df1[] <- Map(function(x, y) replace(x, is.na(x), y), df1, impute_val)
#drop rows that still contain an NA, i.e. rows with a missing value in a
#column whose missing percentage was > 40
df1 <- na.omit(df1)
#final data
df1
输出为:
Comp Month Sales Qtr1 Qtr2 Qtr3 Qtr4 Qtr5
2 F Feb Medium 65.4017299879342 66.0814035916701 13.8528823154047 21.5696093859151 18.2194353546947
4 S Apr High 89.403684460558 74.2279292317107 55.5751067353413 51.869766949676 9.31410894263536
6 S June High 11.7533272597939 11.6908136522397 12.5517533393577 95.4095394117758 36.061190161854
8 T Aug Low 7.48507694806904 77.5027731899172 42.0926807913929 11.0406906111166 17.137353355065
Qtr6
2 82.1378237567842
4 27.7001850772649
6 88.5877252323553
8 23.5045042354614
示例数据:
# Sample data as an R literal (appears to be dput() output): a 9-row
# data.frame in which Comp, Month and Sales are factors and Qtr1..Qtr6 are
# numeric columns containing NA values. The expression is not assigned here;
# bind it to a name (e.g. df) to use it.
structure(list(Comp = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L), .Label = c("F", "S", "T"), class = "factor"), Month = structure(c(4L,
3L, 7L, 1L, 8L, 6L, 5L, 2L, 9L), .Label = c("Apr", "Aug", "Feb",
"Jan", "July", "June", "March", "May", "Sept"), class = "factor"),
Sales = structure(c(2L, 3L, 1L, 1L, 2L, 1L, 3L, 2L, 2L), .Label = c("High",
"Low", "Medium"), class = "factor"), Qtr1 = c(43.4887288603932,
65.4017299879342, NA, 89.403684460558, NA, 11.7533272597939,
50.5520776147023, 7.48507694806904, NA), Qtr2 = c(NA, 66.0814035916701,
NA, 74.2279292317107, NA, 11.6908136522397, NA, 77.5027731899172,
NA), Qtr3 = c(5.68129089660943, 13.8528823154047, 35.6186878867447,
55.5751067353413, 6.98710139840841, 12.5517533393577, 8.91167896334082,
42.0926807913929, NA), Qtr4 = c(22.5347936619073, 21.5696093859151,
NA, 51.869766949676, NA, 95.4095394117758, 16.6109931422397,
11.0406906111166, 56.1983718769625), Qtr5 = c(5.67050215322524,
18.2194353546947, 88.5992815019563, 9.31410894263536, 77.7505977777764,
36.061190161854, 51.1230558156967, 17.137353355065, NA),
Qtr6 = c(27.9433359391987, 82.1378237567842, NA, 27.7001850772649,
NA, 88.5877252323553, 50.3849557833746, 23.5045042354614,
74.2521224310622)), .Names = c("Comp", "Month", "Sales",
"Qtr1", "Qtr2", "Qtr3", "Qtr4", "Qtr5", "Qtr6"), row.names = c(NA,
-9L), class = "data.frame")