R的朴素贝叶斯分类器

时间:2013-11-20 09:12:00

标签: r

我正在尝试用下面的代码运行朴素贝叶斯(NaiveBayes)分类器:

# Fit a naive Bayes classifier (e1071) on the first 1000 rows of a CSV file
# and predict the class for the following 500 rows.
library(e1071)

filename <- "file.csv"
data <- read.csv(file = filename, sep = ",")

# Positional train/test split; drop = FALSE keeps each piece a data frame.
trainData <- data[seq_len(1000), , drop = FALSE]
testData <- data[1001:1500, , drop = FALSE]

# Columns 1-49 are passed as predictors, column 50 as the class vector.
model <- naiveBayes(x = trainData[, 1:49], y = trainData[, 50])
predicted <- predict(model, newdata = testData[, -50])

我收到此错误:

Error in apply(log(sapply(seq_along(attribs), function(v) { : 
  dim(X) must have a positive length

我检查过,没有发现任何缺失的属性。可能是什么问题?

应 Dr. Mike 的要求,下面给出一些样本数据。请注意,数据包含 50 列(在我最初的帖子中,我使用了 30 列,因为我想把问题简化)。

structure(list(X = 1:10, X1 = c(12L, 6L, 6L, 11L, 17L, 7L, 5L, 
3L, 4L, 7L), X2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "A", class = "factor"), X3 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "us", class = "factor"), 
    X4 = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, 
    TRUE), X5 = c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, 
    FALSE, FALSE, TRUE), X6 = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE, FALSE, FALSE, FALSE), X7 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "DAY", class = "factor"), 
    X8 = c(159L, 47L, 163L, 51L, 171L, 145L, 31L, 49L, 49L, 154L
    ), X9 = structure(c(2L, 5L, 2L, 4L, 3L, 2L, 1L, 5L, 1L, 2L
    ), .Label = c("100-150", "150-200", "50-100", "650-700", 
    "unknown"), class = "factor"), X10 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X11 = c(0L, 1L, 0L, 2L, 0L, 0L, 0L, 
    1L, 0L, 0L), X12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L), X13 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), X14 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X15 = c(1L, 1L, 2L, 
    2L, 0L, 1L, 0L, 2L, 2L, 2L), X16 = c(2L, 1L, 2L, 1L, 0L, 
    2L, 1L, 2L, 1L, 2L), X17 = c(6L, 4L, 6L, 7L, 0L, 4L, 3L, 
    4L, 4L, 8L), X18 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L), X19 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X20 = c(1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X21 = c(7L, 6L, 7L, 
    3L, 0L, 6L, 6L, 2L, 7L, 7L), X22 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X23 = c(2L, 2L, 2L, 2L, 0L, 2L, 2L, 
    2L, 2L, 2L), X24 = c(0L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 
    1L), X25 = c(2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), X26 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X27 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X28 = c(1L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 1L, 0L), X29 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X30 = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 2L, 
    0L), X31 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X32 = c(2L, 
    0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), X33 = c(0L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X34 = c(1L, 1L, 2L, 3L, 0L, 
    2L, 0L, 2L, 2L, 1L), X35 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    1L, 2L, 0L), X36 = c(3L, 3L, 5L, 3L, 0L, 4L, 4L, 7L, 4L, 
    2L), X37 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X38 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X39 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X40 = c(3L, 1L, 4L, 3L, 0L, 
    2L, 0L, 3L, 1L, 0L), X41 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X42 = c(3L, 6L, 3L, 2L, 0L, 2L, 0L, 3L, 2L, 
    3L), X43 = c(1L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L), X44 = c(1L, 
    2L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), X45 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X46 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X47 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X48 = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 2L, 1L, 
    0L), X49 = c(2L, 0L, 1L, 1L, 0L, 0L, 2L, 3L, 0L, 4L), target = c(FALSE, 
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
    )), .Names = c("X", "X1", "X2", "X3", "X4", "X5", "X6", "X7", 
"X8", "X9", "X10", "X11", "X12", "X13", "X14", "X15", "X16", 
"X17", "X18", "X19", "X20", "X21", "X22", "X23", "X24", "X25", 
"X26", "X27", "X28", "X29", "X30", "X31", "X32", "X33", "X34", 
"X35", "X36", "X37", "X38", "X39", "X40", "X41", "X42", "X43", 
"X44", "X45", "X46", "X47", "X48", "X49", "target"), row.names = c(NA, 
10L), class = "data.frame")

根据 Dr. Mike 的建议,我把数据中的分类变量转换成了因子(factor)。但我仍然得到完全相同的错误。现在的代码是:

# Re-read the data, convert every non-numeric column to a factor, then fit a
# naive Bayes classifier on rows 1-1000 and predict rows 1001-1500.
data <- read.csv(filename, sep = ",")

# A logical mask with list-style indexing replaces `data[, -which(...)]`:
#   * `-which(...)` is an empty selection when no column is numeric, so
#     nothing would be converted;
#   * `data[, j]` collapses to a plain vector when exactly one column matches,
#     so lapply() would iterate over its elements instead of over columns.
# vapply() guarantees a logical vector regardless of the input.
non_numeric <- !vapply(data, is.numeric, logical(1))
if (any(non_numeric)) {
  data[non_numeric] <- lapply(data[non_numeric], as.factor)
}

# Positional train/test split; drop = FALSE keeps each piece a data frame.
trainData <- data[1:1000, , drop = FALSE]
testData <- data[1001:1500, , drop = FALSE]

# Columns 1-49 are passed as predictors, column 50 as the class vector.
model <- naiveBayes(trainData[, 1:49], trainData[, 50])
predicted <- predict(model, testData[, -50])

3 个答案:

答案 0 :(得分:3)

您应该确保分类变量使用因子(factor)类型。下面是一个使用您的数据的小例子。

# Small worked example; assumes `data` is the 10-row sample data frame posted
# in the question and that e1071 is already loaded.
# Overwrite the response so both classes are present: rows 1-5 TRUE, 6-10 FALSE.
data$target[1:5]<-TRUE
data$target[6:10]<-FALSE
# Put the rows in random order (result differs from run to run; no seed is set).
data<-data[sample(1:10),]
# Make sure the target is a factor
data$target<-as.factor(data$target)
# Work with a subset since we only have 10 data points.
# Keeps columns 2, 49, 50 and 51; in the sample data column 51 is `target`.
data2<-data[,c(2,49:50,51)]
# Fit on the first five shuffled rows, using every other column as predictor.
model<-naiveBayes(target~., data=data2[1:5,])
# Predict the class of the remaining five rows.
predict(model, data2[6:10,])
# Cross-tabulate the predictions against column 5 of the full data frame.
# NOTE(review): in the sample data column 5 is X4, not `target` (column 51) --
# confirm whether data2$target[6:10] was intended here.
table(predict(model, data2[6:10,]), data[6:10, 5])
#        FALSE TRUE
#  FALSE     0    0
#  TRUE      0    5

因此,请确保将分类变量和响应变量都转换为因子,并且只使用变量的一个子集。

答案 1 :(得分:2)

发现问题并记录解决方案,以防有人像我一样搜索此错误。
在这种情况下,问题是 target(目标)属性只包含标记为 FALSE 的样本。由于没有标记为 TRUE 的样本,分类器无法工作。我仍然不明白为什么得到的是这个特定的错误,而不是更易读的提示,例如:“无法分类,没有标记为 TRUE 的样本”。但至少我解决了它。

非常感谢 Dr. Mike 为我指明了正确的方向(在这个例子中就是:去检查数据,而不是代码)。

答案 2 :(得分:0)

我遇到过类似的问题。经过几个小时的研究后,我把 TRUE / FALSE 改成了字符值 y / n。似乎 R 分不清 TRUE / FALSE 应该当作分类变量还是数值变量。