Fitting and improving a sentiment classifier

Date: 2017-01-24 17:03:37

Tags: r machine-learning tree svm sentiment-analysis

Hi, I am trying to build a classifier that has to distinguish tweets with negative sentiment (manually labeled -1) from tweets with positive sentiment (1). I am fitting several supervised models to my sample data (about 1800 tweets), with a training set of 70% of the sample and a test set made of the remainder. The problem is that the results are unsatisfactory: I get high test errors and low AUC values. Where is the problem? Is there a way to improve these results? My sample is available here: https://drive.google.com/open?id=0B9DO29WohGN6eHM3UTM1OUdWVnM. Below I report my analysis step by step, so it is fully reproducible; the main results follow.

##reading csv
tweets = read.csv2("Finale2.csv",stringsAsFactors = FALSE)
str(tweets)
tweets$Negative= as.factor(tweets$Sent<=-1)
table(tweets$Negative)   
###pre-processing steps
install.packages("tm")
install.packages("SnowballC")
library(tm)
library(SnowballC)
###create a corpus of words and edit it
corpus <- Corpus(VectorSource(tweets$Tweet))
corpus = tm_map(corpus, content_transformer(tolower))   ##wrap base functions in content_transformer() so recent versions of tm keep the corpus structure; the old PlainTextDocument workaround breaks DocumentTermMatrix
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeNumbers)
corpus=tm_map(corpus,removeWords, stopwords("english")) 
corpus = tm_map(corpus, removeWords, c("RT", "rt", "https"))   ##removeWords matches whole words literally; "https*" is not a wildcard
corpus=tm_map(corpus,stripWhitespace) 
##stemming
corpus <- tm_map(corpus, stemDocument)
##creating a matrix of word frequencies
DTM <- DocumentTermMatrix(corpus)
DTM
##remove sparse terms
sparse = removeSparseTerms(DTM, 0.995)
###convert the sparse matrix into a dataframe 
tweetsSparse <- as.data.frame(as.matrix(sparse))
names(tweetsSparse)
###make all variable names R-friendly
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))
##add the dependent variable
tweetsSparse$Negative <- tweets$Negative
###loading package caTools
install.packages("caTools")
library(caTools)
###splitting training set and test set
set.seed(234)
splitNegative <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse <- subset(tweetsSparse, splitNegative == TRUE)
testSparse <- subset(tweetsSparse, splitNegative == FALSE)
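Before fitting any model, a majority-class baseline gives a useful reference point (a quick sketch using the objects just created): it is the test error that every model below should beat.

##majority-class baseline: always predict the most common class of the training set
baseline.class = names(which.max(table(trainSparse$Negative)))
baseline.error = mean(testSparse$Negative != baseline.class)
baseline.error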

Models

1-LINEAR DISCRIMINANT ANALYSIS
library(MASS)
lda.fit = lda(Negative ~., trainSparse)
lda.fit
###prediction on the test set
lda.pred=predict(lda.fit,testSparse)
table(testSparse$Negative,lda.pred$class)

       FALSE TRUE
  FALSE   134  119
  TRUE    123  176
testerror = (119+123)/nrow(testSparse)   ##the misclassified tweets are the off-diagonal cells, not the first row
testerror

[1] 0.4384058

####ROC CURVE
install.packages("ROCR")
library(ROCR)
pred <- prediction(lda.pred$posterior[,2], testSparse$Negative) 
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
abline(0, 1, lty = 2)

(plot: ROC curve for the LDA model)

as.numeric(performance(pred, "auc")@y.values)

[1] 0.598411

2-LASSO MODEL

install.packages("glmnet")
library(glmnet)
##create the x matrix and the y vector for the training and the test set
x.train = model.matrix(Negative ~ . -1, data = trainSparse)
y.train=trainSparse$Negative
x.test = model.matrix(Negative ~ . -1, data = testSparse)
y.test=testSparse$Negative
##lasso logistic model
sent.lasso = glmnet(x.train, y.train, family = "binomial")
plot(sent.lasso, xvar = "lambda", label = TRUE)
sent.lasso
##select the best lambda with 10-CV:
cv.lasso=cv.glmnet(x.train,y.train,family="binomial")
plot(cv.lasso)
coef(cv.lasso)
###using the best model to predict on the test set 
pred.lasso = predict(cv.lasso, x.test, s = cv.lasso$lambda.1se, type = "class")
table(testSparse$Negative, pred.lasso)
   pred.lasso
        FALSE TRUE
  FALSE    40  213
  TRUE     18  281
lassoerror=(213+18)/nrow(testSparse)
lassoerror

[1] 0.4184783

###ROC lasso
prob.lasso = predict(cv.lasso, x.test, s = cv.lasso$lambda.1se, type = "response")
predob = prediction(prob.lasso, testSparse$Negative)
perf = performance(predob, "tpr", "fpr")
par(mfrow = c(1, 1))
plot(perf, main = "LASSO Logistic Regression")
plot(perf, colorize = TRUE)
plot(perf, colorize = TRUE, print.cutoffs.at = seq(0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
abline(0, 1, lty = 2)

(plot: ROC curve for the LASSO logistic regression)

as.numeric(performance(predob, "auc")@y.values)

[1] 0.6266673

3-CLASSIFICATION TREE

install.packages("tree")
library(tree)
sent.tree = tree(Negative ~ ., data = trainSparse)
summary(sent.tree)
plot(sent.tree)
text(sent.tree,pretty=0)

(plot: the fitted classification tree)

Here I have doubts about how to interpret the tree: the number of terminal nodes seems very poor (a pruning check is sketched below).
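To check how large a tree the data actually supports, one option is cost-complexity pruning with cross-validation (a sketch using tree.control, cv.tree and prune.misclass from the same tree package; not part of my original run): first grow a deeper tree by relaxing the default stopping rule, then prune it back to the size chosen by CV.

##grow a larger tree by lowering the default mindev stopping threshold
big.tree = tree(Negative ~ ., data = trainSparse,
                control = tree.control(nobs = nrow(trainSparse), mindev = 0.001))
##10-fold CV on the misclassification rate to pick the tree size
set.seed(3)
cv.sent = cv.tree(big.tree, FUN = prune.misclass)
plot(cv.sent$size, cv.sent$dev, type = "b")
best.size = cv.sent$size[which.min(cv.sent$dev)]
pruned.tree = prune.misclass(big.tree, best = best.size)
plot(pruned.tree)
text(pruned.tree, pretty = 0)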

set.seed(2)
tree.pred = predict(sent.tree, testSparse, type = "class")
table(testSparse$Negative, tree.pred)

   tree.pred
        FALSE TRUE
  FALSE    14  239
  TRUE      5  294
treerror = (239+5)/nrow(testSparse)
treerror

[1] 0.442029

##Roc
tree.pred = predict(sent.tree, testSparse, type = "vector") # predict probabilities    
library(ROCR)
pred <- prediction(tree.pred[,2], testSparse$Negative) 
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)

(plot: ROC curve for the classification tree)

as.numeric(performance(pred, "auc")@y.values)

[1] 0.5195381

Very bad!!

4-RANDOM FOREST

install.packages("randomForest")
library(randomForest)
set.seed(345)
sent.rf = randomForest(Negative ~ ., data = trainSparse)
sent.rf
##plot the error rate
plot(sent.rf$err.rate[,1])
###variable importance plot
varImpPlot(sent.rf)
###performance on the test set
rf.pred = predict(sent.rf, testSparse, type = "class")
table(testSparse$Negative, rf.pred)

  rf.pred
        FALSE TRUE
  FALSE   162   91
  TRUE    119  180

rferror=(119+91)/nrow(testSparse)
rferror

[1] 0.3804348

In this case I was not able to compute the ROC curve, so if anyone knows how to do it, please post the code.

5-SUPPORT VECTOR MACHINES

###Support vector Classifier
install.packages("e1071")
library(e1071)
svmfit = svm(as.factor(Negative) ~ ., data = trainSparse, kernel = "linear", cost = 1)
names(svmfit)
summary(svmfit)
svmfit$index
##selecting best SVC with 10 CV
set.seed(1)
tune.out=tune(svm ,Negative~.,data=trainSparse, kernel ="linear",
              ranges=list(cost=c(0.1,1,10,100,1000),probability = TRUE)) 

In this case I get a warning: maximum number of iterations reached. If anyone knows how to solve this, please help me. The same problem occurs with the polynomial kernel.
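A possible workaround (a sketch I have not tested on this data: it swaps in the LiblineaR package, which implements linear SVMs designed for high-dimensional inputs like document-term matrices and tends not to hit libsvm's iteration limit):

install.packages("LiblineaR")
library(LiblineaR)
##type = 2: L2-regularized L2-loss support vector classification
##reuses the x.train/y.train/x.test objects built in the LASSO section
lin.svm = LiblineaR(data = x.train, target = y.train, type = 2, cost = 1)
lin.pred = predict(lin.svm, x.test)
table(y.test, lin.pred$predictions)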

Radial kernel

tune.out=tune(svm ,Negative~.,data=trainSparse, kernel ="radial",
              ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4), 
                          probability = TRUE)) 
summary(tune.out)
best=tune.out$best.model
###prediction on the test set
svm.pred = predict(best, testSparse, type = "class")
table(testSparse$Negative,svm.pred)

 svm.pred
        FALSE TRUE
  FALSE    30  223
  TRUE      5  294

radialerror = (223+5)/nrow(testSparse)
radialerror

[1] 0.4130435

###Roc curve
yhat.opt = predict(best,testSparse,probability = TRUE)
pred <- prediction(attributes(yhat.opt)$probabilities[,2], testSparse$Negative) 
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)

(plot: ROC curve for the radial-kernel SVM)

as.numeric(performance(pred, "auc")@y.values)

[1] 0.5908827

Well, I hope this can be a useful example for someone and that the question won't get put on hold. If anyone can help me improve these results or solve one of these problems, please comment below.
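One idea I have not tried yet (a sketch assuming the preprocessed corpus object built above; weightTfIdf ships with tm) is TF-IDF weighting instead of raw term counts, which often helps linear classifiers on short texts:

##rebuild the document-term matrix with TF-IDF weights instead of counts
DTMtfidf = DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
sparseTfidf = removeSparseTerms(DTMtfidf, 0.995)
tweetsTfidf = as.data.frame(as.matrix(sparseTfidf))
colnames(tweetsTfidf) = make.names(colnames(tweetsTfidf))
tweetsTfidf$Negative = tweets$Negative
##then repeat the same train/test split and model fits on tweetsTfidf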

1 Answer:

Answer 0 (score: 0):

For the Random Forest ROC curve, try this:

rf.pred = predict(sent.rf, testSparse, type = "prob")   ##returns class probabilities, one column per class
pred <- prediction(rf.pred[,2], testSparse$Negative) 
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
abline(0, 1, lty = 2)
as.numeric(performance(pred, "auc")@y.values)

(plot: ROC curve for the random forest)
