当相关的unigram的bigram非零时,如何将unigram tfidf等同为0?

时间:2014-11-06 13:48:48

标签: python sentiment-analysis n-gram

我正在使用python与scikit-learn和nltk进行电影评论的情绪分析。当与某些unigram相关的bigram/trigram特征非零时,我想把这些具有相反极性的unigram对应的特征值置为0。

例如:

movie is not bad

则特征向量为['movie' 'is' 'not' 'bad' 'movie is' 'is not' 'not bad']=[3 3 1 1 4 2 4]

但我想改为[3 3 0 0 4 2 4]

代码:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

import math
#######################Reading Training Review Phrases and Sentiments###################################

# Read the training phrases: each line of sentences.txt is "<phrase>\t<label>".
# FIX: the original called rstrip('\n').split("\t") twice per line (once per
# field); split once and index the resulting list.
train_list = []
train_sentiment = []
with open('sentences.txt') as f:
    content = f.readlines()
for sentence in content:
    fields = sentence.rstrip('\n').split("\t")
    train_list.append(fields[0])
    train_sentiment.append(fields[1])

#######################Number of phrases in each class###################################

def _read_phrase_file(path):
    """Return (lines, line_count) for *path* -- one phrase per line."""
    with open(path, 'r') as fh:
        lines = fh.readlines()
    return lines, len(lines)

# FIX: the original initialised all five *_phrases names to ONE shared list
# object (`a = b = ... = []`); harmless only because every name was rebound
# below, but a classic aliasing trap.  The five copy-pasted `with open`
# blocks collapse into one helper.
ex_pos_phrases, ex_pos = _read_phrase_file('ex_pos.txt')
pos_phrases, pos = _read_phrase_file('pos.txt')
neu_phrases, neu = _read_phrase_file('neu.txt')
neg_phrases, neg = _read_phrase_file('neg.txt')
ex_neg_phrases, ex_neg = _read_phrase_file('ex_neg.txt')

# Same comma-separated output as the original string concatenation.
print(",".join(str(v) for v in (ex_neg, neg, neu, pos, ex_pos)))

####################### Getting unique Words ###################################

# Fit a TF-IDF vectorizer on the training phrases and grab the vocabulary.
# FIX: `TfidfVectorizer(input=train_list)` misuses the `input` parameter --
# it selects how documents are interpreted ('content', 'filename' or 'file'),
# not the corpus itself; the documents belong in fit_transform() only.
# Also removed the dead `unique_words = []` initialiser.
model = TfidfVectorizer()
train_tfidf = model.fit_transform(train_list)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); kept as-is for the 2014-era API.
unique_words = model.get_feature_names()

print("##### Word sentiment matrix ####")
########################## Word sentiment matrix ########################################
# word_sentiment[i][j] = number of reviews in sentiment class j that contain
# unique_words[i] at least once (the original `break` after the first match
# makes this a document frequency, not a term count).
# Column order matches the original assignments:
#   0 = ex_neg, 1 = neg, 2 = neu, 3 = pos, 4 = ex_pos
#
# FIX: the original repeated the same scan loop five times and rescanned
# every review word-by-word for every vocabulary word (O(W*R*L)).
# Pre-tokenising each review into a set makes each membership test O(1)
# and collapses the five copies into one comprehension.
_class_phrase_lists = [ex_neg_phrases, neg_phrases, neu_phrases,
                       pos_phrases, ex_pos_phrases]
_class_word_sets = [
    [set(review.rstrip('\n').split(" ")) for review in phrases]
    for phrases in _class_phrase_lists
]
word_sentiment = [
    [sum(1 for review_words in class_sets if word in review_words)
     for class_sets in _class_word_sets]
    for word in unique_words
]
print("###The Training feature matrix###")
#################################The feature matrix#######################################
# feature_matrix[d][w] = tf(word w in phrase d) * delta-idf weight of w,
# where the weight mixes signed per-class log document-frequency ratios.
feature_matrix = [[0 for _ in range(len(unique_words))]
                  for _ in range(len(train_list))]
print(len(feature_matrix))
print(len(feature_matrix[0]))

# (column in word_sentiment, class size, signed coefficient) -- same
# coefficients and same column order as the original five if-blocks.
_CLASS_WEIGHTS = ((4, ex_pos, .35), (3, pos, .15), (2, neu, 1),
                  (0, ex_neg, -.35), (1, neg, -.15))

for wordcount, unique_word in enumerate(unique_words):
    counts = word_sentiment[wordcount]
    # Guard each term: log(0) is undefined, so classes where the word never
    # appears contribute 0 (exactly as the original `if ... != 0` checks did).
    weight = sum(coef * math.log(counts[col] / total)
                 for col, total, coef in _CLASS_WEIGHTS
                 if counts[col] != 0)
    for phrasecount, phrase in enumerate(train_list):
        # FIX: the original computed docwordcount*ep + docwordcount*p + ...
        # which is just docwordcount * (sum of the class weights).
        docwordcount = phrase.split(" ").count(unique_word)
        feature_matrix[phrasecount][wordcount] = docwordcount * weight

print("###The test feature matrix###")

# Read the test phrases: each line of sentences_test.txt is "<phrase>\t<id>".
# FIX: same as the training reader -- split each line once, not twice.
test_list = []
test_phraseid = []
with open('sentences_test.txt') as f:
    content = f.readlines()
for sentence in content:
    fields = sentence.rstrip('\n').split("\t")
    test_list.append(fields[0])
    test_phraseid.append(fields[1])

# Build the test feature matrix with the delta-idf weights LEARNED FROM THE
# TRAINING DATA (word_sentiment / class sizes), applied to test term counts.
# TODO(review): this duplicates the training-matrix loop -- extract a shared
# helper once both call sites live in one module.
test_tfidf = [[0 for _ in range(len(unique_words))]
              for _ in range(len(test_list))]

for wordcount, unique_word in enumerate(unique_words):
    counts = word_sentiment[wordcount]
    # Same signed coefficients and zero-count guards as the training loop;
    # log(0) is undefined, so absent classes contribute 0.
    weight = sum(coef * math.log(counts[col] / total)
                 for col, total, coef in ((4, ex_pos, .35), (3, pos, .15),
                                          (2, neu, 1), (0, ex_neg, -.35),
                                          (1, neg, -.15))
                 if counts[col] != 0)
    for phrasecount, phrase in enumerate(test_list):
        # tfidf = docwordcount*ep + ... + docwordcount*n
        #       = docwordcount * (ep + p + nu + en + n)
        docwordcount = phrase.split(" ").count(unique_word)
        test_tfidf[phrasecount][wordcount] = docwordcount * weight

print("###The Linear SVC ###")

# Fit a linear SVM on the hand-built delta-tfidf features and predict the
# test set.  FIX: the original bound the estimator to a variable named
# `self` and invoked unbound methods (LinearSVC.fit(self, ...)); plain
# instance method calls are the idiomatic equivalent.
clf = LinearSVC()
clf.fit(feature_matrix, train_sentiment)
test_sentiment = clf.predict(test_tfidf)

# Kaggle-style submission file: one "PhraseId,Sentiment" row per prediction.
with open('output_deltatfidf.csv', 'w') as fil:
    fil.write("PhraseId,Sentiment\n")
    for phrase_id, sentiment in zip(test_phraseid, test_sentiment):
        fil.write(phrase_id + "," + sentiment + "\n")

0 个答案:

没有答案