How to improve the accuracy of LSTM training

Asked: 2019-06-13 07:39:10

Tags: python deep-learning nlp lstm

I trained an LSTM for Quora question-pair duplicate detection, but the training accuracy is very low and it keeps fluctuating during training. I don't understand what mistake I've made.

I tried changing the loss function and the optimizer, and I increased the number of epochs.

import numpy as np
from numpy import array
from keras.callbacks import ModelCheckpoint
import keras
from keras.optimizers import SGD
import tensorflow as tf
from sklearn import preprocessing
import xgboost as xgb
from keras import backend as K
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.preprocessing.text import Tokenizer , text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import Sequential, model_from_json, load_model
from keras.layers import LSTM, Dense, Input, concatenate, Concatenate, Activation, Flatten
from keras.models import Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import pickle

df = pd.read_csv("questions.csv")
df.drop(['id','qid1', 'qid2'], axis=1, inplace=True)

df2 = pd.read_csv("testmenew.csv") 
## filter the dataset
SPECIAL_TOKENS = {
    'quoted': 'quoted_item',
    'non-ascii': 'non_ascii_word',
    'undefined': 'something'
}

def clean(text, stem_words=True):
    import re
    from string import punctuation
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords

    def pad_str(s):
        return ' '+s+' '

    if pd.isnull(text):
        return ''

    if type(text) != str or text=='':
        return ''

    text = re.sub("\'s", " ", text) 
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text,    flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ",  text, flags=re.IGNORECASE)
     text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)

    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)     
    text = re.sub('[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text)  
    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
    text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
    text = re.sub(r" india ", " India ", text)
    text = re.sub(r" switzerland ", " Switzerland ", text)
    text = re.sub(r" china ", " China ", text)
    text = re.sub(r" chinese ", " Chinese ", text) 
    text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
    text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
    text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
    text = re.sub(r" dms ", " direct messages ", text,   flags=re.IGNORECASE)  
    text = re.sub(r" demonitization ", " demonetization ", text, flags=re.IGNORECASE) 
    text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
    text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
    text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE) 
     text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
    text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
    text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
     text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
     text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
    text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
    text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
    text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
    text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
    text = re.sub(r" III ", " 3 ", text)
    text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
    text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)
    text = ''.join([c for c in text if c not in punctuation]).lower()
    return text

    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

 df['question1'] = df['question1'].apply(clean)
 df['question2'] = df['question2'].apply(clean)

df2['q1'] = df2['q1'].apply(clean)
df2['q2'] = df2['q2'].apply(clean)

main = df['is_duplicate'].values

main.shape
(404351,)


vocabularySize = 20000
lstm_out = 200
embed_dim = 128

Rawdata = df['question1'].apply(word_tokenize)
Rawdata2 = df['question2'].apply(word_tokenize)

testme = df2['q1'].apply(word_tokenize)
testme2 = df2['q2'].apply(word_tokenize)

tokenizer2 = Tokenizer(num_words=vocabularySize)

tokenizer2.fit_on_texts(testme)
tokenizer2.fit_on_texts(testme2)

tokenizer = Tokenizer(num_words=vocabularySize)

tokenizer.fit_on_texts(Rawdata)
tokenizer.fit_on_texts(Rawdata2)

sequences = tokenizer.texts_to_sequences(Rawdata)
sequences2 = tokenizer.texts_to_sequences(Rawdata2)

sequences3 = tokenizer2.texts_to_sequences(testme)
sequences4 = tokenizer2.texts_to_sequences(testme2)

data = pad_sequences(sequences, maxlen=2)
data2 = pad_sequences(sequences2, maxlen=2)

data3 = pad_sequences(sequences3, maxlen=2)
data4 = pad_sequences(sequences4, maxlen=2)

TestInput = np.array([data3, data4])
TestInput = TestInput.reshape(1, 2, 2)
Input = np.array([data, data2])
Input = Input.reshape(404351, 2, 2)

#opt = SGD(lr = 0.001, momentum = 0.60)

model = Sequential()
#model.add(Embedding(1, 4,input_length = 2 , dropout = 0.4))
model.add(LSTM(1, input_shape=(2, 2), return_sequences=False))
model.add(Activation ('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
X_train, X_test, y_train, y_test = train_test_split(Input, main, test_size=0.2, random_state=4)

Input.shape
(404351, 2, 2)

history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
model.save_weights('newoutput2.h5') 

Train on 323480 samples, validate on 80871 samples
Epoch 1/10
323480/323480 [==============================] - 27s 83us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 2/10
323480/323480 [==============================] - 24s 73us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 3/10
323480/323480 [==============================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 4/10
323480/323480 [==============================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 5/10
323480/323480 [==============================] - 23s 72us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 6/10
323480/323480 [==============================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 7/10
323480/323480 [==============================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 8/10
323480/323480 [==============================] - 25s 76us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 9/10
323480/323480 [==============================] - 25s 78us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323
Epoch 10/10
323480/323480 [==============================] - 25s 78us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323

filename = 'newoutput2.h5'
model.load_weights(filename)
new = model.predict(TestInput)
if new > 0.6:
    print("Duplication detected")
else:
    print("No duplicate")
new 

The prediction comes out around 0.6567, but the accuracy is not increasing at all. Please help!

I need to improve the training accuracy.

2 Answers:

Answer 0 (score: 0)

There are two ways to improve accuracy:

1) Increase the number of hidden units in the LSTM layer, and/or 2) add another LSTM layer. A single hidden layer may not be sufficient to train on your data.

After making the changes described above, you will likely see the accuracy stabilize within a certain range. Based on that, you can tune the other parameters.
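
As a minimal sketch of that advice, keeping the question's (2, 2) input shape (the unit counts 64 and 32 are illustrative assumptions, not tuned values):

from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential()
# Wider first LSTM layer; return_sequences=True so the stacked second
# LSTM receives one output per timestep instead of only the final state.
model.add(LSTM(64, input_shape=(2, 2), return_sequences=True))
model.add(LSTM(32))
# A single sigmoid unit for the binary duplicate / not-duplicate decision.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])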

Another note: you need to enable an Embedding layer in order to convert your words into vectors.
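
A sketch of what that could look like, reusing vocabularySize and embed_dim from the question's code. The maxlen of 25 is an assumption; note that the model would then consume the padded integer sequences directly, with shape (samples, maxlen), rather than the reshaped 3-D arrays above:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

vocabularySize = 20000  # from the question's code
embed_dim = 128         # from the question's code
maxlen = 25             # assumed padded sequence length, not taken from the question

model = Sequential()
# Maps each integer word index to a dense embed_dim-sized vector,
# so the LSTM sees learned word vectors instead of raw token ids.
model.add(Embedding(input_dim=vocabularySize, output_dim=embed_dim, input_length=maxlen))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])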

Answer 1 (score: 0)

There are four ways to improve deep learning performance:

  • Improve performance with data.
  • Improve performance with algorithms.
  • Improve performance with algorithm tuning.
  • Improve performance with ensembles.

Improve performance with data:

  1. Get more data.
  2. Invent more data.
  3. Rescale your data.
  4. Transform your data (see the sketch after this list).
  5. Feature selection.
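
In this question's case, the most relevant transformation may be the padded sequence length: the code above pads every question to maxlen=2, which discards almost the entire sentence. A hedged sketch of a more reasonable preparation, reusing sequences and sequences2 from the question (25 is an assumed length, not a tuned value):

from keras.preprocessing.sequence import pad_sequences

# Keep (most of) each question instead of truncating it to 2 tokens.
maxlen = 25  # assumption; pick a value near the dataset's typical question length
data = pad_sequences(sequences, maxlen=maxlen)
data2 = pad_sequences(sequences2, maxlen=maxlen)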

Improve performance with algorithms

  1. Spot-check algorithms: perhaps the algorithm you chose is not the best fit for your problem.
  2. Resampling methods: you must know how good your model really is. Is your estimate of its performance reliable? (See the sketch after this list.)
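
As a sketch of the resampling point, k-fold cross-validation gives a more trustworthy estimate than a single train/test split. This reuses Input and main from the question and assumes a hypothetical build_model() helper that returns a freshly compiled Keras model:

import numpy as np
from sklearn.model_selection import KFold

scores = []
for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=4).split(Input):
    model = build_model()  # hypothetical helper: returns a new, compiled model
    model.fit(Input[train_idx], main[train_idx], epochs=10, verbose=0)
    _, acc = model.evaluate(Input[val_idx], main[val_idx], verbose=0)
    scores.append(acc)

# The mean and spread across folds show whether one split's accuracy can be trusted.
print(np.mean(scores), np.std(scores))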

Improve performance with algorithm tuning

Some ideas on tuning your neural network algorithm in order to get more out of it:

  1. Diagnostics.
  2. Weight initialization.
  3. Learning rate.
  4. Activation functions.
  5. Network topology.
  6. Batches and epochs.
  7. Regularization.
  8. Optimization and loss.
  9. Early stopping (see the sketch after this list).
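
A minimal sketch combining two of these knobs, an explicit learning rate (point 3) and early stopping (point 9), reusing model, X_train, y_train, X_test, and y_test from the question; the learning rate value is an assumed starting point to sweep, not a recommendation:

from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Try an explicit learning rate instead of the optimizer's default.
opt = Adam(lr=0.001)  # assumption; sweep roughly 1e-2 down to 1e-4
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Stop once validation loss has not improved for 3 epochs,
# rolling back to the best weights seen so far.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train, y_train, epochs=50,
          validation_data=(X_test, y_test), callbacks=[early_stop])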

Improve performance with ensembles

Three general areas of ensembles you may want to consider:

  1. Combine models (see the sketch after this list).
  2. Combine views.
  3. Stacking.
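
As a sketch of the simplest way to combine models: train several models independently (for example with different random seeds or topologies) and average their predicted probabilities. models here is a hypothetical list of already-trained Keras models, and TestInput comes from the question:

import numpy as np

def ensemble_predict(models, x):
    # Average the sigmoid outputs of independently trained models;
    # the mean probability is usually more stable than any single model's.
    preds = np.stack([m.predict(x) for m in models])
    return preds.mean(axis=0)

avg = ensemble_predict(models, TestInput)[0, 0]  # assumes predict returns shape (1, 1)
print("Duplication detected" if avg > 0.6 else "No duplicate")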

Check the following link for more information: https://machinelearningmastery.com/improve-deep-learning-performance/