keras fit_generator提供0精度

时间:2018-04-05 23:35:01

标签: python tensorflow keras keras-layer

我有一个大型数据集,其中 n_samples, n_features, n_classes = 346679, 10233, 86。我正在尝试在这个数据集上构建一个分类器。为此,我使用的是基于 Keras Sequential 模型构建的多层感知器。

DataGeneratorClass

class DataGeneratorKeras:
    """Streams (features, one-hot labels) batches for Keras ``fit_generator``.

    Each "id" addresses one pre-sharded ``.npz`` file on disk holding a sparse
    feature matrix; labels are loaded separately and one-hot encoded.
    """

    def __init__(self, num_rows, n_classes, n_samples, n_features, batch_size=1, shuffle=True):
        self.num_rows = num_rows
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.flag = False  # NOTE(review): never read anywhere in this class

    def __get_exploration_order(self, list_ids):
        """Return a permutation of positions into ``list_ids``.

        :param list_ids: sequence of shard ids
        :return: numpy array of indexes, shuffled when ``self.shuffle`` is set
        """
        indexes = np.arange(len(list_ids))
        if self.shuffle:
            np.random.shuffle(indexes)
        return indexes

    def __data_generation(self, list_ids_temp, n_classes):
        """Load the feature shards and labels for every id in the batch.

        BUG FIX: the original read only ``list_ids_temp[0]`` and silently
        dropped the remaining ids, so any ``batch_size > 1`` trained on just
        the first sample of every batch. All ids are now loaded and stacked.

        :param list_ids_temp: ids of the shards making up this batch
        :param n_classes: number of target classes for one-hot encoding
        :return: (sparse feature matrix, one-hot label matrix)
        """
        from scipy.sparse import vstack  # local import: only needed here

        labels_complete = load(...)  # Load labels (placeholder in the original post)

        feature_parts = []
        label_parts = []
        for index in list_ids_temp:
            feature_parts.append(load_npz("data_file_" + str(index) + ".npz"))
            # atleast_1d: per-id labels may be a scalar or an array —
            # TODO(review) confirm the layout of labels_complete.
            label_parts.append(np.atleast_1d(labels_complete[index]))
        del labels_complete

        # Avoid an extra copy in the common batch_size == 1 case.
        fv = feature_parts[0] if len(feature_parts) == 1 else vstack(feature_parts)
        y = self.sparsify(np.concatenate(label_parts), n_classes)
        return fv, y

    @staticmethod
    def sparsify(y, n_classes):
        """One-hot encode integer labels.

        :param y: integer class labels
        :param n_classes: total number of classes
        :return: one-hot encoded label matrix
        """
        return np_utils.to_categorical(y, n_classes)

    def generate(self, list_ids):
        """Yield (dense features, one-hot labels) batches forever.

        :param list_ids: shard ids this generator may draw from
        """
        # Infinite loop, as required by fit_generator.
        while 1:
            # Fresh exploration order every pass over the data.
            indexes = self.__get_exploration_order(list_ids)

            imax = int(len(indexes) / self.batch_size)
            for i in range(imax):
                list_ids_temp = [list_ids[k] for k in indexes[i * self.batch_size:(i + 1) * self.batch_size]]
                x, y = self.__data_generation(list_ids_temp, self.n_classes)
                # Keras Dense layers expect dense arrays, hence toarray().
                yield x.toarray(), y

脚本类

class Script:
    """Wires dataset metadata into train/validation data generators."""

    def __init__(self, num_rows, batch_size, test_size, n_classes, n_samples, n_features):
        self.batch_size = batch_size
        self.num_rows = num_rows
        self.test_size = test_size
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.n_features = n_features

    def main(self):
        """Split ids into train/validation ranges and build both generators.

        :return: (training generator, validation generator, id partition dict)
        """
        n_validation = int(self.test_size * self.num_rows)
        n_train = self.num_rows - n_validation

        # Shared generator configuration, passed to both instances.
        shared_params = dict(
            num_rows=self.num_rows,
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_classes=self.n_classes,
            batch_size=self.batch_size,
            shuffle=True,
        )

        partition = {
            'train': range(n_train),
            'validation': range(n_train, self.num_rows),
        }

        train_gen = DataGeneratorKeras(**shared_params).generate(partition['train'])
        val_gen = DataGeneratorKeras(**shared_params).generate(partition['validation'])
        return train_gen, val_gen, partition

if __name__ == "__main__":
    # Build the generators and id partition once at startup.
    script = Script(
        num_rows=347,
        batch_size=1,
        test_size=0.25,
        n_classes=86,
        n_samples=346679,
        n_features=10233,
    )
    training_generator, validation_generator, partition = script.main()

构建模型

def classifier_base_data(dropout, learning_rate):
    """Build and compile the MLP classifier.

    :param dropout: dropout rate applied after every hidden layer
    :param learning_rate: learning rate for the Adam optimizer
    :return: compiled Keras Sequential model
    """
    model = Sequential()

    # Input layer; n_features comes from the module-level ``script`` object.
    model.add(Dense(2 ** 13, input_shape=(script.n_features,), activation='relu', name="l_input"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))

    # Funnel of hidden layers 2^12 .. 2^7. The original repeated this stanza
    # six times with a redundant input_dim on each layer (Keras infers the
    # input size of every layer after the first, so input_dim was ignored).
    for idx, units in enumerate((2 ** 12, 2 ** 11, 2 ** 10, 2 ** 9, 2 ** 8, 2 ** 7), start=1):
        model.add(Dense(units, activation='relu', name="l_hidden_%d" % idx))
        model.add(BatchNormalization())
        model.add(Dropout(dropout))

    model.add(Dense(script.n_classes, activation='softmax', name="l_output"))

    optimizer = adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # BUG FIX: model.summary() prints the table itself and returns None, so
    # the original "print model.summary()" printed a spurious "None" line.
    model.summary()
    return model

当我使用 Keras 的 fit 函数运行模型时,val_acc 和 acc 都能达到约 25%。

# In-memory training path: densify the sparse features and fit directly.
history = model.fit(
    x_train.toarray(),
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_validation.toarray(), y_validation),
    verbose=1,
)

由于数据很大,我正在使用keras的DataGenerator,遵循一篇写得很好的教程keras-datagen-tutorial。当我使用fit_generator运行模型时,我得到 0% val_acc。

# BUG FIX: steps_per_epoch / validation_steps count BATCHES per epoch, not
# samples. The partition sizes must be divided by the batch size (identical
# when batch_size == 1, wrong for any larger batch).
model.fit_generator(
    generator=training_generator,
    steps_per_epoch=len(partition['train']) // script.batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=len(partition['validation']) // script.batch_size,
    verbose=1,
)

DataGenerator中是否有任何问题?

0 个答案:

没有答案