Training accuracy is high, but test accuracy is not. Overfitting?

Asked: 2018-06-07 01:18:05

Tags: python-3.x image tensorflow classification conv-neural-network

The accuracy on the training set is high, but the accuracy on the test set is not. I have already tried dropout and L2 regularization; training accuracy reaches over 90%, yet test accuracy stays around 70%. I don't know where the problem is. Are the hyperparameters badly tuned, or is it something else?

1 Answer:

Answer 0 (score: 0):

# input.py
# ---------------------
import tensorflow as tf
import os
import numpy as np
import math


def get_files(file_dir, ratio):
    '''
    Args:
        file_dir: file directory
        ratio: fraction of the dataset to hold out as validation data
    Returns:
        list of train_images and train_labels, val_images and val_labels
    '''
    all_image_list = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            all_image_list.append(os.path.join(root, file))

    all_label_list = [0] * 640 + [1] * 640  # hard-coded: the first 640 files are class 0, the next 640 are class 1

    temp = np.array([all_image_list, all_label_list])
    temp = temp.transpose()
    np.random.shuffle(temp)
    np.random.shuffle(temp)

    all_image_list = temp[:, 0]
    all_label_list = temp[:, 1]

    n_sample = len(all_label_list)
    n_val = math.ceil(n_sample * ratio)
    n_train = n_sample - n_val

    tra_images = all_image_list[0:n_train]
    tra_labels = all_label_list[0:n_train]
    tra_labels = [int(float(i)) for i in tra_labels]

    val_images = all_image_list[n_train:]   # slice to the end; [n_train:-1] would drop the last sample
    val_labels = all_label_list[n_train:]
    val_labels = [int(float(i)) for i in val_labels]

    return tra_images, tra_labels, val_images, val_labels
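

# Sketch only (assumption: file_dir holds one sub-directory per class): deriving
# each label from its parent folder avoids the hard-coded "[0] * 640 + [1] * 640"
# list above, which silently mislabels images if os.walk visits files in a
# different order or the class sizes change.
def get_files_by_class_folder(file_dir):
    images, labels = [], []
    for idx, class_name in enumerate(sorted(os.listdir(file_dir))):
        class_dir = os.path.join(file_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        for fname in sorted(os.listdir(class_dir)):
            images.append(os.path.join(class_dir, fname))
            labels.append(idx)
    return images, labels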


def get_batch(image, label, image_W, image_H, batch_size):
    '''
    Args:
        image: list type
        label: list type
        image_W: image width
        image_H: image height
        batch_size: batch size
    Returns:
        image_batch: 4D tensor [batch_size, width, height, 3], dtype=tf.float32
        label_batch: 1D tensor [batch_size], dtype=tf.int32
    '''

    image = tf.cast(image, tf.string)
    label = tf.cast(label, tf.int32)

    input_queue = tf.train.slice_input_producer([image, label])

    label = input_queue[1]
    labels = tf.one_hot(label, 2, dtype=tf.uint8)

    image_contents = tf.read_file(input_queue[0])
    image = tf.image.decode_jpeg(image_contents, channels=3)

    ######################################
    # data augmentation goes here
    ######################################
    image = tf.image.resize_images(image, [image_W, image_H], method=1)
    # image = tf.image.resize_image_with_crop_or_pad(image, image_W, image_H)
    # image = tf.image.random_flip_left_right(image)  
    # image = tf.image.random_flip_up_down(image)
    # image = tf.subtract(image, VGG_MEAN)
    image = tf.image.per_image_standardization(image)

    image_batch, label_batch = tf.train.shuffle_batch([image, labels],
                                                      batch_size=batch_size,
                                                      capacity=88,
                                                      min_after_dequeue=64)

    # label_batch = tf.reshape(label_batch, [batch_size])
    image_batch = tf.cast(image_batch, tf.float32)

    return image_batch, label_batch
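

# Sketch only: the commented-out augmentation lines above are a common remedy
# for a train/test accuracy gap. Assuming a hypothetical is_training flag, they
# would normally be applied to the training pipeline only, e.g.:
def augment(image, is_training):
    if is_training:
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.random_brightness(image, max_delta=0.1)
    return image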

# tools.py
# ---------------------
import tensorflow as tf
import numpy as np


def conv(layer_name, x, out_channels, kernel_size=[3, 3], stride=[1, 1, 1, 1], is_trainable=True):
    '''Convolution op wrapper, use RELU activation after convolution
    Args:
        layer_name: e.g. conv1, pool1...
        x: input tensor, [batch_size, height, width, channels]
        out_channels: number of output channels (or convolutional kernels)
        kernel_size: the size of convolutional kernel, VGG paper used: [3,3]
        stride: A list of ints. 1-D of length 4. VGG paper used: [1, 1, 1, 1]
        is_trainable: set to False to freeze this layer when loading pretrained parameters.
        Depending on the situation, only part of the conv layers needs to be frozen;
        the parameters of frozen layers are not updated during training.
    Returns:
        4D tensor
    '''

    in_channels = x.get_shape()[-1]
    with tf.variable_scope(layer_name):
        w = tf.get_variable(name='weights',
                            trainable=is_trainable,
                            shape=[kernel_size[0], kernel_size[1], in_channels, out_channels],
                            initializer=tf.contrib.layers.xavier_initializer())  # default is uniform distribution initialization
        b = tf.get_variable(name='biases',
                            trainable=is_trainable,
                            shape=[out_channels],
                            initializer=tf.constant_initializer(0.0))
        x = tf.nn.conv2d(x, w, stride, padding='SAME', name='conv')
        x = tf.nn.bias_add(x, b, name='bias_add')
        x = tf.nn.relu(x, name='relu')
        return x


def pool(layer_name, x, kernel=[1, 2, 2, 1], stride=[1, 2, 2, 1], is_max_pool=True):
    '''Pooling op
    Args:
        x: input tensor
        kernel: pooling kernel, VGG paper used [1,2,2,1], i.e. a 2x2 kernel
        stride: stride size, VGG paper used [1,2,2,1]
        is_max_pool: boolean
                    if True: use max pooling
                    else: use avg pooling
    '''
    if is_max_pool:
        x = tf.nn.max_pool(x, kernel, strides=stride, padding='SAME', name=layer_name)
    else:
        x = tf.nn.avg_pool(x, kernel, strides=stride, padding='SAME', name=layer_name)
    return x


def batch_norm(x):
    '''
    Batch normalization (offset and scale are not included)
    '''
    epsilon = 1e-3
    batch_mean, batch_var = tf.nn.moments(x, [0])
    x = tf.nn.batch_normalization(x,
                                  mean=batch_mean,
                                  variance=batch_var,
                                  offset=None,
                                  scale=None,
                                  variance_epsilon=epsilon)
    return x
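

# Note: batch_norm above always normalizes with the statistics of the current
# batch, so its behaviour at test time depends on the test batch itself, which
# can widen the train/test gap. A sketch of the higher-level API that keeps
# moving averages (is_training would be a boolean placeholder, and the ops in
# tf.GraphKeys.UPDATE_OPS must run alongside the train op):
def batch_norm_with_moving_stats(x, is_training):
    return tf.layers.batch_normalization(x, training=is_training)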


def FC_layer(layer_name, x, out_nodes):
    '''
    Wrapper for fully connected layers with RELU activation as default
    Args:
        layer_name: e.g. 'FC1', 'FC2'
        x: input feature map
        out_nodes: number of neurons for current FC layer
    '''
    shape = x.get_shape()
    if len(shape) == 4:
        size = shape[1].value * shape[2].value * shape[3].value
    else:
        size = shape[-1].value

    with tf.variable_scope(layer_name):
        w = tf.get_variable('weights',
                            shape=[size, out_nodes],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('biases',
                            shape=[out_nodes],
                            initializer=tf.constant_initializer(0.0))
        flat_x = tf.reshape(x, [-1, size])  # flatten into 1D

        x = tf.nn.bias_add(tf.matmul(flat_x, w), b)
        # x = tf.nn.relu(x)
        return x


def loss(logits, labels):
    '''Compute loss
    Args:
        logits: logits tensor, [batch_size, n_classes]
        labels: one-hot labels
    '''
    with tf.name_scope('loss') as scope:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels, name='cross-entropy')
        # reg = tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(0.5), tf.trainable_variables())
        # loss = tf.reduce_mean(cross_entropy + reg, name='loss')
        loss = tf.reduce_mean(cross_entropy, name='loss')
        # tf.summary.scalar(scope+'/loss', loss)
    return loss
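

# Sketch only: the commented-out regularizer above uses a very large scale (0.5)
# and penalizes every trainable variable, biases included. A milder, more common
# variant penalizes only the weight matrices (the decay value is an assumption,
# not a tuned setting):
def loss_with_l2(logits, labels, weight_decay=5e-4):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
    weights = [v for v in tf.trainable_variables() if 'weights' in v.name]
    l2 = tf.add_n([tf.nn.l2_loss(w) for w in weights])
    return tf.reduce_mean(cross_entropy) + weight_decay * l2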


def accuracy(logits, labels):
    """Evaluate the quality of the logits at predicting the label.
    Args:
        logits: Logits tensor, float - [batch_size, NUM_CLASSES].
        labels: Labels tensor,
    """
    with tf.name_scope('accuracy') as scope:
        correct = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        correct = tf.cast(correct, tf.float32)
        accuracy = tf.reduce_mean(correct) * 100.0
        # tf.summary.scalar(scope+'/accuracy', accuracy)

    return accuracy


def optimize(loss, learning_rate):
    '''
    optimization; Momentum is used here (other optimizers are left commented out)
    '''
    with tf.name_scope('optimizer'):
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        # optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, momentum=0.9)
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
        # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss)
        return train_op
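

# If batch normalization with moving statistics is used (see the sketch after
# batch_norm above), the update ops have to run with every training step.
# A sketch of the usual pattern:
def optimize_with_bn_updates(loss, learning_rate):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        return tf.train.MomentumOptimizer(learning_rate, momentum=0.9).minimize(loss)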


def load_with_skip(data_path, session, skip_layer):
    data_dict = np.load(data_path, encoding='latin1').item()  # newer NumPy versions may also require allow_pickle=True
    keys = sorted(data_dict.keys())
    for key in keys:
        if key not in skip_layer:
            with tf.variable_scope(key, reuse=True):
                for subkey, data in zip(('weights', 'biases'), data_dict[key]):
                    session.run(tf.get_variable(subkey).assign(data))
# vgg.py
import tensorflow as tf  # vgg.py calls tf directly (relu, dropout), so this import must not be commented out
import tools


def VGG16(x, n_classes, is_trainable=True, keep_prob=1):

    x = tools.conv('conv1_1', x, 64, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=False)
    x = tools.conv('conv1_2', x, 64, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=False)
    x = tools.pool('pool1', x, kernel=[1,2,2,1], stride=[1,2,2,1], is_max_pool=True)

    x = tools.conv('conv2_1', x, 128, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv2_2', x, 128, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.pool('pool2', x, kernel=[1,2,2,1], stride=[1,2,2,1], is_max_pool=True)

    x = tools.conv('conv3_1', x, 256, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv3_2', x, 256, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv3_3', x, 256, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.pool('pool3', x, kernel=[1,2,2,1], stride=[1,2,2,1], is_max_pool=True)

    x = tools.conv('conv4_1', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv4_2', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv4_3', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.pool('pool4', x, kernel=[1,2,2,1], stride=[1,2,2,1], is_max_pool=True)

    x = tools.conv('conv5_1', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv5_2', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.conv('conv5_3', x, 512, kernel_size=[3,3], stride=[1,1,1,1], is_trainable=is_trainable)
    x = tools.pool('pool5', x, kernel=[1,2,2,1], stride=[1,2,2,1], is_max_pool=True)

    x = tools.FC_layer('fc6', x, out_nodes=4096)
    x = tools.batch_norm(x)
    x = tf.nn.relu(x)
    relu6 = tf.nn.dropout(x, keep_prob=keep_prob)

    x = tools.FC_layer('fc7', relu6, out_nodes=4096)
    x = tools.batch_norm(x)
    x = tf.nn.relu(x)
    relu7 = tf.nn.dropout(x, keep_prob=keep_prob)

    x = tools.FC_layer('fc8', relu7, out_nodes=n_classes)
    relu8 = tf.nn.relu(x)  # ReLU applied to the final logits; see the note after this function

    return relu8, x
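

# Sketch only: softmax cross-entropy normally receives the raw fc8 output. The
# final ReLU above clips negative scores to zero, so many examples can end up
# with all logits equal and an essentially arbitrary argmax. A hedged variant of
# the classifier head (illustrative scope names, not wired into VGG16 above):
def classifier_head(features, n_classes, keep_prob):
    fc6 = tf.nn.dropout(tf.nn.relu(tools.FC_layer('head_fc6', features, 4096)), keep_prob)
    fc7 = tf.nn.dropout(tf.nn.relu(tools.FC_layer('head_fc7', fc6, 4096)), keep_prob)
    return tools.FC_layer('head_fc8', fc7, n_classes)  # raw logits, no ReLU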

# train.py
import time
import tensorflow as tf
import input
import vgg
import tools
# import os


# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
num_class = 2
img_w = 224
img_h = 224
batch_size = 8
n_epoch = 100
learning_rate = 0.00001
is_Trainable = True
ratio = 0.3
num_images = 1280
n_step_epoch = int(num_images * (1 - ratio) / batch_size)
n_step = n_epoch * n_step_epoch


def run_training():

    pre_trained_weights = r'F:\DrProject\vgg16_pretrain\vgg16.npy'
    image_dir = r'G:\DR数据集\Messidor\png\\'
    logs_train_dir = r'D:\logs\log\log1\train\\'
    logs_val_dir = r'D:\logs\log\log1\test\\'
    logs_model_dir = r'D:\logs\log\log1\model\\'

    train, train_label, val, val_label = input.get_files(image_dir, ratio)
    train_batch, train_label_batch = input.get_batch(train, train_label, img_w, img_h, batch_size)
    val_batch, val_label_batch = input.get_batch(val, val_label, img_w, img_h, batch_size)

    x = tf.placeholder(tf.float32, shape=[batch_size, img_w, img_h, 3])
    y_ = tf.placeholder(tf.int32, shape=[batch_size, num_class])
    keep_prob = tf.placeholder(tf.float32)

    loss_mean = tf.placeholder(tf.float32)
    acc_mean = tf.placeholder(tf.float32)

    logits, score = vgg.VGG16(x, num_class, is_trainable=is_Trainable, keep_prob=keep_prob)  # pass the placeholder; otherwise keep_prob defaults to 1 and dropout is never applied
    loss = tools.loss(logits, y_)
    acc = tools.accuracy(logits, y_)
    train_op = tools.optimize(loss, learning_rate)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # loss_scalar_train = tf.summary.scalar('train/loss', loss)
    # accuracy_scalar_train = tf.summary.scalar('train/accuracy', acc)
    loss_mean_scalar_train = tf.summary.scalar('train/loss_mean', loss_mean)
    acc_mean_scalar_train = tf.summary.scalar('train/acc_mean', acc_mean)

    # accuracy_scalar_test = tf.summary.scalar('test/accuracy', acc)
    acc_mean_scalar_test = tf.summary.scalar('test/acc_mean', acc_mean)

    train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
    validation_writer = tf.summary.FileWriter(logs_val_dir, sess.graph)

    saver = tf.train.Saver()

    tools.load_with_skip(pre_trained_weights, sess, ['fc6', 'fc7', 'fc8'])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    step_test = 0

    try:
        for epoch in range(n_epoch):
            start_time = time.time()

            if coord.should_stop():
                break

            train_loss, train_acc, n_batch = 0, 0, 0

            for i in range(n_step_epoch):

                tra_images, tra_labels = sess.run([train_batch, train_label_batch])

                _, loss1, acc1 = sess.run([train_op, loss, acc], feed_dict={x: tra_images, y_: tra_labels,
                                                                            keep_prob: 0.5})

                step += 1
                train_loss += loss1
                train_acc += acc1
                n_batch += 1

                # sum_loss_train, sum_accuracy_train, mean_loss_train, mean_acc_train = sess.run(
                #     [loss_scalar_train, accuracy_scalar_train, loss_mean_scalar_train, acc_mean_scalar_train],
                #     feed_dict={x: tra_images, y_: tra_labels, loss_mean: train_loss / n_batch,
                #                acc_mean: train_acc / n_batch})
                mean_loss_train, mean_acc_train = sess.run([loss_mean_scalar_train, acc_mean_scalar_train],
                                                           feed_dict={x: tra_images, y_: tra_labels,
                                                                      loss_mean: train_loss / n_batch,
                                                                      acc_mean: train_acc / n_batch})

                # train_writer.add_summary(sum_loss_train, step)
                # train_writer.add_summary(sum_accuracy_train, step)

                train_writer.add_summary(mean_acc_train, step)
                train_writer.add_summary(mean_loss_train, step)

            print("\nEpoch %d : Step %d-%d of %d took %fs" % (epoch + 1, step, step + n_step_epoch, n_step,
                                                              time.time() - start_time))

            print("   train loss: %f" % (train_loss / n_batch))
            print("   train acc: %f %%" % (train_acc / n_batch))

            test_loss, test_acc, n_batch = 0, 0, 0

            for j in range(int(num_images * ratio / batch_size)):

                val_images, val_labels = sess.run([val_batch, val_label_batch])

                err, ac = sess.run([loss, acc], feed_dict={x: val_images, y_: val_labels, keep_prob: 1})

                step_test += 1
                test_loss += err
                test_acc += ac
                n_batch += 1

                # sum_accuracy_test, mean_acc_test = sess.run(
                #     [accuracy_scalar_test, acc_mean_scalar_test], feed_dict={x: val_images, y_: val_labels,
                # loss_mean: test_loss / n_batch, acc_mean: test_acc / n_batch})

                mean_acc_test = sess.run(acc_mean_scalar_test, feed_dict={x: val_images, y_: val_labels,
                                                                          acc_mean: test_acc / n_batch})

                # validation_writer.add_summary(sum_accuracy_test, step_test)
                validation_writer.add_summary(mean_acc_test, step_test)

            print('   ------------------')
            print("   test loss: %f" % (test_loss / n_batch))
            print("   test acc: %f %%" % (test_acc / n_batch))

            if (epoch + 1) % 5 == 0 or (epoch + 1) == n_epoch:
                print("Save model !")
                saver.save(sess, logs_model_dir+'model.ckpt', global_step=epoch + 1)

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()

    train_writer.close()
    validation_writer.close()
    coord.join(threads)
    sess.close()


if __name__ == '__main__':
    run_training()