评估数据集上的评估结果不佳

时间:2019-03-10 08:49:23

标签: python tensorflow

我训练dnn回归模型,并使用训练后的模型评估训练数据集。训练损失很小,但是评估损失却很高。我对此感到困惑。

Screenshot1

Screenshot2

我知道,即使模型过度拟合,训练数据的评估结果也不会很差。我怀疑代码可能是问题。下面是我的代码。

from Constant import *
import tensorflow as tf
import logging
import time

# Number of numeric input features per CSV row (the label is an extra column).
FEATURELEN = 22
# Metadata describing the text columns: feature names "0".."21" plus a trailing "label".
COLUMNS = [str(i) for i in range(FEATURELEN)] + ["label",]
# Per-column defaults for tf.decode_csv: every field is parsed as float, defaulting to 0.0.
FIELD_DEFAULTS = [[0.0]] * (FEATURELEN+1)


def csv_input_fn_evaluate(fname):
    """Build an evaluation input pipeline from a space-delimited CSV file.

    Unlike the training pipeline, this makes a single deterministic pass:
    no shuffling and no repetition, just parsing and batching.
    """
    parsed = tf.data.TextLineDataset(fname).map(_parse_line)
    return parsed.batch(1000)

def _parse_line(line):
    """Decode one space-delimited CSV line into (features dict, label tensor).

    The line is parsed with the module-level FIELD_DEFAULTS (all floats),
    zipped with COLUMNS into a dict, and the 'label' entry is split off.
    """
    fields = tf.decode_csv(line, FIELD_DEFAULTS, field_delim=' ')
    columns = dict(zip(COLUMNS, fields))
    # Everything except the 'label' column is a feature.
    label = columns.pop('label')
    return columns, label

def csv_input_fn(fname):
    """Build a training input pipeline from a space-delimited CSV file.

    Lines are parsed into (features, label), shuffled with a 10k buffer,
    repeated indefinitely, and batched at 1000 examples per step.
    """
    parsed = tf.data.TextLineDataset(fname).map(_parse_line)
    return parsed.shuffle(10000).repeat().batch(1000)

def absdifloss(labels, logits):
    """Element-wise absolute-difference (L1) loss between labels and logits."""
    diff = tf.math.subtract(labels, logits)
    return tf.math.abs(diff)

def train1():
    """Train a DNN regressor with an L1 loss head, then evaluate it on the
    same CSV file it was trained on, printing each evaluation metric.
    """
    # Piecewise-constant learning-rate schedule: 1e-3 until step 2M, then 1e-4.
    boundaries = [2000000, ]
    values = [0.001, 0.0001]

    # One numeric feature column per input field, keyed "0".."21".
    feature_cols = [
        tf.feature_column.numeric_column(key=str(idx))
        for idx in range(FEATURELEN)
    ]
    head = tf.contrib.estimator.regression_head(
        loss_reduction=tf.losses.Reduction.MEAN,
        loss_fn=absdifloss
    )
    # The optimizer is passed as a callable so the learning-rate schedule is
    # constructed lazily, once the global step exists in the graph.
    estimator = tf.estimator.DNNEstimator(
        head=head,
        activation_fn=tf.nn.relu,
        feature_columns=feature_cols,
        hidden_units=[500, 500, 500, 500, 500, 500],
        model_dir="/home/zoul15/pcshareddir/riverregressor/",
        optimizer=lambda: tf.train.AdamOptimizer(learning_rate=tf.train.piecewise_constant(
            tf.train.get_global_step(), boundaries, values))
    )
    logging.getLogger().setLevel(logging.INFO)

    estimator.train(input_fn=lambda: csv_input_fn(CACHEDIR + "1.csv"), steps=1000)

    started = time.time()
    print ("start evaluate")
    # Evaluate on the very same file used for training.
    eval_result = estimator.evaluate(input_fn=lambda: csv_input_fn_evaluate(CACHEDIR + "1.csv"))
    for metric, score in eval_result.items():
        print (metric, "\t", score)
    print ("finish evaluate")
    print (time.time() - started)

# Script entry point: run training followed by evaluation.
if __name__ == "__main__":
    train1()

我的问题是:即使模型过度拟合,在用于训练的那个数据集上的评估结果也应该很好。但是,实际在训练数据集上的评估损失却很高。为什么会这样?

0 个答案:

没有答案