使用tf.learn

时间:2017-02-19 16:13:56

标签: tensorflow

我一直在尝试设置运行TensorFlow教程中提到的Boston Housing示例的分布式集群,但到目前为止,我有点迷失了。谷歌搜索或搜索教程没有帮助。

"""DNNRegressor with custom input_fn for Housing dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import json
import os

import pandas as pd
import tensorflow as tf

# Surface INFO-level TensorFlow log messages (e.g. the "INFO:tensorflow:..."
# lines printed while the estimator runs).
tf.logging.set_verbosity(tf.logging.INFO)

# Name of the target column (the value the regressor is trained to predict).
LABEL = "medv"

# Input columns fed to the model, in CSV order.
FEATURES = [
    "crim", "zn", "indus",
    "nox", "rm", "age",
    "dis", "tax", "ptratio",
]

# Full list of column names applied to each CSV: every feature, then the label.
COLUMNS = FEATURES + [LABEL]


def input_fn(data_set):
  """Build constant tensors for the features and label of `data_set`.

  Args:
    data_set: Object indexable by column name whose columns expose `.values`
      (main() passes pandas DataFrames read from the Boston CSV files).

  Returns:
    A `(features, label)` pair: a dict mapping each name in FEATURES to a
    `tf.constant` of that column's values, and a `tf.constant` of the LABEL
    column's values.
  """
  features = {}
  for name in FEATURES:
    features[name] = tf.constant(data_set[name].values)
  label = tf.constant(data_set[LABEL].values)
  return features, label


def main(unused_argv):
  """Train, evaluate, and predict with a DNNRegressor on the Boston CSVs.

  Reads the train/test/predict CSV files from the working directory, writes a
  TF_CONFIG environment variable describing the cluster, fits the regressor,
  then prints the evaluation loss and the first six predictions.

  Args:
    unused_argv: Ignored.
  """
  def read_housing_csv(path):
    # The first row of each file is skipped and columns are named from COLUMNS.
    return pd.read_csv(path, skipinitialspace=True, skiprows=1, names=COLUMNS)

  # Load datasets
  train_df = read_housing_csv("boston_train.csv")
  eval_df = read_housing_csv("boston_test.csv")

  # Set of 6 examples for which to predict median house values
  predict_df = read_housing_csv("boston_predict.csv")

  # One real-valued feature column per input feature.
  real_valued_columns = []
  for name in FEATURES:
    real_valued_columns.append(tf.contrib.layers.real_valued_column(name))

  # Describe the cluster (two parameter servers, two workers) and this
  # process's role through TF_CONFIG, set before RunConfig() is constructed.
  # NOTE(review): the task is hard-coded to worker 0, so every machine that
  # runs this script unchanged announces itself as the same task, and no
  # tf.train.Server is started here -- confirm this matches the deployment.
  ps_hosts = ['10.134.96.44:2222', '10.134.96.184:2222']
  worker_hosts = ['10.134.96.37:2222', '10.134.96.145:2222']
  tf_config = {
      'cluster': {'ps': ps_hosts, 'worker': worker_hosts},
      'task': {'type': 'worker', 'index': 0},
  }
  os.environ['TF_CONFIG'] = json.dumps(tf_config)

  # Build 2 layer fully connected DNN with 10, 10 units respectively.
  run_config = tf.contrib.learn.RunConfig()
  regressor = tf.contrib.learn.DNNRegressor(
      feature_columns=real_valued_columns,
      hidden_units=[10, 10],
      model_dir="/tmp/boston_model",
      config=run_config)

  # Fit
  regressor.fit(input_fn=lambda: input_fn(train_df), steps=5000)

  # Score accuracy
  metrics = regressor.evaluate(input_fn=lambda: input_fn(eval_df), steps=1)
  loss_score = metrics["loss"]
  print("Loss: {0:f}".format(loss_score))

  # .predict() yields results lazily; take the first six and print them.
  prediction_iter = regressor.predict(input_fn=lambda: input_fn(predict_df))
  predictions = list(itertools.islice(prediction_iter, 6))
  print("Predictions: {}".format(str(predictions)))

# Script entry point: when executed directly (not imported), hand control to
# tf.app.run() -- presumably this dispatches to main() above; see the
# tf.app.run documentation.
if __name__ == "__main__":
  tf.app.run()

我不确定这里的 TF_CONFIG 设置是否正确。我使用了由 4 台机器组成的集群——两个 PS 和两个 worker,但我既没有设置 'environment',集群中也没有 'master' 机器。我先启动了两个 PS,然后在运行两个 worker 时,程序卡在了 "INFO:tensorflow:Create CheckpointSaverHook."。我是不是哪里做错了?

感谢您的帮助。

1 个答案:

答案 0 :(得分:1)

我遇到了完全相同的问题。问题在于 grpc 服务器实际上从未启动过。我当初也做了同样的假设——以为 tf.learn 会启动 grpc 服务器——但事实并非如此。您可以在 python 脚本中自行启动服务器。然后,根据该进程运行的是 'ps' 还是 'worker' 任务,您可以调用 server.join(),或者运行模型的其余代码:

import json
import os
import sys

import tensorflow as tf

# Role of this process, taken from the command line:
#   python <script> <job> <task>
# where <job> is "ps" or "worker" and <task> is the index of this process
# within that job's host list.
job = sys.argv[1]
task = int(sys.argv[2])

cluster = {'worker': ['localhost:2223'],
           'ps': ['localhost:2222']}

# Publish the cluster layout and this process's role through TF_CONFIG.
os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster,
                                      'task': {'type': job, 'index': task}})

# Create the server explicitly; the estimator code does not start one itself.
server = tf.train.Server(cluster,
                         job_name=job,
                         task_index=task)

if job == "ps":
    # Parameter servers only serve requests: wait here on the server.
    server.join()
elif job == "worker":
    # Load input
    # estimator.fit()
    pass  # Placeholder so the branch is valid Python; put the model code here.

有关详情,请查看: how to run tensorflow distributed mnist example

并且

https://www.tensorflow.org/deploy/distributed#putting-it-all-together-example-trainer-program