2017-02-19 125 views
2

我一直在嘗試設置一個分佈式集羣,運行TensorFlow教程中提到的Boston Housing示例,但到目前爲止我有點迷路。谷歌搜索或在教程中搜索沒有幫助。使用tf.learn運行分佈式培訓

"""DNNRegressor with custom input_fn for Housing dataset.""" 

from __future__ import absolute_import 
from __future__ import division 
from __future__ import print_function 

import itertools 
import json 
import os 

import pandas as pd 
import tensorflow as tf 

tf.logging.set_verbosity(tf.logging.INFO) 

COLUMNS = ["crim", "zn", "indus", "nox", "rm", "age", 
     "dis", "tax", "ptratio", "medv"] 
FEATURES = ["crim", "zn", "indus", "nox", "rm", 
     "age", "dis", "tax", "ptratio"] 
LABEL = "medv" 


def input_fn(data_set): 
    feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES} 
    labels = tf.constant(data_set[LABEL].values) 
    return feature_cols, labels 


def main(unused_argv): 
    # Load datasets 
    training_set = pd.read_csv("boston_train.csv", skipinitialspace=True, 
         skiprows=1, names=COLUMNS) 
    test_set = pd.read_csv("boston_test.csv", skipinitialspace=True, 
        skiprows=1, names=COLUMNS) 

    # Set of 6 examples for which to predict median house values 
    prediction_set = pd.read_csv("boston_predict.csv", skipinitialspace=True, 
          skiprows=1, names=COLUMNS) 

    # Feature cols 
    feature_cols = [tf.contrib.layers.real_valued_column(k) 
       for k in FEATURES] 

    cluster = {'ps': ['10.134.96.44:2222', '10.134.96.184:2222'], 
      'worker': ['10.134.96.37:2222', '10.134.96.145:2222']} 
    os.environ['TF_CONFIG'] = json.dumps(
     {'cluster': cluster, 
     'task': {'type': 'worker', 'index': 0}}) 

    # Build 2 layer fully connected DNN with 10, 10 units respectively. 
    regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols, 
              hidden_units=[10, 10], 
              model_dir="/tmp/boston_model", 
              config=tf.contrib.learn.RunConfig()) 

    # Fit 
    regressor.fit(input_fn=lambda: input_fn(training_set), steps=5000) 

    # Score accuracy 
    ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1) 
    loss_score = ev["loss"] 
    print("Loss: {0:f}".format(loss_score)) 

    # Print out predictions 
    y = regressor.predict(input_fn=lambda: input_fn(prediction_set)) 
    # .predict() returns an iterator; convert to a list and print predictions 
    predictions = list(itertools.islice(y, 6)) 
    print("Predictions: {}".format(str(predictions))) 

if __name__ == "__main__": 
    tf.app.run() 

我不知道我是否已經設置了TF_CONFIG正確這裏。我使用了4臺機器的集羣 - 兩個PS和兩個工人,但我沒有在集羣和「主」機器中設置「環境」。我首先開始運行兩個PS,然後當我運行兩個工人時,它在「INFO:tensorflow:Create CheckpointSaverHook」後卡住了。我在這裏做錯了什麼嗎?

我感謝您的幫助。

回答

1

我有完全相同的問題。問題在於grpc服務器從未真正開始。我做了和你一樣的假設 - tf.learn啓動grpc服務器 - 但事實並非如此。你可以從你的python腳本中啓動一個服務器。然後,取決於進程是否運行「PS」或「工人」的任務,你要麼打電話server.join()或運行的模型的代碼的其餘部分:

job = sys.argv[1] 
task = int(sys.argv[2]) 

cluster = {'worker': ['localhost:2223'], 
      'ps': ['localhost:2222']} 

os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster, 
             'task': {'type': job, 'index': task}}) 

# Create the server 
server = tf.train.Server(cluster, 
         job_name=job, 
         task_index=task) 

if job == "ps": 
    server.join() 
elif job == "worker": 
    # Load input 
    # estimator.fit() 

欲瞭解更多信息,結賬: how to run tensorflow distributed mnist example

And

https://www.tensorflow.org/deploy/distributed#putting-it-all-together-example-trainer-program