RNN text classification, prediction and serving


I am trying to build a model that predicts the next word (in my case, the next URL). Following my example below, I am stuck at the prediction part. My Python code:

import argparse
import sys
import os
import re

import numpy as np
import pandas
import tensorflow as tf
import url_datasets
from tensorflow.contrib.learn.python.learn.preprocessing import text
from tensorflow.python.framework import dtypes

tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the model.')
tf.app.flags.DEFINE_string('work_dir', '/tmp/suc', 'Working directory.')
FLAGS = tf.app.flags.FLAGS

MAX_DOCUMENT_LENGTH = 40
EMBEDDING_SIZE = 40
n_words = 0
MAX_LABEL = 50
WORDS_FEATURE = 'words'  # Name of the input words feature.
TOKENIZER_RE = re.compile(r'([/a-z_-]*)\s')

def tokenizer(iterator):
  """Tokenizer generator.

  Args:
    iterator: Input iterator with strings.

  Yields:
    array of tokens per each value in the input.
  """
  for value in iterator:
    print(value)
    print(TOKENIZER_RE.findall(value))
    yield TOKENIZER_RE.findall(value)

def estimator_spec_for_softmax_classification(
    logits, labels, mode):
  """Returns EstimatorSpec instance for softmax classification."""
  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        })

  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
  loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

  eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

def updatePrediction(prediction):
  file = open("/tmp/ai/prediction.txt","a")
  file.write(str(prediction))
  file.close()

def rnn_model(features, labels, mode):
  word_vectors = tf.contrib.layers.embed_sequence(
      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

  word_list = tf.unstack(word_vectors, axis=1)

  cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

  _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)

  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
  return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)

def main(_):
  sess = tf.InteractiveSession()
  serialized_tf_example = tf.placeholder(tf.string, name='tf_example')

  global n_words
  urls = url_datasets.load_urls('/tmp/ai/demo')
  x_train = pandas.Series(urls.train.data[:,1])
  labels = pandas.Series(urls.train.data[:,0])
  y_train = pandas.Series(urls.train.target)
  x_test = pandas.Series(urls.test.data[:,1])
  y_test = pandas.Series(urls.test.target)
  vocab_processor = text.VocabularyProcessor(MAX_DOCUMENT_LENGTH,
               min_frequency=0,
               tokenizer_fn=tokenizer)
  vocab_processor.fit(labels)
  x_transform_train = vocab_processor.fit_transform(x_train)
  x_transform_test = vocab_processor.transform(x_test)
  x_train = np.array(list(x_transform_train))
  x_test = np.array(list(x_transform_test))

  print(vocab_processor.vocabulary_._mapping)

  n_words = len(vocab_processor.vocabulary_)
  vocab_dict = vocab_processor.vocabulary_._mapping
  model_fn = rnn_model

  classifier = tf.estimator.Estimator(model_fn=model_fn)

  train_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={WORDS_FEATURE: x_train},
      y=y_train,
      batch_size=len(x_train),
      num_epochs=None,
      shuffle=False)
  classifier.train(input_fn=train_input_fn, steps=100)

  test_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={WORDS_FEATURE: x_test},
      y=y_test,
      num_epochs=1,
      shuffle=False)
  predictions = classifier.predict(input_fn=test_input_fn)

  export_path_base = sys.argv[-1]
  export_path = os.path.join(
      tf.compat.as_bytes(export_path_base),
      tf.compat.as_bytes(str(FLAGS.model_version)))
  print('Exporting trained model to', export_path)
  builder = tf.saved_model.builder.SavedModelBuilder(export_path)

  y_predicted = np.array(list(p['class'] for p in predictions))
  inverseDictionary = dict(zip(vocab_dict.values(), vocab_dict.keys()))
  for prediction in y_predicted:
    print("prediction:"+inverseDictionary[int(prediction)])
    updatePrediction(prediction)
    print("--> %s" % prediction)

  tensor_info_x = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
  tensor_info_y = tf.saved_model.utils.build_tensor_info(
      tf.convert_to_tensor(y_predicted, tf.float32))

  classification_inputs = tf.saved_model.utils.build_tensor_info(
      serialized_tf_example)

  classification_signature = (
      tf.saved_model.signature_def_utils.build_signature_def(
          inputs={
              tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                  classification_inputs
          },
          outputs={
              tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                  tensor_info_y
          },
          method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

  prediction_signature = (
      tf.saved_model.signature_def_utils.build_signature_def(
          inputs={'x_strings': tensor_info_x},
          outputs={'scores': tensor_info_y},
          method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

  legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
  builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING],
      signature_def_map={
          'predict_url':
              prediction_signature,
          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
              classification_signature,
      },
      legacy_init_op=legacy_init_op)
  builder.save()

  print('Done exporting!')


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--test_with_fake_data',
      default=False,
      help='Test the example code with fake data.',
      action='store_true')
  parser.add_argument(
      '--bow_model',
      default=False,
      help='Run with BOW model instead of RNN.',
      action='store_true')
  tf.app.run()

My input data, /tmp/ai/demo/train.csv:

/tmp/ai/demo/test.csv:

0,test,/url/b /url/c
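
A minimal sketch of a loader compatible with the url_datasets.load_urls usage in the code above; the Dataset/Datasets containers and the exact file names are assumptions inferred from urls.train.data[:,1], urls.train.target and the CSV row shown:

import collections
import csv
import os

import numpy as np

# Hypothetical containers mirroring what url_datasets.load_urls appears to return.
Dataset = collections.namedtuple('Dataset', ['data', 'target'])
Datasets = collections.namedtuple('Datasets', ['train', 'test'])

def load_urls(directory):
  """Reads train.csv/test.csv rows of the form: target,label,url-sequence."""
  def _read(path):
    targets, rows = [], []
    with open(path) as f:
      for target, label, urls in csv.reader(f):
        targets.append(int(target))
        rows.append([label, urls])  # data[:,0] = label, data[:,1] = url string
    return Dataset(data=np.array(rows), target=np.array(targets))
  return Datasets(train=_read(os.path.join(directory, 'train.csv')),
                  test=_read(os.path.join(directory, 'test.csv')))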

When I build the model and save it, everything works fine and the predictions are correct. But now I want to replace x_test with a placeholder:

x_test = pandas.Series(urls.test.data[:,1])

My client code:

import sys
import threading

from grpc.beta import implementations
import tensorflow as tf

from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from tensorflow.core.framework import types_pb2

tf.app.flags.DEFINE_integer('concurrency', 1,
                            'maximum number of concurrent inference requests')
tf.app.flags.DEFINE_string('server', '', 'PredictionService host:port')
tf.app.flags.DEFINE_string('work_dir', '/tmp', 'Working directory. ')
FLAGS = tf.app.flags.FLAGS


def do_prediction(hostport, work_dir, concurrency):
  host, port = hostport.split(':')
  channel = implementations.insecure_channel(host, int(port))
  stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
  request = predict_pb2.PredictRequest()
  request.model_spec.name = 'predict_url'
  request.model_spec.signature_name = 'predict_url'
  request.inputs['x_strings'].dtype = types_pb2.DT_STRING
  request.inputs['x_strings'].string_val.append('/url/a /url/b ')
  result = stub.Predict(request, 5.0)  # 5 seconds
  return result


def main(_):
  if not FLAGS.server:
    print('please specify server host:port')
    return
  prediction = do_prediction(FLAGS.server, FLAGS.work_dir, FLAGS.concurrency)
  print('\nPrediction from url_classify_client: %s%%' % prediction)


if __name__ == '__main__':
  tf.app.run()

Whenever I replace x_test with the placeholder, the response to the serving request is always the same:

Prediction from url_classify_client: outputs {
  key: "scores"
  value {
    dtype: DT_FLOAT
    tensor_shape {
      dim {
        size: 1
      }
    }
    float_val: 4.0
  }
}

Update: here is the updated file that exports the model:

import argparse
import sys
import os

import re
import numpy as np
import pandas
import tensorflow as tf
import url_datasets
from tensorflow.contrib.learn.python.learn.preprocessing import text
from tensorflow.python.framework import dtypes

tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the model.')
tf.app.flags.DEFINE_string('work_dir', '/tmp/suc', 'Working directory.')
FLAGS = tf.app.flags.FLAGS

MAX_DOCUMENT_LENGTH = 40
EMBEDDING_SIZE = 40
n_words = 0
MAX_LABEL = 50
WORDS_FEATURE = 'words'  # Name of the input words feature.
TOKENIZER_RE = re.compile(r'([/a-z_-]*)\s')
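# Example: TOKENIZER_RE.findall('/url/a /url/b ') returns ['/url/a', '/url/b'];
# a token only matches when followed by whitespace, which is why the client
# request string '/url/a /url/b ' ends with a trailing space.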

def tokenizer(iterator):
  """Tokenizer generator.

  Args:
    iterator: Input iterator with strings.

  Yields:
    array of tokens per each value in the input.
  """
  for value in iterator:
    yield TOKENIZER_RE.findall(value)

def estimator_spec_for_softmax_classification(
    logits, labels, mode):
  """Returns EstimatorSpec instance for softmax classification."""
  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        })
  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
  loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

  eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

def updatePrediction(prediction):
  with open("/tmp/ai/prediction.txt", "a") as f:
    f.write(str(prediction))

def customTestFn(input, vocab_processor):
  feature_configs = {'x_strings': tf.FixedLenFeature(shape=[1], dtype=tf.string)}
  tf_example = tf.parse_example(input, feature_configs)
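  # NOTE: parse_example returns a dict of graph tensors keyed by feature name,
  # while VocabularyProcessor.fit_transform expects an iterable of plain
  # Python strings.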
  x_transform_test = vocab_processor.fit_transform(tf_example)
  return np.array(list(x_transform_test))

def rnn_model(features, labels, mode):
  word_vectors = tf.contrib.layers.embed_sequence(
      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

  word_list = tf.unstack(word_vectors, axis=1)

  cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

  _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)
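  # 'encoding' is the final GRU state; the dense layer below maps it to
  # MAX_LABEL class logits.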

  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
  return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)


def main(_):
  urls = url_datasets.load_urls('/tmp/ai/demo')
  sess = tf.InteractiveSession()
  serialized_tf_example = tf.placeholder(tf.string, name='x_strings')

  global n_words
  x_train = pandas.Series(urls.train.data[:,1])
  labels = pandas.Series(urls.train.data[:,0])
  y_train = pandas.Series(urls.train.target)
  y_test = pandas.Series(urls.test.target)
  vocab_processor = text.VocabularyProcessor(MAX_DOCUMENT_LENGTH,
               min_frequency=0,
               tokenizer_fn=tokenizer)
  vocab_processor.fit(labels)
  x_transform_train = vocab_processor.fit_transform(x_train)
  x_train = np.array(list(x_transform_train))

  print(vocab_processor.vocabulary_._mapping)

  n_words = len(vocab_processor.vocabulary_)
  vocab_dict = vocab_processor.vocabulary_._mapping
  model_fn = rnn_model

  classifier = tf.estimator.Estimator(model_fn=model_fn)

  train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={WORDS_FEATURE: x_train},
        y=y_train,
        batch_size=len(x_train),
        num_epochs=None,
        shuffle=False)
  classifier.train(input_fn=train_input_fn, steps=100)

  export_path_base = sys.argv[-1]
  export_path = os.path.join(
      tf.compat.as_bytes(export_path_base),
      tf.compat.as_bytes(str(FLAGS.model_version)))
  print('Exporting trained model to', export_path)
  builder = tf.saved_model.builder.SavedModelBuilder(export_path)

  sess.run(tf.global_variables_initializer())

  test_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={WORDS_FEATURE: customTestFn(serialized_tf_example, vocab_processor)},
      y=y_test,
      num_epochs=1,
      shuffle=False)
  predictions = classifier.predict(input_fn=test_input_fn)

  y_predicted = np.array(list(p['class'] for p in predictions))
  inverseDictionary = dict(zip(vocab_dict.values(), vocab_dict.keys()))
  for prediction in y_predicted:
       print("prediction:"+inverseDictionary[int(prediction)])
       updatePrediction(prediction)
       print("--> %s" % prediction)
  tensor_info_x = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
  tensor_info_y = tf.saved_model.utils.build_tensor_info(tf.convert_to_tensor(y_predicted, tf.float32))
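  # NOTE: tf.convert_to_tensor(y_predicted, ...) creates a constant from the
  # numpy array computed above, so this output does not depend on the
  # serialized_tf_example placeholder.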

  classification_inputs = tf.saved_model.utils.build_tensor_info(
      serialized_tf_example)

  classification_signature = (
      tf.saved_model.signature_def_utils.build_signature_def(
          inputs={
              tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                 classification_inputs
          },
          outputs={
              tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                 tensor_info_y
          },
          method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

  prediction_signature = (
      tf.saved_model.signature_def_utils.build_signature_def(
          inputs={'x_strings': tensor_info_x},
          outputs={'scores': tensor_info_y},
          method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

  legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
  builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING],
      signature_def_map={
          'predict_url':
              prediction_signature,
          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
              classification_signature,
      },
      legacy_init_op=legacy_init_op)
  builder.save()

  print('Done exporting!')


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--test_with_fake_data',
      default=False,
      help='Test the example code with fake data.',
      action='store_true')
  parser.add_argument(
      '--bow_model',
      default=False,
      help='Run with BOW model instead of RNN.',
      action='store_true')
  tf.app.run()

Whenever I run the gRPC client, I get the same response:

Prediction from url_classify_client: outputs {
  key: "scores"
  value {
    dtype: DT_FLOAT
    tensor_shape {
      dim {
        size: 1
      }
    }
    float_val: 7.0
  }
}

This is exactly the same prediction that was made when the model was exported, so I think y_predicted is not being evaluated during serving...
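
For comparison, here is a minimal sketch of the usual TF 1.x export path, where the Estimator's own PREDICT graph is evaluated per request instead of serving a constant built from y_predicted; it reuses MAX_DOCUMENT_LENGTH, WORDS_FEATURE and classifier from the script above, and it assumes the client sends already-vocabulary-mapped integer word ids:

import tensorflow as tf

def serving_input_receiver_fn():
  # Batch of word-id sequences fed per request, shape [batch, doc length].
  words = tf.placeholder(tf.int64, shape=[None, MAX_DOCUMENT_LENGTH],
                         name='words')
  return tf.estimator.export.ServingInputReceiver(
      features={WORDS_FEATURE: words},
      receiver_tensors={'words': words})

# Exports the PREDICT branch of rnn_model, so 'class' and 'prob' are
# recomputed for every request.
classifier.export_savedmodel('/tmp/saved_rnn', serving_input_receiver_fn)

With this layout the vocabulary lookup has to happen on the client (or be moved into the graph, e.g. with a tf.contrib.lookup table), which is a different trade-off from the parse_example approach above.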

I have no idea how to debug this during serving (I run it with Bazel). After setting:

export TF_CPP_MIN_VLOG_LEVEL=0
export GRPC_VERBOSITY=DEBUG
export GRPC_TRACE=all

I get the following message in the logs (when I execute the gRPC request):

'PRI * HTTP/2.0....SM......$..................................@................@.:scheme.http@.:method.POST..:path-/tensorflow.serving.PredictionService/Predict@.:authority.localhost:9000@.te.trailers@.content-type.application/grpc@.user-agent8grpc-python/1.4.0 grpc-c/4.0.0 (osx; chttp2; gregarious)@.grpc-accept-encoding.identity,deflate,gzip..grpc-timeout.5S...............B..........=....predict_url..predict_url....x_strings....B./url/a /url/b ..........................'

For reference, my export command:

python tensorflow/tensorflow/examples/learn/saved_simple_url_classification.py /tmp/saved_rnn

My serving command:

bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --model_name=predict_url --model_base_path=/tmp/saved_rnn/ --logtostderr --logdir logs &> grpc_log

My gRPC command:

python url_classify_client.py --server=localhost:9000
