How to design the input layer / preprocess features for a model exported from TensorFlow 2.0 Keras


I created a model with TensorFlow-2.0-beta1. It uses the Keras functional API to perform regression on the input data. The data has categorical features that need one-hot encoding and numeric inputs that need normalization. In the past, using the Estimator API in TF 1.11, I could solve this with feature columns and by applying the engineering to the features in the serving input receiver. Is there a similar approach when exporting a model from Keras?
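
For context, this is roughly the TF 1.x Estimator pattern I mean; a from-memory sketch, with only two feature names shown for illustration:

def serving_input_receiver_fn():
    # Raw tensors exactly as they arrive in a serving request.
    receiver_tensors = {
        'ProductBrand': tf.compat.v1.placeholder(tf.string, [None]),
        'UnitCostPrice': tf.compat.v1.placeholder(tf.float32, [None]),
    }
    # The feature columns defined on the Estimator then applied the
    # one-hot encoding and normalization to these raw features.
    return tf.estimator.export.ServingInputReceiver(receiver_tensors,
                                                    receiver_tensors)

# estimator.export_saved_model(export_dir, serving_input_receiver_fn)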

import os
import datetime
import pickle

import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

tf.keras.backend.clear_session()  # For easy reset of notebook state.

VERSION = tf.__version__
CWD = os.getcwd()
PARENT_DIR = os.path.split(CWD)[0]
DATETIME = datetime.datetime.utcnow()
DATA_DIR = os.path.join(PARENT_DIR, 'data')
train_file_path = os.path.join(DATA_DIR, 'traindf.csv')
test_file_path = os.path.join(DATA_DIR, 'testdf.csv')

# CATEGORIES: dict mapping each categorical column to its list of category labels.
CATEGORIES_PATH = os.path.join(DATA_DIR, "CATEGORIES")
with open(CATEGORIES_PATH, 'rb') as fileObject:
    CATEGORIES = pickle.load(fileObject)

# NUMERICSTATS: dict mapping each numeric column to its {'mean': ..., 'std': ...}.
NUMERICSTATS_PATH = os.path.join(DATA_DIR, "NUMERICSTATS")
with open(NUMERICSTATS_PATH, 'rb') as fileObject:
    NUMERICSTATS = pickle.load(fileObject)


# CSV columns in the input file.
with open(train_file_path, 'r') as f:
    names_row = f.readline()

CSV_COLUMNS = names_row.rstrip('\n').split(',')
print(CSV_COLUMNS)


drop_columns = ['SubSilo','Year','StockID', 'QuickRef', 'sumUKQuantity', 'sumNonUKQuantity']
columns_to_use = [col for col in CSV_COLUMNS if col not in drop_columns]

print(columns_to_use)


LABEL_COLUMN = 'totalqty'
FEATURE_COLUMNS = [column for column in columns_to_use if column != LABEL_COLUMN]

# DataFrames backing the CSVs (traindf/testdf are referenced below).
traindf = pd.read_csv(train_file_path)
testdf = pd.read_csv(test_file_path)
test_labels = testdf[LABEL_COLUMN]


COLUMN_DEFAULTS = [tf.dtypes.string, #ProductBrand
                  tf.dtypes.string, #Department
                  tf.dtypes.string, #ProductType
                  tf.dtypes.string, #ProductSubType
                  tf.dtypes.string, #Silo
                  tf.dtypes.string, #Level
                  tf.dtypes.string, #BaseColour
                  tf.dtypes.string, #Sport
                  tf.dtypes.string, #UKSize
                  tf.dtypes.float32, #UnitCostPrice
                  tf.dtypes.float32, #ExVatSalesValue
                  tf.dtypes.float32, #RRP_GBP
                  tf.dtypes.string, #Week
                  tf.dtypes.int32] #totalqty

def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=60, # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        select_columns=columns_to_use,
        column_defaults=COLUMN_DEFAULTS,
        num_epochs=1,
        ignore_errors=True,
        shuffle=False)
    return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)
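
Peeking at one raw batch is a cheap way to confirm the columns parse with the expected dtypes:

for features, labels in raw_train_data.take(1):
    for name, values in features.items():
        print(name, values.dtype, values.shape)
    print('labels', labels.dtype, labels.shape)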


def process_categorical_data(data, categories):
    """Returns a one-hot encoded tensor representing categorical values."""

    # Remove leading ' '.
    data = tf.strings.regex_replace(data, '^ ', '')
    # Remove trailing '.'.
    data = tf.strings.regex_replace(data, r'\.$', '')

    # ONE HOT ENCODE
    # Reshape data from 1d (a list) to a 2d (a list of one-element lists)
    data = tf.reshape(data, [-1, 1])
    # For each element, create a new list of boolean values the length of categories,
    # where the truth value is element == category label
    data = tf.equal(categories, data)
    # Cast booleans to floats.
    data = tf.cast(data, tf.float32)

    # The entire encoding can fit on one line:
    # data = tf.cast(tf.equal(categories, tf.reshape(data, [-1, 1])), tf.float32)
    return data


def process_continuous_data(data, mean, std):
    """Returns a normalized (zero-mean, unit-std) 2d float tensor."""
    data = (tf.cast(data, tf.float32) - mean) / std
    return tf.reshape(data, [-1, 1])
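

A quick sanity check of the two helpers on a toy batch (values are made up):

toy_cats = tf.constant(['red', 'blue'])
print(process_categorical_data(toy_cats, ['red', 'green', 'blue']))
# -> [[1. 0. 0.]
#     [0. 0. 1.]]

toy_nums = tf.constant([10.0, 20.0])
print(process_continuous_data(toy_nums, mean=15.0, std=5.0))
# -> [[-1.]
#     [ 1.]]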


def preprocess(features, labels):
    # Process categorical features.
    for feature in CATEGORIES.keys():
        features[feature] = process_categorical_data(features[feature], CATEGORIES[feature])


    # Process continuous features.
    for feature in NUMERICSTATS.keys():
        features[feature] = process_continuous_data(features[feature],
                                                    NUMERICSTATS[feature]['mean'],
                                                    NUMERICSTATS[feature]['std']
                                                   )


    # Assemble features into a single tensor.
    features = tf.concat([features[column] for column in FEATURE_COLUMNS], 1)

    return features, labels


train_data = raw_train_data.map(preprocess).shuffle(len(traindf))
test_data = raw_test_data.map(preprocess)


def get_model(input_dim):
    """Create a Keras model with layers.

    Args:
        input_dim: (int) The shape of an item in a batch. 

    Returns:
        A Keras model.
    """

    inputs = tf.keras.Input(shape=(input_dim,))
    x = tf.keras.layers.Dense(244, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(inputs)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(200, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(50, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inputs, outputs)

    return model


input_shape, output_shape = train_data.output_shapes
input_dimension = input_shape.dims[1] # [0] is the batch size

model = get_model(input_dimension)


optimizer = tf.keras.optimizers.Adam(0.001)

model.compile(loss='mse',
            optimizer=optimizer,
            metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()])


# The patience parameter is the number of epochs to wait for improvement before stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

# Display training progress by printing a single dot for each completed epoch
class PrintDot(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end='')

tensor_board = tf.keras.callbacks.TensorBoard(log_dir=os.path.join(PARENT_DIR, 'tensorBoardLogs'))

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                 factor=0.2,
                                                 patience=4,
                                                 verbose=1,
                                                 min_lr=0.00001)

history = model.fit(train_data,
                    validation_data=test_data,
                    epochs=100,
                    verbose=1,
                    callbacks=[early_stop,
                               PrintDot(),
                               tensor_board,
                               reduce_lr]
                   )
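
To inspect the training curves afterwards, the history object can be dropped into a DataFrame:

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())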


tf.keras.experimental.export_saved_model(model, saved_model_path=os.path.join(PARENT_DIR, 'models/1'))

I want a model that I can use with TensorFlow Serving, one that takes the features as they exist in my training data (13 of them) and preprocesses them inside the model itself, so a middleman like Flask would not be needed.
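
The closest I have come up with is the sketch below: a second "serving" model whose inputs are the raw features, with the same two helper functions applied inside the graph via Lambda layers, feeding the already-trained network. It is untested against this data and assumes the Lambda-wrapped string ops serialize cleanly on export (the serving_only flag may be needed); later TF 2.x versions add Keras preprocessing layers for exactly this, but they are not in 2.0-beta1.

# Assumes every feature appears in CATEGORIES or NUMERICSTATS, as preprocess() does.
raw_inputs = []
encoded = []
for feature in FEATURE_COLUMNS:  # same order as the training-time concat
    if feature in CATEGORIES:
        inp = tf.keras.Input(shape=(1,), dtype=tf.string, name=feature)
        # f=feature pins the loop variable inside the lambda (avoids late binding).
        x = tf.keras.layers.Lambda(
            lambda t, f=feature: process_categorical_data(t, CATEGORIES[f]))(inp)
    else:
        inp = tf.keras.Input(shape=(1,), dtype=tf.float32, name=feature)
        x = tf.keras.layers.Lambda(
            lambda t, f=feature: process_continuous_data(
                t, NUMERICSTATS[f]['mean'], NUMERICSTATS[f]['std']))(inp)
    raw_inputs.append(inp)
    encoded.append(x)

# Same concatenation as preprocess(), so the dimension matches input_dimension.
full_features = tf.keras.layers.concatenate(encoded, axis=1)
predictions = model(full_features)  # reuse the trained weights
serving_model = tf.keras.Model(raw_inputs, predictions)

tf.keras.experimental.export_saved_model(
    serving_model,
    saved_model_path=os.path.join(PARENT_DIR, 'models/2'),
    serving_only=True)

If the export succeeds, saved_model_cli show --dir models/2 --all should list one named input per raw feature, which is the request shape TensorFlow Serving expects.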

