<p>我发现有一个不明智的解决办法。
有解决方案代码:</p>
<pre class="lang-py prettyprint-override"><code>import tensorflow as tf
from tensorflow.keras import backend as K
def slice_batch(x, n_gpus, part):
sh = K.shape(x)
L = sh[0] // n_gpus
if part == n_gpus - 1:
return x[part * L:]
return x[part * L:(part + 1) * L]
def multi_gpu_wrapper(single_model, num_gpu):
inputs = single_model.inputs
towers = []
splited_layer = tf.keras.layers.Lambda(lambda x: slice_batch(x, num_gpu, gpu_id))
concate_layer = tf.keras.layers.Concatenate(axis=0)
with tf.device('/cpu:0'):
for gpu_id in range(num_gpu):
cur_inputs = []
for input in inputs:
cur_inputs.append(
splited_layer(input)
)
towers.append(single_model(cur_inputs))
print towers[-1]
outputs = []
num_output = len(towers[-1])
with tf.device('/cpu:0'):
for i in range(num_output):
tmp_outputs = []
for j in range(num_gpu):
tmp_outputs.append(towers[j][i])
outputs.append(concate_layer(tmp_outputs))
multi_gpu_model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
return multi_gpu_model
if __name__ == '__main__':
import config
import os
import numpy as np
gpu_ids = "0,1,3"
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids
with tf.device('/cpu:0'):
input1 = tf.keras.layers.Input(config.input_shape)
input2 = tf.keras.layers.Input(config.input_shape)
sub_model = tf.keras.applications.VGG16(include_top=False, weights=config.VGG_MODEL_PATH,
input_shape=config.input_shape)
output1 = sub_model(input1)
output2 = sub_model(input2)
model = tf.keras.Model(inputs=[input1, input2], outputs=[output1, output2])
multi_gpu_model = multi_gpu_wrapper(model, 3)
multi_gpu_model.compile('sgd', loss=['mse', 'mse'])
multi_gpu_model.fit([np.random.random([1000, 128, 128, 3]), np.random.random([1000, 128, 128, 3])],
[np.random.random([1000, 4, 4, 512]), np.random.random([1000, 4, 4, 512])], batch_size=128)
</code></pre>
<p>但是,我发现这个解决方案中GPU的使用率很低。在</p>