tf.data：“混合”批次大小？

import tensorflow as tf
import numpy as np

# Question's pipeline: every sample is zipped with the large context array
# first and the pairs are batched afterwards, so batch(32) also stacks the
# context 32 times per batch.

# Small data input
x = np.arange(100)
y = np.arange(100)

# Large context array for both x and y
context_x = np.random.rand(1, 10000, 10)
context_y = np.random.rand(1, 10000, 10)

# Create datasets
dataset_x = tf.data.Dataset.from_tensor_slices(x)
dataset_y = tf.data.Dataset.from_tensor_slices(y)

# same context should be repeated for every data item
dataset_context_x = tf.data.Dataset.from_tensor_slices(context_x)
dataset_context_x = dataset_context_x.repeat()
dataset_context_y = tf.data.Dataset.from_tensor_slices(context_y)
dataset_context_y = dataset_context_y.repeat()

# Pair each sample with its context, then append the y pairs after the x pairs.
dataset = tf.data.Dataset.zip((dataset_x, dataset_context_x))
dataset = dataset.concatenate(
    tf.data.Dataset.zip((dataset_y, dataset_context_y))
)
# Batching happens after zipping, so both tuple components are batched.
dataset = dataset.batch(32)

iterator = dataset.make_initializable_iterator()
(x_iter, context_iter) = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    # Drain the dataset; OutOfRangeError signals that it is exhausted.
    while True:
        try:
            xi, ci = sess.run([x_iter, context_iter])
            print(xi.shape, ci.shape)
        except tf.errors.OutOfRangeError:
            break

1条回答

网友

1楼 · 发布于 2024-05-26 07:46:48

好吧，这是我的初步解决方案。请注意，我假设您的数据是以某种方式排序的，这样当您构建 x 的批次时，您读取的下一个上下文（context）总是与当前批次相对应。

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # running on CPU
import tensorflow as tf
import numpy as np

# Per-sample inputs: two small 1-D arrays.
x = np.arange(100)
y = np.arange(100)

# One large context array for each of x and y.
context_x = np.random.rand(1, 10000, 10)
context_y = np.random.rand(1, 10000, 10)


def _pair_batches_with_context(samples, context, batch_size=32):
    """Zip batches of ``samples`` with an endlessly repeated ``context``.

    The samples are batched before zipping, so each dataset element is a
    ``(sample_batch, context_slice)`` pair and the context itself is never
    batched.
    """
    sample_batches = tf.data.Dataset.from_tensor_slices(samples).batch(batch_size)
    # repeat() is here just for demonstration purposes. Ideally you'll have
    # enough context data to match the batches.
    context_stream = tf.data.Dataset.from_tensor_slices(context).repeat()
    return tf.data.Dataset.zip((sample_batches, context_stream))


# This stacks all 'x' samples on top of all 'y' samples.
# Is this really what you wanted?
dataset = _pair_batches_with_context(x, context_x).concatenate(
    _pair_batches_with_context(y, context_y)
)

iterator = dataset.make_initializable_iterator()
x_iter, context_iter = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    exhausted = False
    while not exhausted:
        try:
            xi, ci = sess.run([x_iter, context_iter])
        except tf.errors.OutOfRangeError:
            # Raised once every element of the dataset has been consumed.
            exhausted = True
        else:
            print(xi.shape, ci.shape)

在您的实现中，请删除 `dataset_context_* = dataset_context_*.repeat()` 这两行。

与您的管道相比，关键区别在于：我先对 x 进行批处理，然后才将它与上下文 zip 到一起，这样上下文就不会被重复复制。但是，这要求您在处理数据加载时格外小心（因此才有我上面的假设）。

相关问题更多 >

编程相关推荐

热门问题

热门文章