mxnet：如何设置自定义mx.io.DataIter预取？

def launchJobForDate(date_str):
    """Produce (if needed), load, and enqueue the data file for ``date_str``.

    This function gets called via multiprocessing to produce new data by
    calling a C++ binary whenever the data queue is empty, so that more data
    needs to be produced.

    The loaded array is put on ``data_queue`` as ``(array, date_str)``.
    Any exception escaping the body is printed, not re-raised.

    NOTE(review): the original paste was collapsed onto a single line; the
    nesting below (in particular that the load loop sits outside the ``if``)
    is reconstructed from statement order -- confirm against the real file.
    """
    try:
        f = "testdata/data%s.npy" % date_str
        if not os.path.isfile(f):
            cmd = CMD % (date_str, JSON_FILE, date_str, date_str, date_str)
            # Retry the binary until it exits successfully.
            # NOTE(review): unbounded retry with no back-off; a permanently
            # failing command spins forever.
            while True:
                try:
                    output = subprocess.check_output(cmd, shell=True)
                    break
                except Exception:
                    # Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt/SystemExit and made the worker
                    # impossible to interrupt.
                    pass
        # Retry loading until the file can be read.
        while True:
            try:
                d = np.load(f)
                break
            except Exception:
                # Same narrowing as above: keep retrying on load errors but
                # let interpreter-exit signals propagate.
                pass
        data_queue.put((d, date_str))
    except Exception as ex:
        print("launchJobForDate: ERROR ", ex)


class ProduceDataIter(mx.io.DataIter):
    """Custom ``mx.io.DataIter`` that serves batches from ``data_queue``.

    Processed items are buffered in the list ``self.z``; when it runs dry,
    ``__next__`` pulls the next raw item off ``data_queue`` and processes it.
    """

    @staticmethod
    def processData(d, time_steps, num_inputs):
        """Turn one raw queue item into a list of per-batch tuples.

        Returns a list of ``(bigX, bigY, bigEvalY, dates)`` element tuples
        built by the elided processing code. On any exception the error is
        printed and ``None`` is returned implicitly.
        """
        try:
            # ...processes data...
            return [z for z in zip(bigX, bigY, bigEvalY, dates)]
        except Exception as ex:
            print("processData: ERROR ", ex)

    def __init__(self, num_mgrs, end_date_str):
        ## iter stuff
        self.preprocess_threads = 4
        self.prefetch_buffer = 1
        ## set up internal data to preserve state
        ## and make a list of dates for which to run binary

    @property
    def provide_data(self):
        """Describe the single input: name ``seq_var``, layout ``NTC``."""
        return [mx.io.DataDesc(name='seq_var',
                               shape=(args_batch_size * GPU_COUNT,
                                      self.time_steps,
                                      self.num_inputs),
                               layout='NTC')]

    @property
    def provide_label(self):
        """Describe the three label arrays returned with each batch."""
        # NOTE(review): ``(args_batch_size * GPU_COUNT)`` is a parenthesised
        # int, not a 1-tuple -- probably meant ``(..., )``; confirm.
        # NOTE(review): the name 'bd_return' is used for two descriptors --
        # confirm whether the second should have a distinct name.
        return [mx.io.DataDesc(name='bd_return',
                               shape=(args_batch_size * GPU_COUNT)),
                mx.io.DataDesc(name='bd_return',
                               shape=(args_batch_size * GPU_COUNT, num_y_cols)),
                mx.io.DataDesc(name='date',
                               shape=(args_batch_size * GPU_COUNT))]

    def __next__(self):
        """Return the next ``mx.io.DataBatch``.

        The first element of the buffered tuple becomes ``data`` and the
        remaining elements become ``label``.
        """
        try:
            z = self.z.pop(0)
            data = z[0:1]
            label = z[1:]
            return mx.io.DataBatch(data, label)
        except Exception as ex:
            ### if self.z (a list) has no elements to pop we need
            ### to get more data off the queue, process it, and put it
            ### on self.x so it's ready for calls to __next__()
            # NOTE(review): "self.x" above presumably means self.z -- confirm.
            while True:
                try:
                    d = data_queue.get_nowait()
                    processedData = ProduceDataIter.processData(
                        d, self.time_steps, self.num_inputs)
                    self.z.extend(processedData)
                    counter_queue.put(counter_queue.get() - 1)
                    z = self.z.pop(0)
                    data = z[0:1]
                    label = z[1:]
                    return mx.io.DataBatch(data, label)
                except queue.Empty:
                    # ...this is where new jobs to produce new data and put them
                    # ...on the queue would happen if nothing is left on the queue
                    ...

1条回答

网友

1楼 · 发布于 2024-04-19 01:02:30

正如您已经提到的，gluon的DataLoader本身就支持预取。您的自定义DataIter使用Numpy数组作为输入，因此可以改用DataLoader，执行以下操作：

# Path of the NumPy file generated for this date.
f = "testdata/data%s.npy" % date_str
data = np.load(f)

# Wrap the loaded array in a Gluon dataset so that DataLoader can batch it.
train = gluon.data.ArrayDataset(mx.nd.array(data))

# Build the loader over that dataset: shuffled, 4 workers, 'rollover' last batch.
train_iter = gluon.data.DataLoader(
    train,
    batch_size=batch_size,
    shuffle=True,
    last_batch='rollover',
    num_workers=4,
)

因为您是动态创建数据的，所以可以尝试在每个epoch重置DataLoader并加载一个新的Numpy数组。如果GPU利用率仍然很低，请尝试增加批处理大小和工作线程数。另一个问题也可能是数据集的大小。重置DataLoader会带来额外开销，影响性能；使用更大的数据集可以延长每个epoch的时间，从而摊薄重置的开销、提高整体性能。

相关问题更多 >

编程相关推荐

热门问题

热门文章