ResourceExhaustedError: OOM when allocating tensor

Posted 2024-04-25 22:54:24


I made my own AlexNet implementation with a reduced fully connected layer to classify 102 classes of flowers. My training set contains 11,000 images, while the validation and test sets each have 3,000 images. I wrote the three datasets to disk in HDF5 format, reloaded them, and tried to pass the images through the network with a batch size of 8 for 75 epochs. However, I get a memory error.

I have already tried reducing the batch size to 8 and the image dimensions to 400x400 (originally 500x500), but it did not help.

import pandas as pd
import config
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from AlexNet import AlexNet
from preproce import ImageToArrayPreprocessor
from preproce import AspectAwarePreprocessor
from preproce import FCHeadNet
from preproce import HDF5DatasetGenerator
from preproce import HDF5DatasetWriter
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import SGD
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.layers import Input
from tensorflow.python.keras.models import Model
from imutils import paths
import numpy as np
import argparse
import cv2
import os



"""aug = ImageDataGenerator(rotation_range=30, width_shift_range=0.1,
                          height_shift_range=0.1, shear_range=0.2, zoom_range=0.2,
                          horizontal_flip=True, fill_mode="nearest")"""
"""print("[INFO] loading images...")
trainPaths = list(paths.list_images(config.IMAGES_PATH))
dataset = pd.read_csv("train.csv")
labels = dataset.iloc[:, 1].values
le = LabelEncoder()
trainLabels = le.fit_transform(labels)

split = train_test_split(trainPaths, trainLabels,
                          test_size=config.NUM_TEST_IMAGES, stratify=trainLabels,
                          random_state=42)
(trainPaths, testPaths, trainLabels, testLabels) = split 

split = train_test_split(trainPaths, trainLabels,
                         test_size=config.NUM_VAL_IMAGES, stratify=trainLabels,random_state=42)
(trainPaths, valPaths, trainLabels, valLabels) = split

datasets = [ ("train", trainPaths, trainLabels, config.TRAIN_HDF5),
             ("val", valPaths, valLabels, config.VAL_HDF5),
             ("test", testPaths, testLabels, config.TEST_HDF5)]



for (dType, paths, labels, outputPath) in datasets: 
    print("[INFO] building {}...".format(outputPath))
    writer = HDF5DatasetWriter((len(paths), 500, 500, 3), outputPath) 
    for (i, (path, label)) in enumerate(zip(paths, labels)): 
        image = cv2.imread(path) 
        image = aap.preprocess(image) 
        writer.add([image], [label])
    writer.close()"""
#aap = AspectAwarePreprocessor(500, 500)
iap = ImageToArrayPreprocessor()
# Stream batches of 8 images straight from the HDF5 datasets instead of
# loading everything into memory at once.
trainGen = HDF5DatasetGenerator(config.TRAIN_HDF5, 8, preprocessors=[iap], classes=102)
valGen = HDF5DatasetGenerator(config.VAL_HDF5, 8, preprocessors=[iap], classes=102)




print("[INFO] compiling model...")
opt = RMSprop(lr=0.001)
model=AlexNet.build(500,500,3,102)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"]) 
print("[INFO] training head...")

# 75 epochs at batch size 8, reading batches from the HDF5 generators
model.fit_generator(
         trainGen.generator(),
         steps_per_epoch=trainGen.numImages // 8,
         validation_data=valGen.generator(),
         validation_steps=valGen.numImages // 8,
         epochs=75,
         max_queue_size=8 * 2, verbose=1)
print("[INFO] serializing model...")
model.save(config.MODEL_PATH, overwrite=True) 
trainGen.close()
valGen.close()

tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2019-08-23 00:19:47.336560: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties: name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62 pciBusID: 0000:01:00.0 totalMemory: 4.00GiB freeMemory: 3.30GiB
2019-08-23 00:19:47.342432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0
2019-08-23 00:19:47.900540: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-08-23 00:19:47.904687: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0
2019-08-23 00:19:47.907033: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0: N
2019-08-23 00:19:47.909380: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3007 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
2019-08-23 00:19:48.550001: W tensorflow/core/framework/allocator.cc:124] Allocation of 822083584 exceeds 10% of system memory.
2019-08-23 00:19:49.089904: W tensorflow/core/framework/allocator.cc:124] Allocation of 822083584 exceeds 10% of system memory.
2019-08-23 00:19:49.629533: W tensorflow/core/framework/allocator.cc:124] Allocation of 822083584 exceeds 10% of system memory.
2019-08-23 00:19:50.067994: W tensorflow/core/framework/allocator.cc:124] Allocation of 822083584 exceeds 10% of system memory.
2019-08-23 00:19:50.523258: W tensorflow/core/framework/allocator.cc:124] Allocation of 822083584 exceeds 10% of system memory.
Epoch 1/75
2019-08-23 00:20:14.632764: I tensorflow/stream_executor/dso_loader.cc:152] successfully opened CUDA library cublas64_100.dll locally
2019-08-23 00:20:16.325917: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.14GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.410374: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 836.38MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.650565: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 429.27MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.716695: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.22GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.733003: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 637.52MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.782250: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 844.88MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:16.792756: W tensorflow/core/common_runtime/bfc_allocator.cc:211] Allocator (GPU_0_bfc) ran out of memory trying to allocate 429.27MiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2019-08-23 00:20:25.135977: W tensorflow/core/common_runtime/bfc_allocator.cc:267] Allocator (GPU_0_bfc) ran out of memory trying to allocate 784.00MiB. Current allocation summary follows.
2019-08-23 00:20:25.143913: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (256): Total Chunks: 104, Chunks in use: 99. 26.0KiB allocated for chunks. 24.8KiB in use in bin. 452B client-requested in use in bin.
2019-08-23 00:20:25.150353: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (512): Total Chunks: 16, Chunks in use: 14. 8.0KiB allocated for chunks. 7.0KiB in use in bin. 5.3KiB client-requested in use in bin.
2019-08-23 00:20:25.160812: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (1024): Total Chunks: 49, Chunks in use: 49. 61.3KiB allocated for chunks. 61.3KiB in use in bin. 60.1KiB client-requested in use in bin.
2019-08-23 00:20:25.169944: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (2048): Total Chunks: 4, Chunks in use: 4. 13.0KiB allocated for chunks. 13.0KiB in use in bin. 12.8KiB client-requested in use in bin.
2019-08-23 00:20:25.182025: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (4096): Total Chunks: 1, Chunks in use: 0. 6.3KiB allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.192454: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (8192): Total Chunks: 1, Chunks in use: 0. 15.0KiB allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.200847: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (16384): Total Chunks: 9, Chunks in use: 9. 144.8KiB allocated for chunks. 144.8KiB in use in bin. 144.0KiB client-requested in use in bin.
2019-08-23 00:20:25.209817: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (32768): Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.219192: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (65536): Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.228194: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (131072): Total Chunks: 9, Chunks in use: 9. 1.17MiB allocated for chunks. 1.17MiB in use in bin. 1.16MiB client-requested in use in bin.
2019-08-23 00:20:25.236088: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (262144): Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.245435: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (524288): Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.254114: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (1048576): Total Chunks: 8, Chunks in use: 7. 12.25MiB allocated for chunks. 11.22MiB in use in bin. 10.91MiB client-requested in use in bin.
2019-08-23 00:20:25.264209: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (2097152): Total Chunks: 14, Chunks in use: 14. 42.09MiB allocated for chunks. 42.09MiB in use in bin. 42.09MiB client-requested in use in bin.
2019-08-23 00:20:25.273799: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (4194304): Total Chunks: 13, Chunks in use: 13. 80.41MiB allocated for chunks. 80.41MiB in use in bin. 77.91MiB client-requested in use in bin.
2019-08-23 00:20:25.285089: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (8388608): Total Chunks: 13, Chunks in use: 13. 141.14MiB allocated for chunks. 141.14MiB in use in bin. 136.45MiB client-requested in use in bin.
2019-08-23 00:20:25.298520: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (16777216): Total Chunks: 4, Chunks in use: 4. 112.98MiB allocated for chunks. 112.98MiB in use in bin. 112.98MiB client-requested in use in bin.
2019-08-23 00:20:25.306979: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (33554432): Total Chunks: 4, Chunks in use: 4. 183.11MiB allocated for chunks. 183.11MiB in use in bin. 183.11MiB client-requested in use in bin.
2019-08-23 00:20:25.315121: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (67108864): Total Chunks: 1, Chunks in use: 0. 82.18MiB allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.322194: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (134217728): Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-08-23 00:20:25.331550: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (268435456): Total Chunks: 3, Chunks in use: 3. 2.30GiB allocated for chunks. 2.30GiB in use in bin. 2.30GiB client-requested in use in bin.
2019-08-23 00:20:25.342419: I tensorflow/core/common_runtime/bfc_allocator.cc:613] Bin for 784.00MiB was 256.00MiB, Chunk State:
tensorflow/core/common_runtime/bfc_allocator.cc:645] Sum Total of in-use chunks: 2.87GiB
2019-08-23 00:20:50.049508: I tensorflow/core/common_runtime/bfc_allocator.cc:647] Stats:
Limit:        3153697177
InUse:        3086482944
MaxInUse:     3153574400
NumAllocs:           388
MaxAllocSize:  822083584

2019-08-23 00:20:50.061236: W tensorflow/core/common_runtime/bfc_allocator.cc:271] **************************************************************************************************__
2019-08-23 00:20:50.066546: W tensorflow/core/framework/op_kernel.cc:1401] OP_REQUIRES failed at cwise_ops_common.cc:70 : Resource exhausted: OOM when allocating tensor with shape[50176,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Traceback (most recent call last):
  File "train.py", line 80, in <module>
    max_queue_size=8 * 2, verbose=1)
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1426, in fit_generator
    initial_epoch=initial_epoch)
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\keras\engine\training_generator.py", line 191, in model_iteration
    batch_outs = batch_function(*batch_data)
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1191, in train_on_batch
    outputs = self._fit_function(ins)  # pylint: disable=not-callable
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\keras\backend.py", line 3076, in __call__
    run_metadata=self.run_metadata)
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\client\session.py", line 1439, in __call__
    run_metadata_ptr)
  File "C:\Users\aleem\Anaconda3\envs\tensorflowf\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50176,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/RMSprop/gradients/loss/kernel/Regularizer_5/Square_grad/Mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

     [[{{node ConstantFoldingCtrl/loss/activation_6_loss/broadcast_weights/assert_broadcastable/AssertGuard/Switch_0}}]]

Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


1 Answer

Answered by a forum user, 2024-04-25 22:54:24

This happens because GPU memory cannot be allocated freely for training, which can be caused by loading the whole dataset into memory (if it is not processed in batches). But you are already using fit_generator, so we can rule that out, since it feeds the data to training in batches while generating those batches in parallel.
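For reference, here is a minimal sketch of what such a batch generator does, assuming the HDF5 files contain "images" and "labels" datasets with integer class labels (as the HDF5DatasetWriter in the question appears to create). This is not the HDF5DatasetGenerator class used above, only an illustration of why just one batch at a time has to fit in memory:

import h5py
import numpy as np

def hdf5_batch_generator(db_path, batch_size=8, num_classes=102):
    # h5py reads slices lazily, so only the current batch is ever held in RAM.
    db = h5py.File(db_path, "r")
    num_images = db["images"].shape[0]
    while True:  # Keras generators are expected to loop forever
        for i in range(0, num_images, batch_size):
            images = db["images"][i:i + batch_size].astype("float32")
            labels = db["labels"][i:i + batch_size].astype("int")
            # one-hot encode the integer labels for categorical_crossentropy
            yield images, np.eye(num_classes, dtype="float32")[labels]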

The solution is to check which process is occupying your GPU. If you have an NVIDIA GPU you can see the processes using it with nvidia-smi, or you can try ps -fA | grep python. That shows which processes are running and holding the GPU. Just take the process ID from the PID column and terminate the process with kill -9 PID, then re-run the training; this time your GPU is free. I ran into the same problem, and clearing the GPU helped me (a rough Python equivalent of these commands is sketched after the note below).

  • Note: all of these commands are meant to be run in a terminal.
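A rough Python equivalent of the commands above, assuming a Unix-like system with the NVIDIA driver installed (nvidia-smi on the PATH); the PID shown is only a placeholder:

import os
import signal
import subprocess

# List the processes currently holding GPU memory (the same listing you get
# from running nvidia-smi in a terminal).
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)

# Placeholder PID: replace it with the one reported by nvidia-smi, then
# uncomment the last line to terminate that process (equivalent to kill -9 PID).
stale_pid = 12345
# os.kill(stale_pid, signal.SIGKILL)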
