PyTorch数据集和Conv1d使用大量内存

def __init__(self, csv_file, train):
    """Load a tab-separated file and eagerly build every (sample, label) pair.

    All pairs are materialised up front and kept in ``self.sample_list``.

    Args:
        csv_file: Path or buffer handed to ``pd.read_csv``.
        train: Flag stored unchanged on ``self.train``.
    """
    self.train = train
    self.df_tmp = pd.read_csv(csv_file, header=None, sep='\t')
    # Drop, in place, the column labelled with the last column's position.
    last_column = self.df_tmp.shape[1] - 1
    self.df_tmp.drop(last_column, axis=1, inplace=True)
    self.df = self.df_tmp.transpose()

    self.sample_list = []
    row_count = self.df.shape[0]  # num rows, 33 million ish
    for row_idx in range(row_count):
        # First value of the row is the label, the remainder is the sample.
        sample = torch.tensor([self.df.iloc[row_idx][1:].values])
        label = torch.tensor(self.df.iloc[row_idx][0])
        self.sample_list.append((sample, label))

def __len__(self):
    """Return how many (sample, label) pairs were built in ``__init__``."""
    return len(self.sample_list)

def __getitem__(self, idx):
    """Return the prebuilt (sample, label) pair stored at position ``idx``."""
    return self.sample_list[idx]

# input batch shape is (9 x 33889258 x 1)
def __init__(self):
    """Create the convolution, pooling and fully connected layers."""
    super(CNN, self).__init__()
    # input channels 1, output 3
    self.conv1 = torch.nn.Conv1d(
        1, out_channels=3, kernel_size=100, stride=10, padding=1
    )
    # size in is 3,1,33889258
    self.pool = torch.nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
    self.fc1 = torch.nn.Linear(45750366, 1000)  # 3 * 1 * 3388917
    self.fc2 = torch.nn.Linear(1000, 2)

def forward(self, x):
    """Run conv -> ReLU -> flatten -> fc1 -> ReLU -> fc2 and return the result.

    The input is cast to float before the convolution, and the activation
    is reshaped to a 1-D tensor of 45750366 elements before ``fc1``.
    """
    # size: (1x1x33889258) to (3x1x33889258)
    conv_out = self.conv1(x.float())
    activated = F.relu(conv_out)
    # Pooling is currently disabled: x = self.pool(x)
    # whatever shape comes out of here needs to go into x.view
    flat = activated.view(45750366)  # -1, 1*1*3388927
    hidden = F.relu(self.fc1(flat))
    return self.fc2(hidden)

1条回答

网友

1楼 · 发布于 2024-04-25 10:10:50

您将所有数据点都存储在一个列表中（也就是全部放在内存里），这在某种程度上违背了自定义数据集/数据加载器的初衷。您应该只在dataset类中保留对dataframe的引用，并在每次按索引取数据时再返回对应的数据，例如：

def __init__(self, csv_file, train):
    """Read the tab-separated file and keep it as a transposed DataFrame.

    Only the frames are stored here; no per-row tensors are built.

    Args:
        csv_file: Path or buffer passed to ``pd.read_csv``.
        train: Flag stored unchanged on ``self.train``.
    """
    self.train = train
    raw = pd.read_csv(csv_file, header=None, sep='\t')
    self.df_tmp = raw
    # Drop, in place, the column labelled with the last column's position.
    last_column = raw.shape[1] - 1
    raw.drop(last_column, axis=1, inplace=True)
    self.df = raw.transpose()

def __len__(self):
    """Return the number of rows held in ``self.df``."""
    return len(self.df)

def __getitem__(self, idx):
    """Build the (sample, label) tensor pair for row ``idx`` on demand.

    The first value of the row is used as the label and the remaining
    values as the sample.

    Args:
        idx: Integer position of the row in ``self.df``.

    Returns:
        A tuple ``(sample, label)`` where ``sample`` has shape
        ``(1, n_values - 1)`` and ``label`` is a 0-dim tensor.
    """
    # Look the row up once instead of once per field.
    row = self.df.iloc[idx]
    # Add the leading axis on the NumPy side: wrapping the array in a
    # Python list sends torch.tensor down its slow list-of-ndarrays path.
    features = row.iloc[1:].to_numpy()[None, :]
    sample = torch.tensor(features)
    label = torch.tensor(row.iloc[0])
    return sample, label

一个小提示：您目前是从dataset的`__getitem__`方法中返回张量；其实直接返回纯numpy数组会更简单，因为dataloader会自动将其转换为PyTorch张量。

相关问题更多 >

编程相关推荐

热门问题

热门文章