我使用 4 个 GPU,并使用 pytorch DDP 来加速训练。下面是我的数据集代码
import cv2
import ast
import torch
import numpy as np
import random
from glob import glob
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler
cv2.setNumThreads(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VimeoDataset(Dataset):
def __init__(self, mode): ##mode = 'train' means train mode or 'val' means val mode
self.h = 256
self.w = 448
self.mode = mode
self.trainlist = glob('/data/vimeoFlow2/dataset/train/*.npz')
self.vallist = glob('/data/vimeoFlow2/dataset/val/*.npz')
xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0) #xx shape is(256,448)
yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1) #yy shape is(448,256)
self.grid = np.stack((xx,yy),2).copy()
self.npzs = []
if self.mode == 'train':
self.npzs = self.trainlist[:80]
else:
self.npzs = self.vallist[:20]
def __len__(self):
return len(self.npzs)
def aug(self, img0, gt, img1, flow_gt, h, w):
ih, iw, _ = img0.shape
x = np.random.randint(0, ih - h + 1)
y = np.random.randint(0, iw - w + 1)
img0 = img0[x:x+h, y:y+w, :] #random generate image slices, img shape is (224,224,:)
img1 = img1[x:x+h, y:y+w, :]
gt = gt[x:x+h, y:y+w, :]
flow_gt = flow_gt[x:x+h, y:y+w, :]
return img0, gt, img1, flow_gt
def getimg(self, index):
f = np.load(self.npzs[index])
data = f['i0i1gt']
if self.mode == 'train':
flow_data = f['ft0ft1']
else:
flow_data = np.zeros((256, 448, 4))
img0 = data[:,:,0:3]
img1 = data[:,:,3:6]
gt = data[:,:,6:9]
flow_gt = flow_data
return img0, gt, img1, flow_gt
def __getitem__(self, index):
img0, gt, img1, flow_gt = self.getimg(index)
if self.mode == 'train':
img0, gt, img1, flow_gt = self.aug(img0, gt, img1, flow_gt, 224, 224)
flow_gt = torch.from_numpy(flow_gt.copy()).permute(2, 0, 1)
img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
if random.uniform(0, 1) < 0.5:
img0 = img0[:, :, ::-1]
img1 = img1[:, :, ::-1]
gt = gt[:, :, ::-1]
if random.uniform(0, 1) < 0.5:
img0 = img0[::-1]
img1 = img1[::-1]
gt = gt[::-1]
flow_gt = flow_gt[::-1]
flow_gt = np.concatenate((flow_gt[:, :, 0:1], -flow_gt[:, :, 1:2], flow_gt[:, :, 2:3], -flow_gt[:, :, 3:4]), 2)
if random.uniform(0, 1) < 0.5:
img0 = img0[:, ::-1]
img1 = img1[:, ::-1]
gt = gt[:, ::-1]
flow_gt = flow_gt[:, ::-1]
flow_gt = np.concatenate((-flow_gt[:, :, 0:1], flow_gt[:, :, 1:2], -flow_gt[:, :, 2:3], flow_gt[:, :, 3:4]), 2)
if random.uniform(0, 1) < 0.5:
tmp = img1
img1 = img0
img0 = tmp
flow_gt = np.concatenate((flow_gt[:, :, 2:4], flow_gt[:, :, 0:2]), 2)
flow_gt = torch.from_numpy(flow_gt.copy()).permute(2, 0, 1)
img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
return torch.cat((img0, img1, gt), 0), flow_gt
我使用以下数据加载器。
dataset = VimeoDataset(mode = 'train')
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=0, pin_memory=True, drop_last=True, sampler=sampler)
dataset_val = VimeoDataset(mode = 'val')
val_data = DataLoader(dataset_val, batch_size=16, pin_memory=True, num_workers=0)
我使用以下启动命令。
python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 --node_rank=0 train.py --epoch=100 --batch_size=16
错误回溯如下。我的数据集分为train和val,分别是npz文件。由于内存大小的限制,我一次只能根据索引读取一个npz文件
training...
Traceback (most recent call last):
File "train.py", line 173, in <module>
train(model, args.local_rank)
File "train.py", line 72, in train
for i, data in enumerate(train_data):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 346, in __next__
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/code/RIFE/dataset.py", line 67, in __getitem__
img0 = img0[::-1]
ValueError: negative step not yet supported
File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launch.py", line 249, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', 'train.py', '--local_rank=3', '--epoch=100', '--batch_size=1']' returned non-zero exit status 1.