Question

我使用 4 个 GPU，并使用 pytorch DDP 来加速训练。下面是我的数据集代码

import cv2
import ast
import torch
import numpy as np
import random
from glob import glob
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

cv2.setNumThreads(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VimeoDataset(Dataset):
    def __init__(self, mode): ##mode = 'train' means train mode or 'val' means val mode
        self.h = 256
        self.w = 448
        self.mode = mode
        self.trainlist = glob('/data/vimeoFlow2/dataset/train/*.npz')
        self.vallist = glob('/data/vimeoFlow2/dataset/val/*.npz')
        xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0) #xx shape is(256,448)
        yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1) #yy shape is(448,256)
        self.grid = np.stack((xx,yy),2).copy()
        self.npzs = []
        if self.mode == 'train':
            self.npzs = self.trainlist[:80]
        else:
            self.npzs = self.vallist[:20]

    def __len__(self):
        return len(self.npzs)

    def aug(self, img0, gt, img1, flow_gt, h, w):
        ih, iw, _ = img0.shape
        x = np.random.randint(0, ih - h + 1)
        y = np.random.randint(0, iw - w + 1)
        img0 = img0[x:x+h, y:y+w, :] #random generate image slices, img shape is (224,224,:) 
        img1 = img1[x:x+h, y:y+w, :]
        gt = gt[x:x+h, y:y+w, :]
        flow_gt = flow_gt[x:x+h, y:y+w, :]
        return img0, gt, img1, flow_gt

    def getimg(self, index):
        f = np.load(self.npzs[index])
        data = f['i0i1gt']
        if self.mode == 'train':
            flow_data = f['ft0ft1']
        else:
            flow_data = np.zeros((256, 448, 4))
        img0 = data[:,:,0:3]
        img1 = data[:,:,3:6]
        gt = data[:,:,6:9]
        flow_gt = flow_data
        return img0, gt, img1, flow_gt  
        
    def __getitem__(self, index):        
        img0, gt, img1, flow_gt = self.getimg(index)
        if self.mode == 'train':
            img0, gt, img1, flow_gt = self.aug(img0, gt, img1, flow_gt, 224, 224)
            flow_gt = torch.from_numpy(flow_gt.copy()).permute(2, 0, 1)
            img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
            img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
            gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)             
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, :, ::-1] 
                img1 = img1[:, :, ::-1]
                gt = gt[:, :, ::-1]
            if random.uniform(0, 1) < 0.5:
                img0 = img0[::-1]
                img1 = img1[::-1]
                gt = gt[::-1]
                flow_gt = flow_gt[::-1]
                flow_gt = np.concatenate((flow_gt[:, :, 0:1], -flow_gt[:, :, 1:2], flow_gt[:, :, 2:3], -flow_gt[:, :, 3:4]), 2)
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, ::-1]
                img1 = img1[:, ::-1]
                gt = gt[:, ::-1]
                flow_gt = flow_gt[:, ::-1]
                flow_gt = np.concatenate((-flow_gt[:, :, 0:1], flow_gt[:, :, 1:2], -flow_gt[:, :, 2:3], flow_gt[:, :, 3:4]), 2)
            if random.uniform(0, 1) < 0.5:
                tmp = img1
                img1 = img0
                img0 = tmp
                flow_gt = np.concatenate((flow_gt[:, :, 2:4], flow_gt[:, :, 0:2]), 2)
        flow_gt = torch.from_numpy(flow_gt.copy()).permute(2, 0, 1)
        img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
        img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
        gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
        return torch.cat((img0, img1, gt), 0), flow_gt

我使用以下数据加载器。

dataset = VimeoDataset(mode = 'train')
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=0, pin_memory=True, drop_last=True, sampler=sampler)
dataset_val = VimeoDataset(mode = 'val')
val_data = DataLoader(dataset_val, batch_size=16, pin_memory=True, num_workers=0)

我使用以下启动命令。

python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 --node_rank=0 train.py --epoch=100 --batch_size=16

错误回溯如下。我的数据集分为train和val，分别是npz文件。由于内存大小的限制，我一次只能根据索引读取一个npz文件

 training...
    Traceback (most recent call last):
      File "train.py", line 173, in <module>
        train(model, args.local_rank)
      File "train.py", line 72, in train
        for i, data in enumerate(train_data):
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 346, in __next__
        data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
        data = [self.dataset[idx] for idx in possibly_batched_index]
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
        data = [self.dataset[idx] for idx in possibly_batched_index]
      File "/code/RIFE/dataset.py", line 67, in __getitem__
        img0 = img0[::-1]
    ValueError: negative step not yet supported
    File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launch.py", line 249, in main
        cmd=cmd)
    subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', 'train.py', '--local_rank=3', '--epoch=100', '--batch_size=1']' returned non-zero exit status 1.

ValueError：尚不支持负步。如何解决

0 个答案: