运行PyTorch脚本的Slurm sbatch作业导致节点进入DRAIN(排空)状态;gres/gpu:节点node002的计数已从0更改为1

PyTorch脚本排放节点的Slurm sbatch;gres/gpu:节点node002的计数已从0更改为1,pytorch,slurm,Pytorch,Slurm,我们有一个用户,他的脚本总是消耗一个节点 请注意此错误:“gres/gpu:节点node002的计数已从0更改为1”,这可能会引起误解吗?什么会导致节点耗尽?以下是用户的SBATCH文件的内容。管道在这里会起作用吗?我刚才在键入时注意到的另一件事可能是试图混合使用库版本。因此,他有一个模块加载cuda10.0,然后是模块加载pytorch-py36-cuda10.1-gcc/1.3.1模块加载ml-pythondeps-py36-cuda10.1-gcc/3.0.0 python3.6 #!/b

我们有一个用户,他的脚本每次运行都会导致一个节点被置为DRAIN(排空)状态

请注意此错误:“
gres/gpu: count changed for node node002 from 0 to 1(gres/gpu:节点node002的计数已从0更改为1)
”,这条信息会不会有误导性?什么原因会导致节点被排空(drain)?以下是用户的SBATCH文件的内容。这里的管道(| tee)会不会有影响?我刚才在输入时还注意到另一件事:他可能混用了不同版本的库。他先执行了
module load cuda10.0
,然后又执行了
module load pytorch-py36-cuda10.1-gcc/1.3.1、module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0,最后运行 python3.6

#!/bin/sh
# Slurm batch script: requests one node, one task and one GPU, loads the
# toolchain modules, then runs the training script with the value of $1 as
# its last argument.
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --mail-type=ALL
#SBATCH --gres=gpu:1
# NOTE(review): "$1" appears inside an #SBATCH directive; sbatch is not
# expected to perform shell expansion on directive lines, so the job name
# probably contains a literal "$1" -- confirm.
#SBATCH --job-name=$1sequentialBlur_squeezenet_training_imagewoof_crossval
module purge
# NOTE(review): cuda10.0 is loaded here while the two modules further down are
# cuda10.1 builds -- possible library version mix; confirm this is intended.
module load gcc5 cuda10.0
module load openmpi/cuda/64
module load pytorch-py36-cuda10.1-gcc/1.3.1
module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0
# Run the training; tee copies stdout into a log file named after $1.
python3.6 SequentialBlur_untrained.py squeezenet 100 imagewoof $1 | tee squeeze_100_imwoof_seq_longtrain_cv_$1.txt
# NOTE(review): this line invokes /u/run_seq_blur2.py with no arguments after
# the training command has finished -- looks unintended; confirm.
/u/run_seq_blur2.py 
以下是脚本内容:

# Banks 1978 paper:
# 1 month:  2.4 cyc/deg
# 2 month:  2.8 cyc/deg
# 3 month:  4 cyc/deg
# 224 pixels:
# 20 deg -> 11 pix in deg;  4.6 pix blur;  4 pix blur;  2.8 pix blur
# 4 deg -> 56 pix in deg; 23 pix blur (1 mo); 20 pix blur (2 mo); 14 pix blur (3 mo)

import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import torchvision.datasets
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
import scipy
from torch.utils.data.sampler import SubsetRandomSampler
import h5py

# Command-line arguments (all four are required):
#   modelType  - 'alexnet', 'squeezenet' or 'vgg16'
#   numEpochs  - number of epochs per blur level (integer, kept as a string
#                here and converted where it is used)
#   image_set  - 'imagewoof' or 'imagenette'
#   block_call - cross-validation fold index, an integer in 0..4
#
# Example call:
# python3 SequentialBlur_untrained.py alexnet 100 imagenette 1
args = sys.argv
if len(args) < 5:
        # Fail with a readable message instead of a bare IndexError when the
        # script (or the sbatch wrapper) is started without its arguments.
        sys.exit("usage: <script> <modelType> <numEpochs> <image_set> <block>")
modelType = args[1] # 'alexnet', 'squeezenet', 'vgg16'
numEpochs = args[2] # int
image_set = str(args[3]) # 'imagewoof', 'imagenette'
block_call = args[4] # int {0:4}

# Reject non-integer values now rather than after the model and data loaders
# have already been set up.
try:
        int(numEpochs)
        int(block_call)
except ValueError:
        sys.exit("numEpochs and block must be integers, got %r and %r" % (numEpochs, block_call))

def get_train_valid_loader(data_dir,block,augment=0,random_seed=69420,valid_size=0.2,shuffle=False,
                                                show_sample=False,num_workers=4, pin_memory=False, batch_size=128):
        """Build the training and validation loaders for one cross-validation fold.

        The images under ``data_dir`` are read with ``ImageFolder``, their indices
        are shuffled with a fixed NumPy seed, and the ``block``-th slice of
        ``floor(valid_size * N)`` indices becomes the validation set; all
        remaining indices form the training set.

        Args:
                data_dir: root directory passed to ``torchvision.datasets.ImageFolder``.
                block: zero-based fold index; must be an int in ``0 .. 1/valid_size - 1``
                        (0..4 for ``valid_size == 0.2``).
                augment, random_seed, shuffle, show_sample: accepted for interface
                        compatibility but not used. The indices are always shuffled
                        and the seed is always 100, so every call produces the same
                        fold layout for a given dataset size.
                valid_size: fraction of the dataset held out per fold, in [0, 1].
                num_workers, pin_memory, batch_size: forwarded to both ``DataLoader``s.

        Returns:
                ``(train_loader, valid_loader)``; both draw their samples through a
                ``SubsetRandomSampler`` over the respective index set.

        Side effects:
                Re-seeds NumPy's global random generator with 100.
        """
        transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(
                        mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225],
                ),
        ])
        train_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)
        valid_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)

        num_train = len(train_dataset)
        indices = list(range(num_train))

        # Fold boundaries inside the shuffled index list.
        split = int(np.floor(valid_size * num_train))
        split1 = int(np.floor(block*split))
        split2 = int(np.floor((block+1)*split))

        # Fixed seed so that every call (training and evaluation alike) sees
        # the same permutation and therefore the same fold membership.
        np.random.seed(100)
        np.random.shuffle(indices)

        valid_idx = indices[split1:split2]
        # np.append yields a float array when the first part is empty
        # (block == 0), hence the explicit cast back to integers.
        train_idx = np.append(indices[:split1],indices[split2:]).astype('int32')

        # The original additionally rotated `indices` once per block at this
        # point; the rotated list was never read again, so that loop was
        # removed as dead work.

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)
        train_loader = torch.utils.data.DataLoader(
                train_dataset, sampler=train_sampler, batch_size=batch_size,
                num_workers=num_workers, pin_memory=pin_memory,
        )
        valid_loader = torch.utils.data.DataLoader(
                valid_dataset, sampler=valid_sampler, batch_size=batch_size,
                num_workers=num_workers, pin_memory=pin_memory,
        )
        return (train_loader, valid_loader)

# Module-level preprocessing pipeline: resize to 256, centre-crop to 224,
# convert to a tensor, then normalise each channel with the constants below.
transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# Blur variants to iterate over; each one is a sub-directory of data_dir.
blurTypes = ['gaussian']

data_dir = "/path/to/dir/" + image_set + "-320_blur/"


# Collect the distinct names of all directories below data_dir that contain at
# least one file; len(classes) later sizes the network's output layer.
# The leaf name is taken with os.path.basename instead of splitting on "\\":
# on POSIX paths that split is a no-op, so every directory's full path was
# stored and each blur level / split was counted as its own set of classes.
# NOTE(review): a directory holding stray files (not only class folders) still
# ends up in this list, as before -- confirm the tree contains images only.
classes = []
for directory, subdirectories, files in os.walk(data_dir):
        if not files:
                continue
        # normpath drops a trailing separator so basename is never empty.
        class_name = os.path.basename(os.path.normpath(directory))
        if class_name not in classes:
                classes.append(class_name)

# Loss function shared by every training run in this script.
criterion = nn.CrossEntropyLoss()

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
        device = torch.device("cuda:0")
else:
        device = torch.device("cpu")
def train():
        """Train the global ``net`` on the global ``trainloader``.

        Runs ``int(numEpochs)`` epochs of forward/backward/step using the
        module-level ``optimizer``, ``criterion`` and ``device``. After every
        tenth epoch it prints the epoch number, the number of batches in that
        epoch and the mean batch loss of that epoch.
        """
        # Make sure layers such as dropout are in training mode, regardless
        # of what was done with the model before this call.
        net.train()

        for epoch in range(int(numEpochs)):
                running_loss = 0.0
                num_batches = 0
                for inputs, labels in trainloader:
                        inputs = inputs.to(device)
                        labels = labels.to(device)

                        # zero the parameter gradients
                        optimizer.zero_grad()

                        # forward + backward + optimize
                        outputs = net(inputs)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()

                        running_loss += loss.item()
                        num_batches += 1

                # Report every tenth epoch. The loss is averaged over the real
                # number of batches (the original divided by a fixed 100), and
                # an empty loader is skipped instead of hitting an unbound
                # loop variable.
                if epoch % 10 == 9 and num_batches:
                        print('[%d, %5d] loss: %.3f' %
                                (epoch + 1, num_batches, running_loss / num_batches))

# Driver: for the fold given on the command line, train one fresh network per
# curriculum (a sequence of blur levels from a starting level down to 1) and
# then measure its accuracy on the held-out fold at every blur level.
# allAccs collects one list of accuracies per curriculum.
allAccs = []
# All blur window sizes, strongest blur first. Evaluation always covers the
# whole list; curriculum k trains on permBlurLevels[k:].
permBlurLevels = [23, 11, 5, 3, 1]
for blurType in blurTypes: # multiple types of blur
        print(blurType)
        print('-' * 10)
        block = int(block_call)
        print("\nFOLD " + str(block+1) + ":")
        for start in range(len(permBlurLevels)):
                blurLevels = permBlurLevels[start:]

                # Fresh, untrained network with its last layer resized to the
                # number of classes found on disk. Any modelType other than
                # 'vgg16' / 'alexnet' falls through to SqueezeNet 1.1.
                if modelType == 'vgg16':
                        net = torchvision.models.vgg16(pretrained=False)
                        num_ftrs = net.classifier[6].in_features
                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))
                elif modelType == 'alexnet':
                        net = torchvision.models.alexnet(pretrained=False)
                        num_ftrs = net.classifier[6].in_features
                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))
                else:
                        net = torchvision.models.squeezenet1_1(pretrained=False)
                        net.classifier[1] = nn.Conv2d(512, len(classes), kernel_size=(1, 1), stride=(1, 1))
                        net.num_classes = len(classes)
                # Move the model first, then build the optimizer over its
                # parameters.
                net = net.to(device)
                optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

                # Sequential training: same network, progressively less blur.
                # train() reads trainloader, net and optimizer as globals.
                for mult in blurLevels:
                        trainloader, validloader = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set +
                                '-320_' + str(mult) + '/train',
                                block=block,shuffle=False,num_workers=0,batch_size=128)
                        print('Start training on blur window of ' + str(mult))
                        train()
                        print('Finished Training on ' + blurType + ' with blur window of ' + str(mult))

                # Evaluate with dropout disabled; a new network is created
                # before the next curriculum, so no switch back is needed here.
                net.eval()
                accs = []
                for tempMult in permBlurLevels:
                        correct = 0
                        total = 0
                        # Validation uses the held-out fold of the 'train'
                        # directory at the blur level being evaluated. The
                        # original passed `mult` (the last training level)
                        # here, so every accuracy was measured on the same data.
                        t2, validloader2 = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set +
                                '-320_' + str(tempMult) + '/train',
                                block=block,shuffle=False,num_workers=0,batch_size=128)

                        with torch.no_grad():
                                for images, labels in validloader2:
                                        images = images.to(device)
                                        labels = labels.to(device)
                                        outputs = net(images)
                                        _, predicted = torch.max(outputs, 1)
                                        total += labels.size(0)
                                        correct += (predicted == labels).sum().item()
                        # Computed once per blur level; NaN marks an empty
                        # validation fold instead of reusing a stale value.
                        acc = 100 * correct / total if total else float('nan')
                        print('Accuracy: %f %%' % (acc))
                        accs.append(acc)
                allAccs.append(accs)
下面是他每次运行时我们看到的错误:

[2020-03-13T08:54:02.269] gres/gpu: count changed for node node002 from 0
to 1
[2020-03-13T08:54:02.269] error: Setting node node002 state to DRAIN
[2020-03-13T08:54:02.269] drain_nodes: node node002 state set to DRAIN
[2020-03-13T08:54:02.269] error: _slurm_rpc_node_registration node=node002:
Invalid argument

关于节点的下面这条消息,我能找到的提及只有这一处:
gres/gpu: count changed

问题出在额外调用了
run_seq_blur2.py
。我们遇到了一个
list index
(列表索引越界)错误:运行
sbatch
脚本时,必须在
sbatch
命令后面附加额外的参数。用户之所以这样修改,是为了在不改动
sbatch
脚本的情况下,更方便地用不同的参数组合运行Python文件。例如:

sbatch run_seq_blur3.py 0
其中0可以是0-4之间的任何值

sbatch
文件中的最后一行现在如下所示:

python3.6 SequentialBlur_untrained.py alexnet 100 imagewoof 0
总之,它不再导致节点被置为DRAIN(排空)状态了