I posted this yesterday and this does appear to be related to a specific job. Note this error: "gres/gpu: count changed for node node002 from 0 to 1" Could it be misleading? What could cause the node to drain? Here are the contents of the user's SBATCH file. Could the piping having an effect here?
#!/bin/sh #SBATCH -N 1 #SBATCH -n 1 #SBATCH --mail-type=ALL #SBATCH --gres=gpu:1 #SBATCH --job-name=$1sequentialBlur_squeezenet_training_imagewoof_crossval module purge module load gcc5 cuda10.0 module load openmpi/cuda/64 module load pytorch-py36-cuda10.1-gcc/1.3.1 module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0 python3.6 SequentialBlur_untrained.py squeezenet 100 imagewoof $1 | tee squeeze_100_imwoof_seq_longtrain_cv_$1.txt Here is the script contents: # Banks 1978 paper: # 1 month: 2.4 cyc/deg # 2 month: 2.8 cyc/deg # 3 month: 4 cyc/deg # 224 pixels: # 20 deg -> 11 pix in deg; 4.6 pix blur; 4 pix blur; 2.8 pix blur # 4 deg -> 56 pix in deg; 23 pix blur (1 mo); 20 pix blur (2 mo); 14 pix blur (3 mo) import torch import torchvision import torchvision.transforms as transforms from torchvision import models import torchvision.datasets import matplotlib.pyplot as plt import numpy as np import matplotlib.pyplot as plt import numpy as np import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import os import sys import scipy from torch.utils.data.sampler import SubsetRandomSampler import h5py args = sys.argv modelType = args[1] # 'alexnet', 'squeezenet', 'vgg16' numEpochs = args[2] # int image_set = str(args[3]) # 'imagewoof', 'imagenette' block_call = args[4] # int {0:4} # Example call: # python3 alexnet 100 imagenette 1 def get_train_valid_loader(data_dir,block,augment=0,random_seed=69420,valid_size=0.2,shuffle=False, show_sample=False,num_workers=4, pin_memory=False, batch_size=128): # valid_size gotta be in [0,1] # block must be an int between 0:(1/valid_size) (0:4 for valid_size==0.2) transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] )]) train_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform) valid_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform) num_train = len(train_dataset) indices = list(range(num_train)) split = int(np.floor(valid_size * num_train)) split1 = int(np.floor(block*split)) split2 = int(np.floor((block+1)*split)) # if shuffle: np.random.seed(100) np.random.shuffle(indices) valid_idx = indices[split1:split2] train_idx = np.append(indices[:split1],indices[split2:]) train_idx = train_idx.astype('int32') if block != 0: for b in range(block): indices = [indices[(i + split) % len(indices)] for i, x in enumerate(indices)] # train_idx, valid_idx = indices[split:], indices[:split] train_sampler = SubsetRandomSampler(train_idx) # train_sampler = torch.utils.data.Subset(dataset, indices) valid_sampler = SubsetRandomSampler(valid_idx) train_loader = torch.utils.data.DataLoader( train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, ) valid_loader = torch.utils.data.DataLoader( valid_dataset, sampler=valid_sampler, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, ) return (train_loader, valid_loader) transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] )]) blurTypes = ['gaussian'] data_dir = "/path/to/dir/" + image_set + "-320_blur/" classes = [] for directory, subdirectories, files in os.walk(data_dir): for file in files: if directory.split("\\")[-1] not in classes: classes.append(directory.split("\\")[-1]) criterion = nn.CrossEntropyLoss() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") def train(): for epoch in range(int(numEpochs)): prev_loss = 100000.0 running_loss = 0.0 for i, data in enumerate(trainloader, 0): # get the inputs; data is a list of [inputs, labels] inputs, labels = data inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_loss += loss.item() if epoch % 10 == 9: print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100)) allAccs = [] for blurType in blurTypes: # multiple types of blur print(blurType) print('-' * 10) # for block in range(5): block = int(block_call) print("\nFOLD " + str(block+1) + ":") for i in range(5): if i == 0: blurLevels = [23, 11, 5, 3, 1] elif i == 1: blurLevels = [11, 5, 3, 1] elif i == 2: blurLevels = [5, 3, 1] elif i == 3: blurLevels = [3, 1] elif i == 4: blurLevels = [1] if modelType == 'vgg16': net = torchvision.models.vgg16(pretrained=False) num_ftrs = net.classifier[6].in_features net.classifier[6] = nn.Linear(num_ftrs, len(classes)) elif modelType == 'alexnet': net = torchvision.models.alexnet(pretrained=False) num_ftrs = net.classifier[6].in_features net.classifier[6] = nn.Linear(num_ftrs, len(classes)) else: net = torchvision.models.squeezenet1_1(pretrained=False) net.classifier[1] = nn.Conv2d(512, len(classes), kernel_size=(1, 1), stride=(1, 1)) net.num_classes = len(classes) optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) net = net.to(device) for i in range(len(blurLevels)): #5 levels of blur: 1, 3, 5, 11, 23 mult = blurLevels[i] trainloader, validloader = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set + '-320_' + str(mult) + '/train', block=block,shuffle=False,num_workers=0,batch_size=128) print('Start training on blur window of ' + str(mult)) train() print('Finished Training on ' + blurType + ' with blur window of ' + str(mult)) accs = [] permBlurLevels = [23, 11, 5, 3, 1] for j in range(len(permBlurLevels)): tempMult = permBlurLevels[j] correct = 0 total = 0 # newTestSet = torchvision.datasets.ImageFolder(root=data_dir + blurType + '/' + image_set + '-320_' + # str(tempMult) + '/val', # transform=transform) # newTestLoader = torch.utils.data.DataLoader(newTestSet, batch_size=128, # shuffle=True, num_workers=0) t2, validloader2 = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set + '-320_' + str(mult) + '/train', block=block,shuffle=False,num_workers=0,batch_size=128) with torch.no_grad(): for data in validloader2: images, labels = data images = images.to(device) labels = labels.to(device) outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() acc = 100 * correct / total print('Accuracy: %f %%' % (acc)) accs.append(acc) allAccs.append(accs) ---------- Forwarded message --------- From: Robert Kudyba <rkud...@fordham.edu> Date: Fri, Mar 13, 2020 at 11:36 AM Subject: gres/gpu: count changed for node node002 from 0 to 1 To: Slurm User Community List <slurm-users@lists.schedmd.com> We're running slurm-17.11.12 on Bright Cluster 8.1 and our node002 keeps going into a draining state: sinfo -a PARTITION AVAIL TIMELIMIT NODES STATE NODELIST defq* up infinite 1 drng node002 info -N -o "%.20N %.15C %.10t %.10m %.15P %.15G %.35E" NODELIST CPUS(A/I/O/T) STATE MEMORY PARTITION GRES REASON node001 9/15/0/24 mix 191800 defq* gpu:1 none node002 1/0/23/24 drng 191800 defq* gpu:1 gres/gpu count changed and jobs are node003 1/23/0/24 mix 191800 defq* gpu:1 none Node of the nodes have a separate slurm.conf file, it's all shared from the head node. What else could be causing this? [2020-03-13T08:54:02.269] gres/gpu: count changed for node node002 from 0 to 1 [2020-03-13T08:54:02.269] error: Setting node node002 state to DRAIN [2020-03-13T08:54:02.269] drain_nodes: node node002 state set to DRAIN [2020-03-13T08:54:02.269] error: _slurm_rpc_node_registration node=node002: Invalid argument