如何在PyTorch中于DataLoader类之外创建数据预处理管道?
我正在尝试建立一个模型,其输入数据包含40个特征,需要被分为10个类别。我是PyTorch的新手,这是我的第一个项目。(标签:pytorch, conv-neural-network, pipeline, torch, pytorch-dataloader)我得到了一个自定义数据集类(我不允许更改该类),如下所示:
class MyData(Dataset):
    """Dataset backed by a pickled mapping with ``'x'`` and ``'y'`` entries.

    The file ``mode + '.pkl'`` is read once at construction time and both
    entries are kept in memory for the lifetime of the object.
    """

    def __init__(self, mode):
        """Load ``mode + '.pkl'`` and cache its features and labels.

        Args:
            mode: Path prefix of the pickle file, e.g. ``"train"`` reads
                ``train.pkl``.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file; only open pickles that come from a trusted source.
        with open(mode + '.pkl', 'rb') as handle:
            data = pickle.load(handle)
        # Presumably NumPy arrays (they must provide ``astype``) -- confirm
        # against the file that is actually shipped with the assignment.
        self.X = data['x'].astype('float')
        self.y = data['y'].astype('long')

    def __len__(self):
        """Return the number of samples (length of the feature array)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair selected by ``idx``.

        Tensor indices are converted to plain Python values first so they
        can be used to index the cached arrays.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.X[idx], self.y[idx]
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader``, ``log_interval``, ``train_losses``, ``train_counter``
    and ``save_model``.

    Args:
        epoch: 1-based epoch number; it is only used to compute the running
            sample count stored in ``train_counter``.
    """
    model.train()
    # Use the loader's real batch size for the x-axis of the loss curve
    # instead of a hard-coded 32, which did not match the configured loader.
    batch_size = train_loader.batch_size
    samples_before_epoch = (epoch - 1) * len(train_loader.dataset)
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.double())
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            train_losses.append(loss.item())
            train_counter.append(batch_idx * batch_size + samples_before_epoch)
            # NOTE(review): the checkpoint is written at every logging step;
            # move it after the loop if one save per epoch is intended.
            save_model(model)
def test():
    """Evaluate ``model`` on ``val_loader`` and record the resulting loss.

    Uses the module-level ``model``, ``criterion`` and ``val_loader``; the
    normalised loss is appended to ``test_losses``. Nothing is returned.
    """
    model.eval()
    test_loss = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data.double())
            test_loss += criterion(output, target).item()
    # NOTE(review): dividing by the number of samples yields a per-sample
    # mean only if ``criterion`` sums over the batch; if it averages, divide
    # by ``len(val_loader)`` instead -- confirm against the criterion used.
    test_loss /= len(val_loader.dataset)
    test_losses.append(test_loss)
# Record a validation loss for the untrained model first, so that
# ``test_losses`` ends up with ``n_epochs + 1`` entries.
test()
for epoch in range(1, n_epochs + 1):
    train(epoch)
    # Re-evaluate after every epoch.
    test()
class MyModel(nn.Module):
    """Classifier mapping 40 input features to 10 class probabilities.

    The network is four identical stages (Conv1d with kernel size 1, ReLU,
    MaxPool1d, Dropout, BatchNorm1d) followed by Flatten, Linear and
    Softmax. All parameters are stored in double precision. The constructor
    takes no arguments, so ``MyModel()`` builds the full network.
    """

    def __init__(self):
        super().__init__()
        # Channel widths of the successive stages. The layer order (and thus
        # every index inside ``net_stack``) is kept unchanged so previously
        # saved state dicts still load.
        channels = (40, 256, 128, 64, 32)
        layers = []
        for in_ch, out_ch in zip(channels, channels[1:]):
            layers += [
                nn.Conv1d(in_channels=in_ch, out_channels=out_ch,
                          kernel_size=1, stride=2),
                nn.ReLU(),
                # kernel_size=1 pooling leaves a length-1 sequence unchanged.
                nn.MaxPool1d(kernel_size=1),
                nn.Dropout(p=0.1),
                nn.BatchNorm1d(out_ch, affine=True),
            ]
        layers += [
            nn.Flatten(),
            nn.Linear(32, 10),
            # NOTE(review): the output is already a probability vector; if
            # the training criterion applies its own (log-)softmax, this
            # layer normalises twice -- confirm against the criterion used.
            nn.Softmax(dim=1),
        ]
        self.net_stack = nn.Sequential(*layers).double()

    def forward(self, x):
        """Return class probabilities for a batch of feature vectors.

        Args:
            x: Tensor of shape ``(batch, 40)``; it is cast to double.

        Returns:
            Double tensor of shape ``(batch, 10)`` whose rows sum to 1.
        """
        x = x.double()
        # Add a trailing length-1 axis so Conv1d sees (batch, channels, 1).
        # No printing here: stdout must stay clean for automated callers.
        return self.net_stack(x[:, :, None])
我对数据做了一些预处理(比如归一化),然后训练并保存了模型。由于不允许我更改dataset类,我在dataset类之外完成了这些处理,然后使用了DataLoader方法。预处理如下:
# Load the raw training split through the provided dataset class and pull
# out every sample at once.
train_data = MyData("train")
features, labels = train_data[:]

# Rescale each feature column with a min-max scaler fitted on this data.
# NOTE(review): the scaler lives outside the model, so anything that loads
# only ``MyModel()`` and feeds it raw ``MyData`` samples will skip this step;
# the scaler is also fitted before the train/validation split.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(features)
input_array = x_scaled
output_array = labels

# Build the tensors directly with their final dtypes; the labels stay
# integers instead of passing through a float tensor first.
inputs = torch.as_tensor(input_array, dtype=torch.float32)
targets = torch.as_tensor(output_array, dtype=torch.long)
dataset = TensorDataset(inputs, targets)

# Hold out 300 samples for validation and train on the rest, whatever the
# total number of samples is.
n_val = min(300, len(dataset))
train_ds, val_ds = random_split(dataset, [len(dataset) - n_val, n_val])

batch_size = 300
n_epochs = 200
log_interval = 10

# Histories filled in by the training and evaluation functions.
train_losses = []
train_counter = []
test_losses = []

train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)
# One entry per evaluation: before training and after each epoch.
test_counter = [i * len(train_loader.dataset) for i in range(n_epochs + 1)]
在此之后,我将训练和测试函数定义如下(并删除了打印语句,因为如果保留打印语句,自动评分器将无法为我的作业评分):
class MyData(Dataset):
    """Dataset backed by a pickled mapping with ``'x'`` and ``'y'`` entries.

    The file ``mode + '.pkl'`` is read once at construction time and both
    entries are kept in memory for the lifetime of the object.
    """

    def __init__(self, mode):
        """Load ``mode + '.pkl'`` and cache its features and labels.

        Args:
            mode: Path prefix of the pickle file, e.g. ``"train"`` reads
                ``train.pkl``.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file; only open pickles that come from a trusted source.
        with open(mode + '.pkl', 'rb') as handle:
            data = pickle.load(handle)
        # Presumably NumPy arrays (they must provide ``astype``) -- confirm
        # against the file that is actually shipped with the assignment.
        self.X = data['x'].astype('float')
        self.y = data['y'].astype('long')

    def __len__(self):
        """Return the number of samples (length of the feature array)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair selected by ``idx``.

        Tensor indices are converted to plain Python values first so they
        can be used to index the cached arrays.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.X[idx], self.y[idx]
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader``, ``log_interval``, ``train_losses``, ``train_counter``
    and ``save_model``.

    Args:
        epoch: 1-based epoch number; it is only used to compute the running
            sample count stored in ``train_counter``.
    """
    model.train()
    # Use the loader's real batch size for the x-axis of the loss curve
    # instead of a hard-coded 32, which did not match the configured loader.
    batch_size = train_loader.batch_size
    samples_before_epoch = (epoch - 1) * len(train_loader.dataset)
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.double())
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            train_losses.append(loss.item())
            train_counter.append(batch_idx * batch_size + samples_before_epoch)
            # NOTE(review): the checkpoint is written at every logging step;
            # move it after the loop if one save per epoch is intended.
            save_model(model)
def test():
    """Evaluate ``model`` on ``val_loader`` and record the resulting loss.

    Uses the module-level ``model``, ``criterion`` and ``val_loader``; the
    normalised loss is appended to ``test_losses``. Nothing is returned.
    """
    model.eval()
    test_loss = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data.double())
            test_loss += criterion(output, target).item()
    # NOTE(review): dividing by the number of samples yields a per-sample
    # mean only if ``criterion`` sums over the batch; if it averages, divide
    # by ``len(val_loader)`` instead -- confirm against the criterion used.
    test_loss /= len(val_loader.dataset)
    test_losses.append(test_loss)
# Record a validation loss for the untrained model first, so that
# ``test_losses`` ends up with ``n_epochs + 1`` entries.
test()
for epoch in range(1, n_epochs + 1):
    train(epoch)
    # Re-evaluate after every epoch.
    test()
class MyModel(nn.Module):
    """Classifier mapping 40 input features to 10 class probabilities.

    The network is four identical stages (Conv1d with kernel size 1, ReLU,
    MaxPool1d, Dropout, BatchNorm1d) followed by Flatten, Linear and
    Softmax. All parameters are stored in double precision. The constructor
    takes no arguments, so ``MyModel()`` builds the full network.
    """

    def __init__(self):
        super().__init__()
        # Channel widths of the successive stages. The layer order (and thus
        # every index inside ``net_stack``) is kept unchanged so previously
        # saved state dicts still load.
        channels = (40, 256, 128, 64, 32)
        layers = []
        for in_ch, out_ch in zip(channels, channels[1:]):
            layers += [
                nn.Conv1d(in_channels=in_ch, out_channels=out_ch,
                          kernel_size=1, stride=2),
                nn.ReLU(),
                # kernel_size=1 pooling leaves a length-1 sequence unchanged.
                nn.MaxPool1d(kernel_size=1),
                nn.Dropout(p=0.1),
                nn.BatchNorm1d(out_ch, affine=True),
            ]
        layers += [
            nn.Flatten(),
            nn.Linear(32, 10),
            # NOTE(review): the output is already a probability vector; if
            # the training criterion applies its own (log-)softmax, this
            # layer normalises twice -- confirm against the criterion used.
            nn.Softmax(dim=1),
        ]
        self.net_stack = nn.Sequential(*layers).double()

    def forward(self, x):
        """Return class probabilities for a batch of feature vectors.

        Args:
            x: Tensor of shape ``(batch, 40)``; it is cast to double.

        Returns:
            Double tensor of shape ``(batch, 10)`` whose rows sum to 1.
        """
        x = x.double()
        # Add a trailing length-1 axis so Conv1d sees (batch, channels, 1).
        # No printing here: stdout must stay clean for automated callers.
        return self.net_stack(x[:, :, None])
即使这样做了,自动评分器仍然无法为我的代码评分。我认为主要原因可能是我在把数据输入模型的方式上犯了错误,但我无法确定问题的确切位置以及如何纠正它。由于我是PyTorch的新手,我查阅了如何进行预处理的资料,但它们都是在Dataset类内部完成预处理的,因此我不确定在不能修改该类的情况下应该如何进行预处理。
我的模型如下:
class MyData(Dataset):
    """Dataset backed by a pickled mapping with ``'x'`` and ``'y'`` entries.

    The file ``mode + '.pkl'`` is read once at construction time and both
    entries are kept in memory for the lifetime of the object.
    """

    def __init__(self, mode):
        """Load ``mode + '.pkl'`` and cache its features and labels.

        Args:
            mode: Path prefix of the pickle file, e.g. ``"train"`` reads
                ``train.pkl``.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file; only open pickles that come from a trusted source.
        with open(mode + '.pkl', 'rb') as handle:
            data = pickle.load(handle)
        # Presumably NumPy arrays (they must provide ``astype``) -- confirm
        # against the file that is actually shipped with the assignment.
        self.X = data['x'].astype('float')
        self.y = data['y'].astype('long')

    def __len__(self):
        """Return the number of samples (length of the feature array)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair selected by ``idx``.

        Tensor indices are converted to plain Python values first so they
        can be used to index the cached arrays.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.X[idx], self.y[idx]
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader``, ``log_interval``, ``train_losses``, ``train_counter``
    and ``save_model``.

    Args:
        epoch: 1-based epoch number; it is only used to compute the running
            sample count stored in ``train_counter``.
    """
    model.train()
    # Use the loader's real batch size for the x-axis of the loss curve
    # instead of a hard-coded 32, which did not match the configured loader.
    batch_size = train_loader.batch_size
    samples_before_epoch = (epoch - 1) * len(train_loader.dataset)
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.double())
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            train_losses.append(loss.item())
            train_counter.append(batch_idx * batch_size + samples_before_epoch)
            # NOTE(review): the checkpoint is written at every logging step;
            # move it after the loop if one save per epoch is intended.
            save_model(model)
def test():
    """Evaluate ``model`` on ``val_loader`` and record the resulting loss.

    Uses the module-level ``model``, ``criterion`` and ``val_loader``; the
    normalised loss is appended to ``test_losses``. Nothing is returned.
    """
    model.eval()
    test_loss = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data.double())
            test_loss += criterion(output, target).item()
    # NOTE(review): dividing by the number of samples yields a per-sample
    # mean only if ``criterion`` sums over the batch; if it averages, divide
    # by ``len(val_loader)`` instead -- confirm against the criterion used.
    test_loss /= len(val_loader.dataset)
    test_losses.append(test_loss)
# Record a validation loss for the untrained model first, so that
# ``test_losses`` ends up with ``n_epochs + 1`` entries.
test()
for epoch in range(1, n_epochs + 1):
    train(epoch)
    # Re-evaluate after every epoch.
    test()
class MyModel(nn.Module):
    """Classifier mapping 40 input features to 10 class probabilities.

    The network is four identical stages (Conv1d with kernel size 1, ReLU,
    MaxPool1d, Dropout, BatchNorm1d) followed by Flatten, Linear and
    Softmax. All parameters are stored in double precision. The constructor
    takes no arguments, so ``MyModel()`` builds the full network.
    """

    def __init__(self):
        super().__init__()
        # Channel widths of the successive stages. The layer order (and thus
        # every index inside ``net_stack``) is kept unchanged so previously
        # saved state dicts still load.
        channels = (40, 256, 128, 64, 32)
        layers = []
        for in_ch, out_ch in zip(channels, channels[1:]):
            layers += [
                nn.Conv1d(in_channels=in_ch, out_channels=out_ch,
                          kernel_size=1, stride=2),
                nn.ReLU(),
                # kernel_size=1 pooling leaves a length-1 sequence unchanged.
                nn.MaxPool1d(kernel_size=1),
                nn.Dropout(p=0.1),
                nn.BatchNorm1d(out_ch, affine=True),
            ]
        layers += [
            nn.Flatten(),
            nn.Linear(32, 10),
            # NOTE(review): the output is already a probability vector; if
            # the training criterion applies its own (log-)softmax, this
            # layer normalises twice -- confirm against the criterion used.
            nn.Softmax(dim=1),
        ]
        self.net_stack = nn.Sequential(*layers).double()

    def forward(self, x):
        """Return class probabilities for a batch of feature vectors.

        Args:
            x: Tensor of shape ``(batch, 40)``; it is cast to double.

        Returns:
            Double tensor of shape ``(batch, 10)`` whose rows sum to 1.
        """
        x = x.double()
        # Add a trailing length-1 axis so Conv1d sees (batch, channels, 1).
        # No printing here: stdout must stay clean for automated callers.
        return self.net_stack(x[:, :, None])
作业中给我的一条说明是这样写的:
# Please make sure we can load your model with:
# model = MyModel()
# This means you must give default values to all parameters you may wish to set, such as output size.
你可以试着在训练循环中做到这一点
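for batch_idx, (data, target) in enumerate(train_loader):
    # You can manipulate your input here
    data = transform(data)
    data = data.to('cuda')  # Move to the GPU; I noticed you did not do this in your training loop
    # Forward pass
    output = model(data)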