Why is the PyTorch implementation so inefficient compared to Keras?


I have implemented the CNN architecture from a paper in both Keras and PyTorch, but the Keras implementation is far more efficient: training uses 4 GB of GPU memory with 50,000 training samples and 10,000 validation samples, whereas the PyTorch one uses all 12 GB of GPU memory and I can't even run the validation set! Both use SGD with momentum as the optimizer, with identical settings. More information about the paper: [architecture]:

PyTorch code:

import torch
import torch.nn.functional as F

class SimpleCNN(torch.nn.Module):

    def __init__(self):
        super(SimpleCNN, self).__init__()

        self.conv2d_11 = torch.nn.Conv2d(3, 64, kernel_size = 3, stride = 1, padding = 1)
        self.conv2d_12 = torch.nn.Conv2d(64, 64, kernel_size = 3, stride = 1, padding = 1)

        self.conv2d_21 = torch.nn.Conv2d(64, 128, kernel_size = 3, stride = 1, padding = 1)
        self.conv2d_22 = torch.nn.Conv2d(128, 128, kernel_size = 3, stride = 1, padding = 1)

        self.conv2d_31 = torch.nn.Conv2d(128, 256, kernel_size = 3, stride = 1, padding = 1)
        self.conv2d_32 = torch.nn.Conv2d(256, 256, kernel_size = 3, stride = 1, padding = 1)
        self.conv2d_33 = torch.nn.Conv2d(256, 256, kernel_size = 3, stride = 1, padding = 1)

        self.conv2d_41 = torch.nn.Conv2d(256, 512, kernel_size = 3, stride = 1, padding = 1)
        self.conv2d_42 = torch.nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1)

        self.conv2d_51 = torch.nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1)

        self.Batchnorm_1=torch.nn.BatchNorm2d(64)
        self.Batchnorm_2=torch.nn.BatchNorm2d(128)
        self.Batchnorm_3=torch.nn.BatchNorm2d(256)
        self.Batchnorm_4=torch.nn.BatchNorm2d(512)

        self.dropout2d_1=torch.nn.Dropout2d(p=0.3)
        self.dropout2d_2=torch.nn.Dropout2d(p=0.4)
        self.dropout2d_3=torch.nn.Dropout2d(p=0.5)

        self.dropout1d=torch.nn.Dropout(p=0.5)

        self.maxpool2d = torch.nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)

        self.avgpool2d = torch.nn.AvgPool2d(kernel_size = 2, stride = 2, padding = 0)

        self.fc = torch.nn.Linear(512, 10)

    def forward(self, x):

        ############################# Phase 1
        #print(x.size())
        x = F.relu(self.conv2d_11(x))
        x = self.dropout2d_1(x) #rate =0.3
        x = self.Batchnorm_1(x) #input 64
        #print(x.size())

        x = F.relu(self.conv2d_12(x))
        x = self.dropout2d_1(x) #rate=0.3
        x = self.Batchnorm_1(x) #input 64
        #print(x.size())

        x = self.maxpool2d(x)
        #print(x.size())
        ############################# Phase 2
        x = F.relu(self.conv2d_21(x))
        x = self.dropout2d_1(x) #rate=0.3
        x = self.Batchnorm_2(x) #input 128
        #print(x.size())

        x = F.relu(self.conv2d_22(x))
        x = self.dropout2d_1(x) #rate=0.3
        x = self.Batchnorm_2(x) #input 128
        #print(x.size())

        x = self.maxpool2d(x)
        #print(x.size())
        ############################# Phase 3
        x = F.relu(self.conv2d_31(x))
        x = self.dropout2d_2(x) #rate=0.4
        x = self.Batchnorm_3(x) #input 256
        #print(x.size())

        x = F.relu(self.conv2d_32(x))
        x = self.dropout2d_2(x) #rate=0.4
        x = self.Batchnorm_3(x) #input 256
        #print(x.size())

        x = F.relu(self.conv2d_33(x))
        x = self.dropout2d_2(x) #rate=0.4
        x = self.Batchnorm_3(x) #input 256
        #print(x.size())

        x = self.maxpool2d(x)
        #print(x.size())
        ############################# Phase 4
        x = F.relu(self.conv2d_41(x))
        x = self.dropout2d_2(x)
        x = self.Batchnorm_4(x)
        #print(x.size())

        x = F.relu(self.conv2d_42(x))
        x = self.dropout2d_2(x)
        x = self.Batchnorm_4(x)
        #print(x.size())

        x = self.maxpool2d(x)
        #print(x.size())
        ############################# Phase 5
        x = F.relu(self.conv2d_51(x))
        x = self.dropout2d_3(x)
        x = self.Batchnorm_4(x)
        #print(x.size())

        x = self.avgpool2d(x)
        #print(x.size())
        x = x.view(x.size(0), -1)
        #print(x.size())
        x = self.dropout1d(x)
        x = F.relu(self.fc(x))
        x = self.dropout1d(x)
        #print(x.size())
        x = F.softmax(x)
        ###############################

        return(x)


import time
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable

def trainNet(model, batch_size, n_epochs, learning_rate):

    lr=learning_rate

    #Print all of the hyperparameters of the training iteration:
    print("======= HYPERPARAMETERS =======")
    print("Batch size=", batch_size)
    print("Epochs=", n_epochs)
    print("Base learning_rate=", learning_rate)
    print("=" * 30)

    #Get training data
    n_batches = len(train_loader)

    #Time for printing
    training_start_time = time.time()

    #Loss function
    loss = torch.nn.CrossEntropyLoss()
    optimizer = createOptimizer(model, lr)   

    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.9817, verbose=True)

    #Loop for n_epochs
    for epoch in range(n_epochs):

        #save the weights every 10 epochs
        if epoch % 10 == 0 :
            torch.save(model.state_dict(), 'model.ckpt')


        #print('learning rate : {:.3f} '.format(lr))     
        #Create our loss and optimizer functions

        running_loss = 0.0
        print_every = n_batches // 10
        start_time = time.time()
        total_train_loss = 0
        total_train_acc = 0
        epoch_time = 0

        for i, data in enumerate(train_loader, 0):

            #free up the cuda memory
            inputs=None
            labels=None

            inputs, labels = data

            inputs, labels = Variable(inputs.to(device)), Variable(labels.to(device))

            optimizer.zero_grad()

            outputs = model(inputs)

            score, predictions = torch.max(outputs.data, 1)
            acc = (labels==predictions).sum()
            total_train_acc += acc

            loss_size = loss(outputs, labels)
            loss_size.backward()
            optimizer.step()

            running_loss += loss_size.item()
            total_train_loss += loss_size.item()

            #Print every 10th batch of an epoch
            if (i + 1) % (print_every + 1) == 0:
                print("Epoch {}, {:d} % \t | train_loss: {:.3f} | train_acc:{}% | took: {:.2f}s".format(
                        epoch+1, int(100 * (i+1) / n_batches), running_loss / print_every
                        ,int(acc), time.time() - start_time))

                epoch_time += (time.time() - start_time)

                #Reset running loss and time
                running_loss = 0.0
                start_time = time.time()

        scheduler.step(total_train_loss)
        torch.cuda.empty_cache() 
        #At the end of the epoch, do a pass on the validation set
        total_val_loss = 0

        for inputs, labels in val_loader:

            #Wrap tensors in Variables
            inputs, labels = Variable(inputs.to(device)), Variable(labels.to(device))

            #Forward pass
            val_outputs = model(inputs)
            val_loss_size = loss(val_outputs, labels)
            total_val_loss += val_loss_size.item()

        print("-"*30)
        print("Train loss = {:.2f} | Train acc = {:.1f}% | Val loss={:.2f} | took: {:.2f}s".format(
            total_train_loss / len(train_loader),total_train_acc/ len(train_loader)
            ,total_val_loss/len(val_loader),epoch_time))
        print("="*60)


    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))
CNN = SimpleCNN().to(device)
CNN.eval()

trainNet(CNN, batch_size=64, n_epochs=250, learning_rate=0.1)
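
The training code above calls createOptimizer and uses a device object, neither of which is shown. A minimal sketch of what they might look like, assuming plain SGD with momentum (the momentum value below is an assumption; the question only says both frameworks use SGD with momentum and the same settings):

# Assumed definitions -- not taken from the question.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def createOptimizer(model, lr):
    # momentum=0.9 is a guess; use whatever the Keras run actually used
    return torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)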

Keras:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten,Activation
from tensorflow.keras.layers import Conv2D, MaxPool2D,BatchNormalization,GlobalAveragePooling2D

model = Sequential()
#####################################################
# Phase 1
model.add(Conv2D(64,(3,3),input_shape=(32,32,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())

#(32,32,64)

model.add(Conv2D(64,(3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())
#(32,32,64)


model.add(MaxPool2D((2,2)))
#(16,16,64)

#####################################################
#Phase 2
model.add(Conv2D(128, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())
#(16,16,128)

model.add(Conv2D(128, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())
#(16,16,128)

model.add(MaxPool2D((2,2),padding='same'))
#(8,8,128)

#####################################################
#Phase 3
model.add(Conv2D(256, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
#(8,8,256)


model.add(Conv2D(256, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
#(8,8,256)

model.add(Conv2D(256, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
#(8,8,256)

model.add(MaxPool2D((2,2)))
#(4,4,256)

#####################################################
#Phase 4
model.add(Conv2D(512, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
#(4,4,512)

model.add(Conv2D(512, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
#(4,4,512)

model.add(MaxPool2D((2,2)))
#(2,2,512)

#####################################################
#Phase 5
model.add(Conv2D(512, (3,3),padding='same'))
model.add(Activation('relu'))
model.add(Dropout(rate=0.5))
model.add(BatchNormalization())
#(2,2,512)

model.add(GlobalAveragePooling2D(data_format='channels_last'))
model.add(Flatten())
model.add(Dropout(rate=0.5))

model.add(Dense(10,activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer=sgd_optimizer,loss='categorical_crossentropy',metrics=['accuracy'])
history=model.fit(x=x_train,y=y_train,batch_size=64,
                  epochs=250,verbose=1,callbacks=[checkpoint],validation_data=(x_test,y_test))
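
The Keras snippet references sgd_optimizer and checkpoint without defining them. A minimal sketch under the assumption that they are a momentum-SGD optimizer and a ModelCheckpoint callback (the learning rate, momentum, and file name here are assumptions):

from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint

# Assumed definitions -- the actual hyperparameters are not given in the question.
sgd_optimizer = SGD(learning_rate=0.1, momentum=0.9)
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)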
Edit: looking more carefully, acc does not seem to require gradients, so that paragraph may not apply. The biggest issue appears to be that total_train_acc accumulates history across the whole training loop (see the linked explanation for details). Changing total_train_acc += acc to total_train_acc += acc.item() should fix this.

For the validation loop, you should also use torch.no_grad().

Not speed-related, but you should also call model.train() / model.eval() for training and evaluation respectively, so that the batchnorm and dropout layers work in the correct mode.
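
Putting these suggestions together, a sketch of the affected parts of the training function might look like the following. It assumes the same model, loss, optimizer, loaders, and device as in the question; only the highlighted lines change:

        model.train()  # put dropout and batchnorm into training mode
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            score, predictions = torch.max(outputs.data, 1)
            acc = (labels == predictions).sum()
            total_train_acc += acc.item()      # .item() detaches the value, so no graph history is kept

            loss_size = loss(outputs, labels)
            loss_size.backward()
            optimizer.step()
            total_train_loss += loss_size.item()

        model.eval()  # switch dropout and batchnorm to evaluation mode
        with torch.no_grad():  # validation needs no gradient buffers
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                val_outputs = model(inputs)
                total_val_loss += loss(val_outputs, labels).item()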