Python：CRNN 单词识别模型在 932 张训练图像上训练 10 个 epoch 后没有学到任何东西

Python：CRNN 单词识别模型在 932 张训练图像上训练 10 个 epoch 后没有学到任何东西。标签：python、neural-network、conv-neural-network、pytorch、recurrent-neural-network。我试图使用 CRNN 方法从孟加拉语单词的图像中预测单词；完整的问题描述和代码见下文。

我试图使用 CRNN 方法从孟加拉语单词的图像中预测单词。我在本地环境中用 932 张图片训练了 10 个 epoch。损失在下降，但准确率一直是 0%——模型似乎只是在做随机猜测，没有学到任何东西。以下是我的模型：

import torch.nn as nn


class BidirectionalLSTM(nn.Module):
    """A bidirectional LSTM followed by a linear projection.

    Input and output are sequence-first tensors of shape (T, b, features),
    matching nn.LSTM's default batch_first=False layout.
    """

    def __init__(self, number_of_input, number_of_hidden, nunmer_of_out):
        super(BidirectionalLSTM, self).__init__()
        # Two directions are concatenated, so the projection sees 2 * hidden.
        self.rnn = nn.LSTM(number_of_input, number_of_hidden, bidirectional=True)
        self.embedding = nn.Linear(number_of_hidden * 2, nunmer_of_out)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        seq_len, batch, feat = recurrent.size()
        # Fold time and batch together so a single Linear call projects
        # every timestep at once, then restore the (T, b, out) shape.
        flattened = recurrent.view(seq_len * batch, feat)
        projected = self.embedding(flattened)  # [T * b, nOut]
        return projected.view(seq_len, batch, -1)


class CRNN(nn.Module):
    """Convolutional Recurrent Neural Network: a VGG-style CNN feature
    extractor followed by two stacked bidirectional LSTMs.

    Args:
        imgH: nominal input image height (only referenced by the
            commented-out multiple-of-16 check below).
        nc: number of input channels (3 is passed by the caller in this file).
        nclass: number of output classes (alphabet size, incl. CTC blank).
        nh: hidden size of each LSTM layer.
        n_rnn: unused in this implementation; kept for caller compatibility.
        leakyRelu: if True, use LeakyReLU(0.2) after each conv instead of ReLU.
    """

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        # assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        # Per-layer hyperparameters for the 7 conv layers:
        ks = [3, 3, 3, 3, 3, 3, 2]  # kernel sizes; original [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]  # paddings
        ss = [1, 1, 1, 1, 1, 1, 1]  # strides
        nm = [64, 128, 256, 256, 512, 512, 512]  # output channel counts

        cnn = nn.Sequential()

        def conv_relu(i, batchNormalization=False):
            # Append conv layer i (+ optional BatchNorm) and its activation
            # to `cnn`; module names ('conv0', 'relu0', ...) are part of the
            # state_dict layout and must not change.
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        conv_relu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64 original
        conv_relu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32 original
        conv_relu(2, True)
        conv_relu(3)
        # Rectangular pooling: halves the height, keeps width resolution
        # (stride 1 in the width direction) so the sequence stays long.
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16 original
        conv_relu(4, True)
        conv_relu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16 original
        conv_relu(6, True)  # 512x1x16
        # Extra max pull to bring down height to 1
        cnn.add_module('pooling{0}'.format(4),
                       nn.MaxPool2d((3, 3), (2, 4), (0, 1))) # final size 512x2x32
        self.cnn = cnn
        # Two stacked BiLSTMs; the second projects to the class alphabet.
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # conv features: (b, c, h, w)
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        # assert h == 1, "the height of conv must be 1"
        # NOTE(review): squeeze(2) only removes the height axis when h == 1.
        # If the final feature-map height were 2 (as the "final size 512x2x32"
        # comment above suggests), conv would stay 4-D and permute(2, 0, 1)
        # below would fail -- verify the actual height for your input size.
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]: width becomes the sequence axis

        # rnn features: (w, b, c) -> (w, b, nclass)
        output = self.rnn(conv)
        return output

以下是我的主程序（main）：

from data_preparation.dataloader import DataSetOCR
from utills.dataloader_services import *
from torch.utils.data import DataLoader
import parameters
from models import crnn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import CTCLoss
from utills.string_label_converter import averager, StrLabelConverter
import torch.nn.functional as F

# Training split: word images plus their ground-truth text labels.
train_dataset = DataSetOCR(
    csv_file_path=parameters.train_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.train_root,
)
assert train_dataset

# Held-out split used by val().
test_dataset = DataSetOCR(
    csv_file_path=parameters.test_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.test_root,
)
assert test_dataset

# Shared DataLoader settings; 'batch_size' is also read by trainBatch().
dataloader_params = {'batch_size': 2, 'shuffle': True, 'collate_fn': my_collate}

train_loader = DataLoader(train_dataset, **dataloader_params)
train_iter = iter(train_loader)

# custom weights initialization called on crnn
# custom weights initialization called on crnn
def weights_init(m):
    """DCGAN-style init: N(0, 0.02) conv weights; N(1, 0.02) BatchNorm
    scale with zero bias. Other module types are left untouched."""
    layer_type = m.__class__.__name__
    if 'Conv' in layer_type:
        m.weight.data.normal_(0.0, 0.02)
    elif 'BatchNorm' in layer_type:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


# Build the CRNN: input height, 3 channels, alphabet size, 256 LSTM hidden units.
crnn = crnn.CRNN(parameters.max_image_height, 3, parameters.number_of_classes, 256)
crnn.apply(weights_init)
criterion = CTCLoss()
optimizer = optim.Adam(crnn.parameters(), lr=0.001)
loss_avg = averager()  # running average of the training loss

# Pre-allocated staging buffers that loadData() fills in each batch.
# NOTE(review): torch.FloatTensor(...) allocates UNINITIALIZED memory, and
# nn.Conv2d expects (B, C, H, W) -- here width is passed before height, so
# confirm the layout against loadData() / my_collate.
image = torch.FloatTensor(2, 3, parameters.max_image_width, parameters.max_image_height)
text = torch.LongTensor(2 * 5)  # flattened label buffer; presumably batch 2 x 5 symbols -- confirm
length = torch.LongTensor(2)  # per-sample label lengths for CTC


string_converter = StrLabelConverter()
# string_converter.convert_integer_to_string()
# string_converter.convert_integer_to_string()

def val(net, dataset, criterion, max_iter=100):
    """Evaluate `net` on `dataset`.

    Prints the per-batch CTC loss and, at the end, the average loss and the
    word-level (exact-match) accuracy of greedy best-path decoding. Uses the
    module-level `image`/`text`/`length` staging buffers and
    `string_converter`, like the training path. Returns None.
    """
    print('Start val')

    # Freeze parameters for evaluation; the training loop re-enables them.
    for p in net.parameters():
        p.requires_grad = False
    net.eval()

    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=2, collate_fn=my_collate)
    val_iter = iter(data_loader)

    n_correct = 0
    n_total = 0
    loss_avg = averager()

    max_iter = min(max_iter, len(data_loader))
    with torch.no_grad():  # no gradients needed during evaluation
        for _ in range(max_iter):
            # Python 3 iterators have no .next() method; use builtin next().
            images, texts = next(val_iter)
            batch_size = images.size(0)
            loadData(image, images)
            t, l = string_converter.convert_string_to_integer(texts, [])
            loadData(text, t)
            loadData(length, l)

            # Use the `net` argument -- the original read the global `crnn`
            # instead, silently ignoring this parameter.
            preds = net(image)
            preds = F.log_softmax(preds, 2)  # CTCLoss expects log-probabilities
            preds_size = torch.IntTensor([preds.size(0)] * batch_size)
            cost = criterion(preds, text, preds_size, length) / batch_size
            print('validation {}'.format(cost))
            loss_avg.add(cost)

            # Greedy (best-path) decoding: argmax over classes per timestep,
            # then flatten [T, b] -> [b * T] for the label converter.
            # (The old squeeze(1) was a no-op for b == 2 and would break
            # the transpose for b == 1, so it is dropped.)
            _, preds = preds.max(2)
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = string_converter.convert_integer_to_string(preds.data, preds_size.data)
            cpu_texts = string_converter.convert_integer_to_string(text, length)
            for pred, target in zip(sim_preds, cpu_texts):
                n_total += 1
                if pred == target:
                    n_correct += 1

    accuracy = n_correct / float(n_total) if n_total else 0.0
    print('Test loss: %f, accuracy: %f' % (loss_avg.val(), accuracy))

def trainBatch(train_iter, criterion, optimizer):
    """Run one optimization step of the global `crnn` on the next batch from
    `train_iter` and return the per-sample CTC cost (a scalar tensor).
    """
    try:
        # Python 3 iterators have no .next() method; use builtin next().
        data = next(train_iter)
    except StopIteration:
        # NOTE(review): this rebinds only the LOCAL name; the caller keeps
        # passing its exhausted iterator, so after the first pass every call
        # lands here and reads only the first batch of a fresh (shuffled)
        # iterator. The training loop should recreate its iterator per epoch.
        train_iter = iter(train_loader)
        data = next(train_iter)

    images, texts = data
    loadData(image, images)
    t, l = string_converter.convert_string_to_integer(texts, [])
    loadData(text, t)
    loadData(length, l)
    # NOTE(review): assumes every batch is full-size; a trailing short batch
    # would make preds_size/length inconsistent -- confirm with my_collate.
    batch_size = dataloader_params['batch_size']

    optimizer.zero_grad()  # single grad reset per step (crnn.zero_grad() was redundant)
    preds = crnn(image)
    preds = F.log_softmax(preds, 2)  # CTCLoss expects log-probabilities
    preds_size = torch.IntTensor([preds.size(0)] * batch_size)

    cost = criterion(preds, text, preds_size, length) / batch_size
    cost.backward()
    optimizer.step()
    return cost

for epoch in range(25):
    # Fresh iterator each epoch. Without this, the single global iterator is
    # exhausted after epoch 0 and trainBatch's StopIteration fallback ends up
    # re-creating one per batch, only ever sampling first batches.
    train_iter = iter(train_loader)
    i = 0
    while i < len(train_loader):
        # Re-enable gradients (val() turns them off) and switch to train mode.
        for p in crnn.parameters():
            p.requires_grad = True
        crnn.train()

        cost = trainBatch(train_iter, criterion, optimizer)
        loss_avg.add(cost)
        i += 1

        # Report and validate every 50 batches (the two identical i % 50
        # checks are merged; print -> reset -> val order is unchanged).
        if i % 50 == 0:
            print('[%d/%d][%d/%d] Loss: %f' % (epoch, 25, i, len(train_loader), loss_avg.val()))
            loss_avg.reset()
            val(crnn, test_dataset, criterion)

        # TODO: periodically torch.save(crnn.state_dict(), ...) for checkpointing.
        #
（此处原为上述主程序代码的机器翻译重复内容，标识符和缩进已严重损坏、无法运行；完整且正确的代码请见上文从 import 语句到训练循环的部分。）