在932张训练图像上训练10个epoch后,CRNN模型没有学到任何东西

我试图使用CRNN方法从孟加拉语单词的图像中预测单词。我在本地环境中用932张图片训练了10个epoch。损失在下降,但准确率为0%——模型似乎在随机猜测,没有学到任何东西。以下是我的模型:
import torch.nn as nn
class BidirectionalLSTM(nn.Module):
    """A bidirectional LSTM followed by a per-timestep linear projection.

    Maps a feature sequence of shape [T, b, number_of_input] to
    [T, b, nunmer_of_out].  (The misspelled parameter name is kept so
    keyword callers keep working.)
    """

    def __init__(self, number_of_input, number_of_hidden, nunmer_of_out):
        super(BidirectionalLSTM, self).__init__()
        # bidirectional=True => the LSTM emits 2 * hidden features per step,
        # hence the doubled input size of the projection layer.
        self.rnn = nn.LSTM(number_of_input, number_of_hidden, bidirectional=True)
        self.embedding = nn.Linear(number_of_hidden * 2, nunmer_of_out)

    def forward(self, input):
        recurrent, _ = self.rnn(input)          # [T, b, 2 * hidden]
        seq_len, batch, feat = recurrent.size()
        # Collapse time and batch so a single Linear projects every step.
        flat = recurrent.view(seq_len * batch, feat)
        projected = self.embedding(flat)        # [T * b, nunmer_of_out]
        return projected.view(seq_len, batch, -1)
class CRNN(nn.Module):
    """Convolutional feature extractor + two stacked bidirectional LSTMs.

    The CNN reduces an input image [b, nc, H, W] to a feature map of height 1,
    which is then read column-by-column as a sequence for the RNN head.
    Output shape is [w, b, nclass] (time-major, as CTC expects).

    NOTE(review): the trailing pooling layer's own comment says the final
    height may be 2; forward() assumes height 1 after the CNN — confirm with
    the actual imgH used.
    """

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        # Per-layer conv hyper-parameters (kernel, padding, stride, channels).
        kernel_sizes = [3, 3, 3, 3, 3, 3, 2]  # original [3, 3, 3, 3, 3, 3, 2]
        paddings = [1, 1, 1, 1, 1, 1, 0]
        strides = [1, 1, 1, 1, 1, 1, 1]
        channels = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def add_conv_block(idx, with_bn=False):
            # One conv (+ optional BN) + activation, named exactly as before
            # so state dicts remain compatible.
            in_ch = nc if idx == 0 else channels[idx - 1]
            cnn.add_module('conv{0}'.format(idx),
                           nn.Conv2d(in_ch, channels[idx], kernel_sizes[idx],
                                     strides[idx], paddings[idx]))
            if with_bn:
                cnn.add_module('batchnorm{0}'.format(idx),
                               nn.BatchNorm2d(channels[idx]))
            activation = (nn.LeakyReLU(0.2, inplace=True) if leakyRelu
                          else nn.ReLU(True))
            cnn.add_module('relu{0}'.format(idx), activation)

        add_conv_block(0)
        cnn.add_module('pooling0', nn.MaxPool2d(2, 2))            # 64x16x64 original
        add_conv_block(1)
        cnn.add_module('pooling1', nn.MaxPool2d(2, 2))            # 128x8x32 original
        add_conv_block(2, True)
        add_conv_block(3)
        # Asymmetric pooling: halve height, keep width resolution.
        cnn.add_module('pooling2', nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16 original
        add_conv_block(4, True)
        add_conv_block(5)
        cnn.add_module('pooling3', nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16 original
        add_conv_block(6, True)                                   # 512x1x16
        # Extra max pool to bring down height to 1
        cnn.add_module('pooling4', nn.MaxPool2d((3, 3), (2, 4), (0, 1)))  # final size 512x2x32

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # CNN features: [b, c, h, w]; h is assumed to be 1 here.
        features = self.cnn(input)
        batch, depth, height, width = features.size()
        # Drop the (assumed singleton) height dim, then make the tensor
        # time-major: [w, b, c] — one RNN step per image column.
        features = features.squeeze(2)
        sequence = features.permute(2, 0, 1)
        return self.rnn(sequence)
以下是我的主程序(main):
from data_preparation.dataloader import DataSetOCR
from utills.dataloader_services import *
from torch.utils.data import DataLoader
import parameters
from models import crnn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import CTCLoss
from utills.string_label_converter import averager, StrLabelConverter
import torch.nn.functional as F
# Dataset / dataloader setup.
# NOTE(review): DataSetOCR, parameters and my_collate are project-local
# (imported above); their exact contracts are assumed here.
train_dataset = DataSetOCR(
    csv_file_path= parameters.train_csv_path,
    text_file_path= parameters.text_file_path,
    root_directory= parameters.train_root)
assert train_dataset  # fail fast if construction returned something falsy

test_dataset = DataSetOCR(
    csv_file_path= parameters.test_csv_path,
    text_file_path= parameters.text_file_path,
    root_directory= parameters.test_root)
assert test_dataset

# Small batches; my_collate presumably pads variable-width word images —
# TODO confirm against utills.dataloader_services.
dataloader_params = {
    'batch_size': 2,
    'shuffle': True,
    'collate_fn': my_collate
}
train_loader = DataLoader(train_dataset, **dataloader_params)
train_iter = iter(train_loader)  # consumed batch-by-batch in trainBatch()
# custom weights initialization called on crnn
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(0.0, 0.02)
elif classname.find('BatchNorm') != -1:
m.weight.data.normal_(1.0, 0.02)
m.bias.data.fill_(0)
# Build the model: 3-channel input, one output unit per character class,
# 256 LSTM hidden units.  NOTE: this rebinds the name `crnn` from the
# imported module to the model instance.
crnn = crnn.CRNN(parameters.max_image_height, 3, parameters.number_of_classes, 256)
crnn.apply(weights_init)  # DCGAN-style init on every submodule
criterion = CTCLoss()
optimizer = optim.Adam(crnn.parameters(), lr=0.001)
loss_avg = averager()  # running average of the training loss

# Reusable buffers refilled in place by loadData() for every batch.
# NOTE(review): the image buffer is laid out (N, C, width, height) while
# Conv2d consumes (N, C, H, W) — confirm the W/H order against the
# dataloader; a swap here would feed transposed images to the CNN.
image = torch.FloatTensor(2, 3, parameters.max_image_width, parameters.max_image_height)
text = torch.LongTensor(2 * 5)
length = torch.LongTensor(2)
string_converter = StrLabelConverter()
# string_converter.convert_integer_to_string()
# string_converter.convert_integer_to_string()
def val(net, dataset, criterion, max_iter=100):
    """Evaluate `net` on `dataset`.

    Runs at most `max_iter` batches, printing the per-batch CTC loss and a
    final loss/accuracy summary.  A prediction counts as correct only when
    the greedily decoded string matches the ground truth exactly.

    Uses the module-level `image`/`text`/`length` buffers, `loadData`,
    `my_collate` and `string_converter`, like the rest of this script.
    """
    print('Start val')
    net.eval()  # BUG FIX: eval/requires_grad were applied to the global `crnn`, ignoring `net`
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=2, collate_fn=my_collate)
    val_iter = iter(data_loader)
    n_correct = 0
    n_total = 0
    loss_avg = averager()
    max_iter = min(max_iter, len(data_loader))
    # torch.no_grad() replaces the old manual `p.requires_grad = False` loop,
    # which mutated the global model's flags and relied on the training loop
    # to restore them afterwards.
    with torch.no_grad():
        for _ in range(max_iter):
            images, texts = next(val_iter)  # BUG FIX: `.next()` is Python 2 only
            batch_size = images.size(0)
            loadData(image, images)
            t, l = string_converter.convert_string_to_integer(texts, [])
            loadData(text, t)
            loadData(length, l)
            preds = net(image)                 # BUG FIX: was calling the global `crnn`
            preds = F.log_softmax(preds, 2)    # CTCLoss expects log-probabilities
            preds_size = torch.IntTensor([preds.size(0)] * batch_size)
            cost = criterion(preds, text, preds_size, length) / batch_size
            print('validation {}'.format(cost))
            loss_avg.add(cost)
            # Greedy decoding: most likely class per time step, then flatten
            # batch-major for the converter.  (The old `squeeze(1)` was a
            # no-op for batch > 1 and broke transpose for batch == 1.)
            _, decoded = preds.max(2)
            decoded = decoded.transpose(1, 0).contiguous().view(-1)
            sim_preds = string_converter.convert_integer_to_string(decoded.data, preds_size.data)
            cpu_texts = string_converter.convert_integer_to_string(text, length)
            # BUG FIX: accuracy accounting was entirely commented out, so the
            # script could never report anything but 0%.
            for pred, target in zip(sim_preds, cpu_texts):
                if pred == target:
                    n_correct += 1
            n_total += batch_size
    accuracy = n_correct / float(n_total) if n_total else 0.0
    print('Test loss: %f, accuracy: %f' % (loss_avg.val(), accuracy))
def trainBatch(train_iter, criterion, optimizer):
    """Run one optimisation step on the next training batch.

    Fills the module-level `image`/`text`/`length` buffers via loadData(),
    computes the CTC loss on `crnn`'s predictions and steps the optimizer.
    Returns the (batch-averaged) cost tensor.
    """
    try:
        data = next(train_iter)  # BUG FIX: `.next()` is Python 2 only
    except StopIteration:
        # NOTE(review): rebinding the local does not update the caller's
        # iterator, so once the loader is exhausted every later call lands
        # here and builds (then discards) a fresh iterator.  Harmless but
        # wasteful; fixing it properly needs an interface change.
        train_iter = iter(train_loader)
        data = next(train_iter)
    images, texts = data
    loadData(image, images)
    t, l = string_converter.convert_string_to_integer(texts, [])
    loadData(text, t)
    loadData(length, l)
    # BUG FIX: use the actual batch size — the final batch of an epoch can be
    # smaller than dataloader_params['batch_size'].
    batch_size = images.size(0)
    optimizer.zero_grad()  # clears crnn's grads; the old extra crnn.zero_grad() was redundant
    preds = crnn(image)
    preds = F.log_softmax(preds, 2)  # CTCLoss expects log-probabilities
    preds_size = torch.IntTensor([preds.size(0)] * batch_size)
    cost = criterion(preds, text, preds_size, length) / batch_size
    cost.backward()
    optimizer.step()
    return cost
# Main training loop: 25 epochs over the training loader.
for epoch in range(25):
    i = 0
    while i < len(train_loader):
        # Re-enable gradients and train mode every iteration, because val()
        # disables them on the (global) model.
        for p in crnn.parameters():
            p.requires_grad = True
        crnn.train()
        cost = trainBatch(train_iter, criterion, optimizer)
        loss_avg.add(cost)
        i += 1
        # Report the averaged loss every 50 batches, then reset the average.
        if i % 50 == 0:
            print('[%d/%d][%d/%d] Loss: %f' % (epoch, 25, i, len(train_loader), loss_avg.val()))
            loss_avg.reset()
        # Run validation on the same cadence (could be merged with the block above).
        if i % 50 == 0:
            val(crnn, test_dataset, criterion)
# # do checkpointing
# if i % 1 == 0:
#     torch.save(crnn.state_dict(), '{0}/netCRNN_{1}_{2}.pth'.format('/home/bjit-531/PycharmProjects/python/bangla-ocr-version-2/bangla-ocr-version-2/weights/', epoch, i))
#
(此处原为上方代码清单经机器翻译产生的乱码重复内容——抓取/翻译过程将代码逐行译成了中文,无任何可运行或新增的信息。原始代码请参见本文上方的两段 Python 清单。)