Python 操作tensorflow代码以添加不同的层
我正在试验用于文本分类的BERT嵌入。我正在使用这段代码创建一个用于二分类的BERT嵌入层和全连接(Dense)层。Python 操作tensorflow代码以添加不同的层,python,tensorflow,keras,deep-learning,lstm,Python,Tensorflow,Keras,Deep Learning,Lstm,我正在试验用于文本分类的BERT嵌入。我正在使用这段代码创建一个用于二分类的BERT嵌入层和全连接(Dense)层 # Initialize session sess = tf.Session() class PaddingInputExample(object): """Fake example so the num input examples is a multiple of the batch size. When running eval/predict on the TPU, we n
# Initialize the module-level TensorFlow session. The tokenizer helper
# (`sess.run`) and `main()` (variable initialization) both read this global.
sess = tf.Session()
class PaddingInputExample(object):
    """Placeholder example that pads the input to a multiple of the batch size.

    When running eval/predict on the TPU, the number of examples has to be
    padded to a multiple of the batch size, because the TPU requires a fixed
    batch size. The alternative is to drop the last batch, which is bad
    because it means the entire output data won't be generated.

    This class is used instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Construct an InputExample.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For
            single sequence tasks, only this sequence must be specified.
          text_b: (Optional) string. The untokenized text of the second
            sequence. Only must be specified for sequence pair tasks.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        """Return a constructor-like representation to ease debugging."""
        return (
            f"{type(self).__name__}(guid={self.guid!r}, text_a={self.text_a!r}, "
            f"text_b={self.text_b!r}, label={self.label!r})"
        )
def create_tokenizer_from_hub_module(bert_path):
    """Build a `FullTokenizer` from the tokenization info of a BERT Hub module.

    Args:
      bert_path: Handle passed to `hub.Module` to load the BERT module.

    Returns:
      A `FullTokenizer` constructed from the module's `vocab_file` and
      `do_lower_case` values.
    """
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)
    # Both values are fetched by running them in the module-level session.
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_file, do_lower_case = sess.run(fetches)
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into fixed-length BERT input lists.

    Args:
      tokenizer: Object providing `tokenize(text)` and
        `convert_tokens_to_ids(tokens)`.
      example: An `InputExample` (only `text_a` and `label` are read) or a
        `PaddingInputExample`.
      max_seq_length: Length of each of the three returned lists.

    Returns:
      A tuple `(input_ids, input_mask, segment_ids, label)`. For a
      `PaddingInputExample` all three lists are zeros and the label is 0.
    """
    if isinstance(example, PaddingInputExample):
        # Three independent all-zero lists, one per feature.
        input_ids, input_mask, segment_ids = (
            [0] * max_seq_length for _ in range(3)
        )
        return input_ids, input_mask, segment_ids, 0

    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text_a)
    if len(pieces) > budget:
        pieces = pieces[:budget]

    tokens = ["[CLS]", *pieces, "[SEP]"]
    # Single-sequence input: every position belongs to segment 0.
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    missing = max_seq_length - len(input_ids)
    if missing > 0:
        filler = [0] * missing
        input_ids.extend(filler)
        input_mask.extend(filler)
        segment_ids.extend(filler)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    return input_ids, input_mask, segment_ids, example.label
def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a collection of examples into four NumPy arrays.

    Each example is passed through `convert_single_example`; a progress bar
    is shown while iterating.

    Args:
      tokenizer: Tokenizer forwarded to `convert_single_example`.
      examples: Iterable of examples to convert.
      max_seq_length: Sequence length forwarded to `convert_single_example`.

    Returns:
      A tuple `(input_ids, input_masks, segment_ids, labels)` of arrays, with
      `labels` reshaped into a single column.
    """
    progress = tqdm(examples, desc="Converting examples to features")
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in progress
    ]
    # Split the per-example tuples into one column per feature.
    return (
        np.array([row[0] for row in converted]),
        np.array([row[1] for row in converted]),
        np.array([row[2] for row in converted]),
        np.array([row[3] for row in converted]).reshape(-1, 1),
    )
def convert_text_to_examples(texts, labels):
    """Wrap parallel texts and labels into `InputExample` objects.

    Each item of `texts` is joined with single spaces to form `text_a`, so it
    is expected to be an iterable of strings. Pairing uses `zip`, so extra
    items in the longer input are ignored.

    Returns:
      A list of `InputExample` with `guid` and `text_b` set to None.
    """
    return [
        InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]
class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a TF-Hub BERT module, returning one vector per example.

    Args:
      n_fine_tune_layers: Number of encoder layers, counted down from
        `encoder/layer_11`, whose variables are registered as trainable.
      pooling: "first" to return the module's `pooled_output`, or "mean" to
        return the mask-weighted mean of `sequence_output`.
      bert_path: Handle passed to `hub.Module`.
      **kwargs: Forwarded to `tf.keras.layers.Layer.__init__`.

    Raises:
      NameError: If `pooling` is neither "first" nor "mean".
    """

    _POOLING_TYPES = ("first", "mean")

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        # NOTE(review): attributes are assigned before the base initializer
        # runs, exactly as in the original; presumably a `trainable` keyword
        # supplied by the caller then overrides the True set here - confirm
        # against the Keras version in use.
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in self._POOLING_TYPES:
            raise self._undefined_pooling_error()
        super(BertLayer, self).__init__(**kwargs)

    def _undefined_pooling_error(self):
        """Build the error raised for an unsupported `pooling` value."""
        return NameError(
            f"Undefined pooling type (must be either first or mean, but is {self.pooling})"
        )

    def build(self, input_shape):
        """Load the Hub module and sort its variables into trainable/frozen."""
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Variables whose names contain these tags are never fine-tuned.
        if self.pooling == "first":
            excluded_tags = ("/cls/",)
            trainable_layers = ["pooler/dense"]
        elif self.pooling == "mean":
            excluded_tags = ("/cls/", "/pooler/")
            trainable_layers = []
        else:
            # Reachable only if `pooling` was changed after construction.
            raise self._undefined_pooling_error()

        candidates = [
            var
            for var in self.bert.variables
            if not any(tag in var.name for tag in excluded_tags)
        ]

        # Select how many encoder layers to fine-tune, counting down from
        # layer 11 (the default `bert_path` names a 12-layer model, "L-12").
        trainable_layers.extend(
            f"encoder/layer_{11 - i}" for i in range(self.n_fine_tune_layers)
        )

        # Keep only the variables that belong to one of the selected layers.
        trainable_vars = [
            var
            for var in candidates
            if any(layer in var.name for layer in trainable_layers)
        ]

        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Every other module variable is registered as non-trainable.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on `[input_ids, input_mask, segment_ids]` and pool the result."""
        if self.pooling not in self._POOLING_TYPES:
            raise self._undefined_pooling_error()

        input_ids, input_mask, segment_ids = [
            K.cast(x, dtype="int32") for x in inputs
        ]
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        outputs = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)

        if self.pooling == "first":
            return outputs["pooled_output"]

        # Mean pooling: sum the sequence output over axis 1 with masked-out
        # positions zeroed, then divide by the number of unmasked positions.
        sequence_output = outputs["sequence_output"]
        mask = tf.cast(input_mask, tf.float32)
        masked_sum = tf.reduce_sum(
            sequence_output * tf.expand_dims(mask, axis=-1), axis=1
        )
        # The small epsilon avoids dividing by zero for an all-zero mask.
        token_count = tf.reduce_sum(mask, axis=1, keepdims=True) + 1e-10
        return masked_sum / token_count

    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], output_size)`."""
        return (input_shape[0], self.output_size)

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super(BertLayer, self).get_config()
        config.update(
            {
                "n_fine_tune_layers": self.n_fine_tune_layers,
                "pooling": self.pooling,
                "bert_path": self.bert_path,
            }
        )
        return config
# Build model
def build_model(max_seq_length):
    """Assemble and compile the BERT binary classifier.

    The model takes three inputs of length `max_seq_length` (ids, masks and
    segment ids), feeds them through a `BertLayer` with 3 fine-tuned layers,
    then a 256-unit ReLU layer and a single sigmoid output unit.

    Args:
      max_seq_length: Length of each of the three input sequences.

    Returns:
      The compiled Keras model; its summary is printed as a side effect.
    """
    layers = tf.keras.layers
    input_names = ("input_ids", "input_masks", "segment_ids")
    bert_inputs = [
        layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in input_names
    ]

    pooled = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    hidden = layers.Dense(256, activation="relu")(pooled)
    pred = layers.Dense(1, activation="sigmoid")(hidden)

    # NOTE: an experimental variant was left commented out here. It reshaped
    # the BertLayer output to (max_seq_length, 768) and fed it to a
    # 100-unit sigmoid GRU before the same two dense layers.
    # BertLayer.compute_output_shape reports (batch, 768), so that reshape
    # target does not match the layer's output.

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()
    return model
def initialize_vars(sess):
    """Run the local, global and table initializers, then register the session.

    Args:
      sess: Session in which the initializers are run; it is also handed to
        the Keras backend via `K.set_session`.
    """
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created right before it is run, in this order.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)
def _prepare_split(df, max_seq_length):
    """Return `(texts, labels)` for one split of the data frame.

    Each entry of the "words" column is truncated to its first
    `max_seq_length` whitespace-separated words (to bound memory use), and the
    texts are returned as an object array with one text per row.
    """
    texts = [" ".join(t.split()[0:max_seq_length]) for t in df["words"].tolist()]
    texts = np.array(texts, dtype=object)[:, np.newaxis]
    labels = df["new_grouping"].tolist()
    return texts, labels


def main():
    """Train the BERT classifier on `master_df` and save it.

    Reads the module-level `master_df` (columns "words" and "new_grouping")
    and the module-level session `sess`. The first 80% of the rows are used
    for training and the remainder for validation. Weights are checkpointed
    to "bert_dir/cp.ckpt" during training and the final model is written to
    "bert_1.h5".

    Returns:
      The value returned by `model.fit`.
    """
    # Params for bert model and tokenization
    bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
    max_seq_length = 256

    # Compute the split point once so both slices agree on the boundary.
    split_at = round(len(master_df) * .8)
    train_df, test_df = master_df[:split_at], master_df[split_at:]

    # Create datasets (only take up to max_seq_length words for memory)
    train_text, train_label = _prepare_split(train_df, max_seq_length)
    test_text, test_label = _prepare_split(test_df, max_seq_length)

    # Instantiate tokenizer
    tokenizer = create_tokenizer_from_hub_module(bert_path)

    # Convert data to InputExample format
    train_examples = convert_text_to_examples(train_text, train_label)
    test_examples = convert_text_to_examples(test_text, test_label)

    # Convert to features
    (
        train_input_ids,
        train_input_masks,
        train_segment_ids,
        train_labels,
    ) = convert_examples_to_features(
        tokenizer, train_examples, max_seq_length=max_seq_length
    )
    (
        test_input_ids,
        test_input_masks,
        test_segment_ids,
        test_labels,
    ) = convert_examples_to_features(
        tokenizer, test_examples, max_seq_length=max_seq_length
    )

    model = build_model(max_seq_length)

    # Instantiate variables
    initialize_vars(sess)

    checkpoint_path = "bert_dir/cp.ckpt"
    # Derive the directory from the variable (the original passed the literal
    # string 'checkpoint_path') and make sure it exists before training.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Create a callback that saves the model's weights
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        verbose=1,
    )

    history = model.fit(
        [train_input_ids, train_input_masks, train_segment_ids],
        train_labels,
        validation_data=(
            [test_input_ids, test_input_masks, test_segment_ids],
            test_labels,
        ),
        epochs=1,
        batch_size=32,
        callbacks=[cp_callback],
    )
    model.save('bert_1.h5')
    return history
# Run training only when the file is executed as a script; the value returned
# by main() is kept in a module-level name.
if __name__ == "__main__":
    history = main()
这篇文章提出了一个类似的问题:
然而,这篇文章的解决方案对我来说并不适用。该帖子建议这样做:
# Variant suggested in the linked post. The asker reports that it fails with a
# shape mismatch between the (9300, 1) targets and a (None, 256, 1) output.
embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids")
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs)
# NOTE(review): the BertLayer defined earlier reports an output shape of
# (batch, 768) from compute_output_shape - verify that this Reshape target is
# compatible with it.
bert_output = Reshape((max_seq_length, embedding_size))(bert_output)
# NOTE(review): return_sequences=True presumably keeps one output per step, so
# the Dense layer below would produce a 3-D output - confirm.
bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
# NOTE(review): output_size is not defined in this snippet.
output = Dense(output_size, activation="softmax")(bilstm)
但我得到了一个错误:
ValueError: A target array with shape (9300, 1) was passed for an output
of shape (None, 256, 1) while using as loss `binary_crossentropy`.
编辑1
当我尝试使用下面Il.SQ建议的代码时,我得到以下错误:
---------------------------------------------------------------------------
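ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input> in <module>()
    372 
    373 if __name__ == "__main__":
--> 374     history, train_df, val_df = main()
5 frames
<ipython-input> in main()
    363         epochs=1,
    364         batch_size=32,
--> 365         callbacks=[cp_callback]
    366     )
    367 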
/usr/local/lib/python3.6/dist-packages/tensorflow\u core/python/keras/engine/training.py in fit(self、x、y、批大小、历元、冗余、回调、验证拆分、验证数据、无序排列、类权重、样本权重、初始历元、每历元步数、验证步骤、验证频率、最大队列大小、工作人员、使用多处理、**kwargs)
725最大队列大小=最大队列大小,
726名工人=工人,
-->727使用多处理=使用多处理)
728
729 def评估(自我,
/usr/local/lib/python3.6/dist-packages/tensorflow\u core/python/keras/engine/training\u arrays.py in fit(self、model、x、y、批大小、历元、冗余、回调、验证拆分、验证数据、洗牌、类权重、样本权重、初始历元、每历元的步骤、验证步骤、验证频率、**kwargs)
673验证步骤=验证步骤,
674验证频率=验证频率,
-->675个步骤(每个时代的步骤)
676
677 def评估(自我,
/模型迭代中的usr/local/lib/python3.6/dist-packages/tensorflow\u core/python/keras/engine/training\u arrays.py(模型、输入、目标、样本权重、批量大小、年代、详细程度、回调、val_输入、val_目标、val_样本权重、无序、初始历元、每历元步长、验证步骤、验证频率、模式、验证拟合、从数据集准备的反馈值、步骤名称、**kwargs)
392
393#获取输出。
-->394批次输出=f(批次输入)
395如果不存在(批次,列表):
396批次输出=[批次输出]
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py in_u_调用(self,inputs)
3474
3475 fetched=self.\u callable\u fn(*数组\u vals,
->3476运行单元元数据=self.run单元元数据)
3477 self.\u call\u fetch\u callbacks(fetched[-len(self.\u fetches):]))
3478输出_结构=nest.pack_序列_as(
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in_u_________(self,*args,**kwargs)
1470 ret=tf_session.tf_SessionRunCallable(self._session._session,
1471自动控制手柄,args,
->1472运行元数据(ptr)
1473如果运行元数据:
1474 proto_data=tf_session.tf_GetBuffer(运行元数据ptr)
ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[32,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bert_layer_6/bert_layer_6_module_apply_tokens/bert/encoder/layer_9/attention/self/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
	 [[loss_2/mul/_8343]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
  (1) Resource exhausted: OOM when allocating tensor with shape[32,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bert_layer_6/bert_layer_6_module_apply_tokens/bert/encoder/layer_9/attention/self/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
0 successful operations.
0 derived errors ignored.
首先,减小批大小
然后改为:
这将添加一个 GlobalMaxPooling1D(全局最大池化)层,用来将 LSTM 的输出展平:
# Answer's suggested variant: same inputs and BertLayer as before, but with a
# different Reshape target and a pooling layer before the final Dense layer.
embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids")
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs)
# Reshape each pooled vector to (embedding_size, 1); presumably this gives the
# LSTM the 3-D input it expects, with every feature treated as one step -
# TODO confirm this is the intended interpretation.
bert_output = Reshape((embedding_size,1))(bert_output)
bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
# Pool over the LSTM's per-step outputs; the answer describes this as
# flattening the output before the final Dense layer.
pool=GlobalMaxPooling1D()(bilstm)
# NOTE(review): output_size is not defined in this snippet; the asker reports
# using 1 together with a "sigmoid" activation.
output = Dense(output_size, activation="softmax")(pool)
如果不起作用,您的训练输入可能无效。我尝试了您的代码(做了一些修改:将 output_size 改为 1,并将 activation 改为 "sigmoid")。这引发了另一个错误,我已将新的回溯信息放在帖子底部,请参见上面的"编辑1"。此外,当我直接把 BERT 层接到 Dense 层时,我的训练输入工作得很好,所以我认为这部分应该没有问题。从您的错误中可以看出,TensorFlow 无法将张量重塑为您指定的大小。您能打印出第一个 Dense 模型的摘要吗?如果您不知道怎么做,方法是:
model.summary()
谢谢!谢谢,我已经添加了 Dense 模型的摘要,该模型可以正常工作,而 LSTM 模型不行。这与最大序列长度无关。BERT 层只会输出 (batch_size, 768)。我知道这一点,因为我打印了 BertLayer 的输出,张量形状是 (None, 768)。顺便说一句,如果我的答案有效,您能把它标记为正确答案吗?谢谢。它们没有太大区别。人们使用它们的目的相同,即把输出展平;但通常人们使用 GlobalMaxPooling1D
或 GlobalAveragePooling1D
,它们会给出更好的结果(就我的情况而言)。
# Repeat of the answer's suggested variant (BertLayer -> Reshape ->
# bidirectional LSTM -> global max pooling -> Dense).
embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids")
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs)
# Reshape each pooled vector to (embedding_size, 1); presumably this gives the
# LSTM the 3-D input it expects - TODO confirm.
bert_output = Reshape((embedding_size,1))(bert_output)
bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
# Pool over the LSTM's per-step outputs before the final Dense layer.
pool=GlobalMaxPooling1D()(bilstm)
# NOTE(review): output_size is not defined in this snippet.
output = Dense(output_size, activation="softmax")(pool)