TensorFlow:用于嵌入文本数据的 TFRecords

Tensorflow 用于嵌入文本数据的TFR记录,tensorflow,tensorflow-datasets,tfrecord,question-answering,Tensorflow,Tensorflow Datasets,Tfrecord,Question Answering,对于Uni的一个项目,我正在使用TensorFlow中的神经网络实现一个问答系统(目前为bAbI数据集任务5,请参阅),我想将TFRecords用于我的输入管道 我的想法是,TFRecords术语中的一个例子应该包括问题的上下文、问题本身、答案和支持的句子编号(int,它指向上下文中能够回答问题的最重要的句子)。下面是我如何定义函数的: def make_example(context, question, answer, support): ex = tf.train.SequenceExa

对于Uni的一个项目,我正在使用TensorFlow中的神经网络实现一个问答系统(目前为bAbI数据集任务5,请参阅),我想将TFRecords用于我的输入管道

我的想法是,TFRecords术语中的一个例子应该包括问题的上下文、问题本身、答案和支持的句子编号(int,它指向上下文中能够回答问题的最重要的句子)。下面是我如何定义函数的:

def make_example(context, question, answer, support):
    """Build a ``tf.train.SequenceExample`` for one question-answering sample.

    Args:
        context: iterable of integer token ids for the context.
        question: iterable of integer token ids for the question.
        answer: iterable of integer token ids for the answer.
        support: integer index of the supporting sentence.

    Returns:
        The populated ``tf.train.SequenceExample``.

    Note:
        Every value is written to an ``int64_list``, which only accepts
        integers. Passing embedding vectors (float arrays) instead of token
        ids raises a ``TypeError``.
    """
    ex = tf.train.SequenceExample()

    # ``support`` is a single per-example value, so it is stored once in the
    # context features; it does not need a feature list of its own.
    ex.context.feature["support"].int64_list.value.append(support)

    fl_context = ex.feature_lists.feature_list["context"]
    fl_question = ex.feature_lists.feature_list["question"]
    fl_answer = ex.feature_lists.feature_list["answer"]

    # One Feature per token, so each sequence keeps its step structure.
    for token in context:
        fl_context.feature.add().int64_list.value.append(token)
    for q_word in question:
        fl_question.feature.add().int64_list.value.append(q_word)
    for ans_word in answer:
        fl_answer.feature.add().int64_list.value.append(ans_word)

    return ex
但是,在传递上下文、问题和答案之前,我想先对单词做嵌入,用它们的 GloVe 向量来表示,即一个 (m, d) 矩阵,其中 m 是句子中的标记数,d 是每个词向量的维数。我的
make_example
函数似乎没有很好地处理这一问题,因为我得到:

TypeError: (array([[ -9.58490000e-01,   1.73210000e-01,   
2.51650000e-01,
 -5.61450000e-01,  -1.21440000e-01,   1.54350000e+00,
 -1.28930000e+00,  -9.77790000e-01,  -1.35480000e-01,
 -6.06930000e-01,  -1.37810000e+00,   6.33470000e-01,
  1.33160000e-01,   2.46320000e-01,   6.60260000e-01,
 -4.46130000e-02,   4.09510000e-01,  -7.61670000e-01,
  4.67530000e-01,  -6.67810000e-01,   2.99850000e-01,
 -2.74810000e-01,  -5.47990000e-01,  -8.56820000e-01,
  5.30880000e-02,  -2.01700000e+00,   7.48530000e-01,
 -1.27830000e-01,   1.32050000e-01,  -2.19450000e-01,
  2.29830000e+00,  -3.17680000e-01,  -8.64940000e-01,
 -1.08630000e-01,  -8.13770000e-02,  -7.03420000e-01,
  4.60000000e-01,  -3.34730000e-01,   4.37030000e-02,
 -7.55080000e-01,  -6.89710000e-01,   7.14380000e-01,
 -8.35950000e-02,   1.58620000e-02,  -5.23850000e-01,
  1.72520000e-01,  -4.98740000e-01,   2.30810000e-01,
 -3.64690000e-01,   1.5 has type <class 'tuple'>, but expected one of: 
(<class 'int'>,)
类型错误:(array([[ -9.58490000e-01,   1.73210000e-01,
2.51650000e-01,
 -5.61450000e-01,  -1.21440000e-01,   1.54350000e+00,
 -1.28930000e+00,  -9.77790000e-01,  -1.35480000e-01,
 -6.06930000e-01,  -1.37810000e+00,   6.33470000e-01,
  1.33160000e-01,   2.46320000e-01,   6.60260000e-01,
 -4.46130000e-02,   4.09510000e-01,  -7.61670000e-01,
  4.67530000e-01,  -6.67810000e-01,   2.99850000e-01,
 -2.74810000e-01,  -5.47990000e-01,  -8.56820000e-01,
  5.30880000e-02,  -2.01700000e+00,   7.48530000e-01,
 -1.27830000e-01,   1.32050000e-01,  -2.19450000e-01,
  2.29830000e+00,  -3.17680000e-01,  -8.64940000e-01,
 -1.08630000e-01,  -8.13770000e-02,  -7.03420000e-01,
  4.60000000e-01,  -3.34730000e-01,   4.37030000e-02,
 -7.55080000e-01,  -6.89710000e-01,   7.14380000e-01,
 -8.35950000e-02,   1.58620000e-02,  -5.23850000e-01,
  1.72520000e-01,  -4.98740000e-01,   2.30810000e-01,
 -3.64690000e-01,   1.5 的类型为 <class 'tuple'>,但预期为以下类型之一:
(<class 'int'>,)
指向上面的
fl_context.feature.add().int64_list.value.append(token)
,有人能指出我在哪里误解了TFRecords的概念,并给我一个解决问题的建议吗?
我搜索了很多学习资料,但通常TFRecords上的示例都是图像数据。到目前为止,我的参考文献是和


非常感谢!

我的问题的答案可以在这里找到:

我的做法如下:

  • 将文本存储到csv文件中:每行(上下文、问题、答案)

  • 定义一个将序列转换为 tf.train.SequenceExample 的函数(这是我的做法)

    def sequence_to_tf_example(context, question, answer):
        """Convert one (context, question, answer) triple into a SequenceExample.

        Each input is passed through ``vectorize`` with ``word_to_index``
        (both defined outside this function; presumably they map text to
        integer ids -- confirm against their definitions). Every resulting id
        is stored as its own ``int64_list`` Feature in the feature list named
        "context", "question" or "answer".

        Returns:
            The populated ``tf.train.SequenceExample``.
        """
        # Vectorize in the fixed order context -> question -> answer. Only the
        # answer is vectorized with the flag set to True.
        id_sequences = (
            ("context", vectorize(context, False, word_to_index)),
            ("question", vectorize(question, False, word_to_index)),
            ("answer", vectorize(answer, True, word_to_index)),
        )

        ex = tf.train.SequenceExample()
        feature_lists = ex.feature_lists.feature_list

        # Create all three feature lists up front, then fill them.
        targets = [(feature_lists[name], ids) for name, ids in id_sequences]
        for target, ids in targets:
            for token_id in ids:
                target.feature.add().int64_list.value.append(token_id)

        return ex
    
  • 定义写函数

    def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
        """Serialize one (context, question, answer) triple and write it.

        Args:
            context, question, answer: forwarded to ``sequence_to_tf_example``.
            tfrecord_file: not used by this function; kept in the signature
                for existing callers.
            writer: object whose ``write`` method receives the serialized bytes.
        """
        serialized = sequence_to_tf_example(context, question, answer).SerializeToString()
        writer.write(serialized)
    
    def write_data_to_tf_record(filename):
        """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

        The first CSV row is treated as a header and skipped. For every
        remaining row, columns 0, 1 and 2 are written as the context,
        question and answer of one record.

        Args:
            filename: path without extension, shared by the input CSV and the
                output TFRecords file.
        """
        file_csv = filename + '.csv'
        file_tfrecords = filename + '.tfrecords'
        # newline='' is what the csv module expects for files it reads.
        with open(file_csv, newline='') as csvfile:
            rows = csv.reader(csvfile, delimiter=',')
            next(rows, None)  # skip header; tolerate an empty file
            # The with-block closes the writer even if a row fails to convert.
            with tf.python_io.TFRecordWriter(file_tfrecords) as writer:
                for row in rows:
                    write_example_to_tfrecord(row[0], row[1], row[2], file_tfrecords, writer)
    
    def read_from_tfrecord(ex):
        """Parse one serialized SequenceExample into a dict of int64 tensors.

        Args:
            ex: a serialized ``tf.train.SequenceExample``.

        Returns:
            A dict with the keys "context", "question" and "answer", each
            holding the parsed int64 sequence tensor for that feature list.
        """
        sequence_features = {
            "context": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "question": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "answer": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }

        # Parse the example (returns a dictionary of tensors). The context
        # features are discarded because none are requested here.
        _, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            sequence_features=sequence_features
        )

        return {"context": sequence_parsed['context'], "question": sequence_parsed['question'],
                "answer": sequence_parsed['answer']}
    
  • 定义读取函数

    def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
        """Serialize one (context, question, answer) triple and write it.

        Args:
            context, question, answer: forwarded to ``sequence_to_tf_example``.
            tfrecord_file: not used by this function; kept in the signature
                for existing callers.
            writer: object whose ``write`` method receives the serialized bytes.
        """
        serialized = sequence_to_tf_example(context, question, answer).SerializeToString()
        writer.write(serialized)
    
    def write_data_to_tf_record(filename):
        """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

        The first CSV row is treated as a header and skipped. For every
        remaining row, columns 0, 1 and 2 are written as the context,
        question and answer of one record.

        Args:
            filename: path without extension, shared by the input CSV and the
                output TFRecords file.
        """
        file_csv = filename + '.csv'
        file_tfrecords = filename + '.tfrecords'
        # newline='' is what the csv module expects for files it reads.
        with open(file_csv, newline='') as csvfile:
            rows = csv.reader(csvfile, delimiter=',')
            next(rows, None)  # skip header; tolerate an empty file
            # The with-block closes the writer even if a row fails to convert.
            with tf.python_io.TFRecordWriter(file_tfrecords) as writer:
                for row in rows:
                    write_example_to_tfrecord(row[0], row[1], row[2], file_tfrecords, writer)
    
    def read_from_tfrecord(ex):
        """Parse one serialized SequenceExample into a dict of int64 tensors.

        Args:
            ex: a serialized ``tf.train.SequenceExample``.

        Returns:
            A dict with the keys "context", "question" and "answer", each
            holding the parsed int64 sequence tensor for that feature list.
        """
        sequence_features = {
            "context": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "question": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "answer": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }

        # Parse the example (returns a dictionary of tensors). The context
        # features are discarded because none are requested here.
        _, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            sequence_features=sequence_features
        )

        return {"context": sequence_parsed['context'], "question": sequence_parsed['question'],
                "answer": sequence_parsed['answer']}
    
  • 创建数据集

    def make_dataset(path, batch_size=128):
        """Make a TensorFlow dataset that is parsed, shuffled and batched.

        Args:
            path: path of the TFRecords file to read.
            batch_size: number of examples per padded batch.

        Returns:
            A dataset whose elements are dicts with the keys "context",
            "question" and "answer".
        """
        # Read a TFRecord file. This makes a dataset of raw serialized records.
        dataset = tf.data.TFRecordDataset([path])
        # Map the parse function over every record; the dataset now holds
        # dictionaries of tensors.
        dataset = dataset.map(read_from_tfrecord)
        # Shuffle individual examples before they are grouped into batches.
        dataset = dataset.shuffle(buffer_size=10000)

        # Specify the padding for each tensor separately: each one is a 1-D
        # sequence of unknown length.
        dataset = dataset.padded_batch(batch_size, padded_shapes={
            "context": tf.TensorShape([None]),
            "question": tf.TensorShape([None]),
            "answer": tf.TensorShape([None]),
        })

        return dataset
    

  • @萨拉瑟尔的问题?