Pandas 具有多值稀疏分类数据的输入函数

Pandas 具有多值稀疏分类数据的输入函数,pandas,tensorflow,tensorflow-datasets,Pandas,Tensorflow,Tensorflow Datasets,给定一个数据帧 df = pd.DataFrame([ [1, ["a", "b"], 10], [2, ["b"], 20], ], columns= ["a", "b", "label"]) 如果列“b”是一个值列表,表示稀疏的分类数据,我如何创建一个输入函数以提供给训练中的估计器并进行预测 使用 pandas_input_fn 无法工作,因为b列: train_fn = tf.estimator.inputs.pandas_input_fn(x=df[["a", "

给定一个数据帧

# Example frame built column by column: "a" is numeric, "b" holds a
# variable-length list of category strings per row, "label" is the target.
df = pd.DataFrame({
    "a": [1, 2],
    "b": [["a", "b"], ["b"]],
    "label": [10, 20],
})
如果列“b”是一个值列表,表示稀疏的分类数据,我该如何创建一个输入函数,以便在训练和预测时把这些数据提供给估计器?

使用
pandas_input_fn
无法工作,因为b列:

# Feeding the frame directly; the question reports that this call fails
# because of the list-valued column "b".
train_fn = tf.estimator.inputs.pandas_input_fn(
    x=df[["a", "b"]],
    y=df["label"],
    shuffle=True,
)
--错误--

我可以创建一个
tfrecords
文件,使用
BytesList
为b列写入数据,并使用
TFRecordDataset
读取数据,然后在解析函数(parse function)中使用
VarLenFeature
解析b列,这样做很有效

但是,如何使用内存中的对象/数据帧和/或输入函数(input_fn)将这些数据输入估计器呢?

以下是我的全部代码:

import tensorflow as tf
import pandas as pd

from tensorflow.estimator.inputs import pandas_input_fn
from tensorflow.estimator import DNNRegressor
from tensorflow.feature_column import numeric_column, indicator_column, categorical_column_with_vocabulary_list
from tensorflow.train import Feature, Features, BytesList, FloatList, Example
from tensorflow.python_io import TFRecordWriter

# Training data built column by column: "a" is numeric, "b" holds a
# variable-length list of category strings per row, "label" is the target.
df = pd.DataFrame({
    "a": [1, 2],
    "b": [["a", "b"], ["b"]],
    "label": [10, 20],
})


# Serialize every DataFrame row as a tf.train.Example and append it to
# "test.tfrecord".
#   * "a" and "label" are stored as single-element float lists.
#   * "b" is stored as a bytes list with one UTF-8 encoded entry per category,
#     so each row may carry a different number of values.
# The writer is used as a context manager so the file is closed even if
# building or writing an example raises.
with TFRecordWriter("test.tfrecord") as writer:
    for _, row in df.iterrows():
        dict_feature = {}
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for name, value in row.items():
            if name in ("a", "label"):
                dict_feature[name] = Feature(float_list=FloatList(value=[value]))
            elif name == "b":
                encoded = [m.encode('utf-8') for m in value]
                dict_feature[name] = Feature(bytes_list=BytesList(value=encoded))

        example = Example(features=Features(feature=dict_feature))
        writer.write(example.SerializeToString())


def parse_tfrecords(example_proto):
    """Decode one serialized Example into a ``(features, label)`` pair.

    Args:
        example_proto: A serialized ``Example`` record, as passed to
            ``tf.parse_single_example``.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict holding
        the parsed "a" and "b" entries and ``label`` is the parsed "label"
        entry.
    """
    # "a" and "label" are fixed-length scalar float32 features; "b" is a
    # variable-length string feature.
    schema = {
        "a": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "b": tf.VarLenFeature(dtype=tf.string),
        "label": tf.FixedLenFeature(shape=[], dtype=tf.float32),
    }

    parsed = tf.parse_single_example(example_proto, schema)
    # Split the model inputs from the regression target.
    inputs = {"a": parsed["a"], "b": parsed["b"]}
    target = parsed["label"]
    return inputs, target

def tf_record_input_fn(filenames_pattern):
    """Build a zero-argument input function that reads TFRecord data.

    Args:
        filenames_pattern: Value forwarded as ``filenames`` to
            ``tf.data.TFRecordDataset``.

    Returns:
        A callable taking no arguments that constructs and returns the
        dataset when invoked.
    """

    def _input_fn():
        # Pipeline order: shuffle the raw records (buffer of 128), parse each
        # one with parse_tfrecords, then group into batches of 128.
        return (
            tf.data.TFRecordDataset(filenames=filenames_pattern)
            .shuffle(buffer_size=128)
            .map(map_func=parse_tfrecords)
            .batch(batch_size=128)
        )

    return _input_fn


# Model inputs: "a" is a plain numeric column; "b" is a categorical column
# over the vocabulary ['a', 'b'], wrapped in an indicator column so it can be
# passed to the DNN regressor.
b_categorical = categorical_column_with_vocabulary_list(
    "b", vocabulary_list=['a', 'b']
)
feature_columns = [numeric_column("a"), indicator_column(b_categorical)]

estimator = DNNRegressor(feature_columns=feature_columns, hidden_units=[1])

# Train from the TFRecord file written above.
# Next line does not work
# train_input_fn = tf.estimator.inputs.pandas_input_fn(x=df[["a", "b"]], y=df.label, shuffle=True)
train_input_fn = tf_record_input_fn("test.tfrecord")
estimator.train(train_input_fn)

由于我缺乏使用
tensorflow.estimator
API的经验,因此我没有一个完整的解决方案来解决您的查询,但是您是否可以改为重塑数据帧?如果列
b
列表中的值本质上是分类的,您可以尝试对它们进行独热编码(one-hot encoding),并在该过程中向
df
添加更多列?这样,您的 df 就能被所有估计器处理了。

import tensorflow as tf
import pandas as pd

from tensorflow.estimator.inputs import pandas_input_fn
from tensorflow.estimator import DNNRegressor
from tensorflow.feature_column import numeric_column, indicator_column, categorical_column_with_vocabulary_list
from tensorflow.train import Feature, Features, BytesList, FloatList, Example
from tensorflow.python_io import TFRecordWriter

# Training data built column by column: "a" is numeric, "b" holds a
# variable-length list of category strings per row, "label" is the target.
df = pd.DataFrame({
    "a": [1, 2],
    "b": [["a", "b"], ["b"]],
    "label": [10, 20],
})


# Serialize every DataFrame row as a tf.train.Example and append it to
# "test.tfrecord".
#   * "a" and "label" are stored as single-element float lists.
#   * "b" is stored as a bytes list with one UTF-8 encoded entry per category,
#     so each row may carry a different number of values.
# The writer is used as a context manager so the file is closed even if
# building or writing an example raises.
with TFRecordWriter("test.tfrecord") as writer:
    for _, row in df.iterrows():
        dict_feature = {}
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for name, value in row.items():
            if name in ("a", "label"):
                dict_feature[name] = Feature(float_list=FloatList(value=[value]))
            elif name == "b":
                encoded = [m.encode('utf-8') for m in value]
                dict_feature[name] = Feature(bytes_list=BytesList(value=encoded))

        example = Example(features=Features(feature=dict_feature))
        writer.write(example.SerializeToString())


def parse_tfrecords(example_proto):
    """Decode one serialized Example into a ``(features, label)`` pair.

    Args:
        example_proto: A serialized ``Example`` record, as passed to
            ``tf.parse_single_example``.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict holding
        the parsed "a" and "b" entries and ``label`` is the parsed "label"
        entry.
    """
    # "a" and "label" are fixed-length scalar float32 features; "b" is a
    # variable-length string feature.
    schema = {
        "a": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "b": tf.VarLenFeature(dtype=tf.string),
        "label": tf.FixedLenFeature(shape=[], dtype=tf.float32),
    }

    parsed = tf.parse_single_example(example_proto, schema)
    # Split the model inputs from the regression target.
    inputs = {"a": parsed["a"], "b": parsed["b"]}
    target = parsed["label"]
    return inputs, target

def tf_record_input_fn(filenames_pattern):
    """Build a zero-argument input function that reads TFRecord data.

    Args:
        filenames_pattern: Value forwarded as ``filenames`` to
            ``tf.data.TFRecordDataset``.

    Returns:
        A callable taking no arguments that constructs and returns the
        dataset when invoked.
    """

    def _input_fn():
        # Pipeline order: shuffle the raw records (buffer of 128), parse each
        # one with parse_tfrecords, then group into batches of 128.
        return (
            tf.data.TFRecordDataset(filenames=filenames_pattern)
            .shuffle(buffer_size=128)
            .map(map_func=parse_tfrecords)
            .batch(batch_size=128)
        )

    return _input_fn


# Model inputs: "a" is a plain numeric column; "b" is a categorical column
# over the vocabulary ['a', 'b'], wrapped in an indicator column so it can be
# passed to the DNN regressor.
b_categorical = categorical_column_with_vocabulary_list(
    "b", vocabulary_list=['a', 'b']
)
feature_columns = [numeric_column("a"), indicator_column(b_categorical)]

estimator = DNNRegressor(feature_columns=feature_columns, hidden_units=[1])

# Train from the TFRecord file written above.
# Next line does not work
# train_input_fn = tf.estimator.inputs.pandas_input_fn(x=df[["a", "b"]], y=df.label, shuffle=True)
train_input_fn = tf_record_input_fn("test.tfrecord")
estimator.train(train_input_fn)