Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/tensorflow/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 应该在划分数据之前还是之后构建 dok 矩阵(独热编码)?_Python_Tensorflow_Matrix_One Hot Encoding - Fatal编程技术网

Python 应该在划分数据之前还是之后构建 dok 矩阵(独热编码)?

Python 在分割之前或之后使用dok矩阵(一个热编码)?,python,tensorflow,matrix,one-hot-encoding,Python,Tensorflow,Matrix,One Hot Encoding,我的数据框包括一次购买。买方(买方id)可以购买多个项目(项目id)。 我必须在一个热编码矩阵之前还是之后分割数据?虽然我已经研究过了,但我不确定 我看了看 数据帧: d = {'purchaseid': [0, 0, 0, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 9, 9, 9, 9], 'itemid': [ 3, 8, 2, 10, 3, 10, 4, 12, 3, 12, 3, 4, 8, 6, 3, 0, 5

我的数据框由购买记录组成。每次购买(purchaseid)可以包含多个商品(itemid)。 我应该在构建独热编码(one-hot encoding)矩阵之前还是之后划分数据?虽然我已经查过资料,但仍然不确定。

我看了看

数据框(DataFrame):

# Toy purchase log: one row per (purchase, item) pair, 25 rows in total.
d = {
    'purchaseid': [
        0, 0, 0, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5,
        5, 5, 6, 6, 6, 7, 7, 8, 9, 9, 9, 9,
    ],
    'itemid': [
        3, 8, 2, 10, 3, 10, 4, 12, 3, 12, 3, 4, 8,
        6, 3, 0, 5, 12, 9, 9, 13, 1, 7, 11, 11,
    ],
}
df = pd.DataFrame(d)

   purchaseid  itemid
0           0       3
1           0       8
2           0       2
3           1      10
4           2       3
# Used by splitter() as the percentage of distinct purchase ids to hold out.
PERCENTAGE_SPLIT = 20
# NOTE(review): not referenced anywhere in the visible code; presumably the
# number of negative samples drawn per positive example -- confirm.
NUM_NEGATIVES = 4
def splitter(df, percentage=None, seed=None):
  """Split a purchase frame into remaining and held-out rows by purchase id.

  A random subset of the distinct ``purchaseid`` values is drawn, and every
  row whose ``purchaseid`` is in that subset goes to the held-out frame, so
  the rows of one purchase are never divided between the two frames.

  Args:
    df: DataFrame with a ``purchaseid`` column.
    percentage: Percent (0-100) of the distinct purchase ids to hold out.
      When omitted, the module-level ``PERCENTAGE_SPLIT`` is read at call
      time.
    seed: Optional seed for a private ``random.Random`` instance, giving a
      reproducible split. When omitted, the global ``random`` state is used.

  Returns:
    A two-element list ``[df_reduced, df_]``: the rows of the purchases that
    were not drawn, followed by the rows of the purchases that were drawn.

  Raises:
    ValueError: If ``percentage`` is outside the range 0-100.
  """
  if percentage is None:
    percentage = PERCENTAGE_SPLIT
  if not 0 <= percentage <= 100:
    raise ValueError(
        'percentage must be between 0 and 100, got %r' % (percentage,))

  sum_purchase = df['purchaseid'].nunique()
  amount = round((sum_purchase / 100) * percentage)

  # A private generator keeps a seeded split from disturbing global state.
  sampler = random if seed is None else random.Random(seed)
  random_list = sampler.sample(df['purchaseid'].unique().tolist(), amount)

  # One membership mask serves both halves of the partition.
  is_held_out = df['purchaseid'].isin(random_list)
  return [df.loc[~is_held_out], df.loc[is_held_out]]

def generate_matrix(df_main, dataframe, name):
  """Build a sparse purchase-by-item indicator matrix in DOK format.

  Each (purchaseid, itemid) pair in ``dataframe`` sets the cell at row
  ``purchaseid`` and column ``itemid`` to 1.0. The matrix dimensions come
  from ``df_main``, so matrices built from different subsets of the same
  frame share one shape.

  Args:
    df_main: Full DataFrame with ``purchaseid`` and ``itemid`` columns; it
      determines the matrix shape.
    dataframe: DataFrame (typically a subset of ``df_main``) whose pairs
      are written into the matrix.
    name: Unused; kept so existing callers keep working.

  Returns:
    A ``dok_matrix`` of dtype float32 with at least ``df_main.shape[0]``
    rows and at least as many columns as ``df_main`` has distinct item ids.
  """
  n_rows = df_main.shape[0]
  n_cols = len(df_main['itemid'].unique())
  if n_rows:
    # Ids are used directly as coordinates, so each axis must reach at least
    # max id + 1. Row count and distinct-item count alone are too small when
    # ids are sparse or do not start at zero, which made the assignment
    # below raise IndexError.
    n_rows = max(n_rows, int(df_main['purchaseid'].max()) + 1)
    n_cols = max(n_cols, int(df_main['itemid'].max()) + 1)

  mat = sp.dok_matrix((n_rows, n_cols), dtype=np.float32)
  for purchaseid, itemid in zip(dataframe['purchaseid'], dataframe['itemid']):
    mat[purchaseid, itemid] = 1.0

  return mat

# Partition the purchases, then take an independent deep copy of each part.
dfs = splitter(df)
df_tr, df_val = (part.copy(deep=True) for part in dfs)

# Encode both parts against the full frame so the two matrices share a shape.
train_mat, val_mat = (
    generate_matrix(df, part, label)
    for part, label in ((df_tr, 'train'), (df_val, 'val'))
)
代码

# Toy purchase log: one row per (purchase, item) pair, 25 rows in total.
d = {
    'purchaseid': [
        0, 0, 0, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5,
        5, 5, 6, 6, 6, 7, 7, 8, 9, 9, 9, 9,
    ],
    'itemid': [
        3, 8, 2, 10, 3, 10, 4, 12, 3, 12, 3, 4, 8,
        6, 3, 0, 5, 12, 9, 9, 13, 1, 7, 11, 11,
    ],
}
df = pd.DataFrame(d)

   purchaseid  itemid
0           0       3
1           0       8
2           0       2
3           1      10
4           2       3
# Used by splitter() as the percentage of distinct purchase ids to hold out.
PERCENTAGE_SPLIT = 20
# NOTE(review): not referenced anywhere in the visible code; presumably the
# number of negative samples drawn per positive example -- confirm.
NUM_NEGATIVES = 4
def splitter(df, percentage=None, seed=None):
  """Split a purchase frame into remaining and held-out rows by purchase id.

  A random subset of the distinct ``purchaseid`` values is drawn, and every
  row whose ``purchaseid`` is in that subset goes to the held-out frame, so
  the rows of one purchase are never divided between the two frames.

  Args:
    df: DataFrame with a ``purchaseid`` column.
    percentage: Percent (0-100) of the distinct purchase ids to hold out.
      When omitted, the module-level ``PERCENTAGE_SPLIT`` is read at call
      time.
    seed: Optional seed for a private ``random.Random`` instance, giving a
      reproducible split. When omitted, the global ``random`` state is used.

  Returns:
    A two-element list ``[df_reduced, df_]``: the rows of the purchases that
    were not drawn, followed by the rows of the purchases that were drawn.

  Raises:
    ValueError: If ``percentage`` is outside the range 0-100.
  """
  if percentage is None:
    percentage = PERCENTAGE_SPLIT
  if not 0 <= percentage <= 100:
    raise ValueError(
        'percentage must be between 0 and 100, got %r' % (percentage,))

  sum_purchase = df['purchaseid'].nunique()
  amount = round((sum_purchase / 100) * percentage)

  # A private generator keeps a seeded split from disturbing global state.
  sampler = random if seed is None else random.Random(seed)
  random_list = sampler.sample(df['purchaseid'].unique().tolist(), amount)

  # One membership mask serves both halves of the partition.
  is_held_out = df['purchaseid'].isin(random_list)
  return [df.loc[~is_held_out], df.loc[is_held_out]]

def generate_matrix(df_main, dataframe, name):
  """Build a sparse purchase-by-item indicator matrix in DOK format.

  Each (purchaseid, itemid) pair in ``dataframe`` sets the cell at row
  ``purchaseid`` and column ``itemid`` to 1.0. The matrix dimensions come
  from ``df_main``, so matrices built from different subsets of the same
  frame share one shape.

  Args:
    df_main: Full DataFrame with ``purchaseid`` and ``itemid`` columns; it
      determines the matrix shape.
    dataframe: DataFrame (typically a subset of ``df_main``) whose pairs
      are written into the matrix.
    name: Unused; kept so existing callers keep working.

  Returns:
    A ``dok_matrix`` of dtype float32 with at least ``df_main.shape[0]``
    rows and at least as many columns as ``df_main`` has distinct item ids.
  """
  n_rows = df_main.shape[0]
  n_cols = len(df_main['itemid'].unique())
  if n_rows:
    # Ids are used directly as coordinates, so each axis must reach at least
    # max id + 1. Row count and distinct-item count alone are too small when
    # ids are sparse or do not start at zero, which made the assignment
    # below raise IndexError.
    n_rows = max(n_rows, int(df_main['purchaseid'].max()) + 1)
    n_cols = max(n_cols, int(df_main['itemid'].max()) + 1)

  mat = sp.dok_matrix((n_rows, n_cols), dtype=np.float32)
  for purchaseid, itemid in zip(dataframe['purchaseid'], dataframe['itemid']):
    mat[purchaseid, itemid] = 1.0

  return mat

# Partition the purchases, then take an independent deep copy of each part.
dfs = splitter(df)
df_tr, df_val = (part.copy(deep=True) for part in dfs)

# Encode both parts against the full frame so the two matrices share a shape.
train_mat, val_mat = (
    generate_matrix(df, part, label)
    for part, label in ((df_tr, 'train'), (df_val, 'val'))
)