Python 如何将kfold.split()应用于列表字典?
标签:python, keras, cross-validation
我想通过交叉验证来训练Keras模型,但我的数据是一个由列表组成的字典(dict)。我想要10折交叉验证,所以希望每个验证步骤取10%的dict键作为验证子集,下一步再换成另外10%的键(使用shuffle)。例如,对于第一个验证步骤:
# Desired split for the first validation step: nine keys are used for
# training and the remaining key is held out for validation.
pairs_train = {
    '0': list1,
    '1': list2,
    '2': list3,
    '3': list4,
    '4': list5,
    '5': list6,
    '6': list7,
    '7': list8,
    '8': list9,
}
pairs_val = {
    '9': list10,
}
以下是我的功能:
def crossValidation(self, k_folds=10):
    """Fit and validate ``self.model`` once per cross-validation fold.

    ``pairs`` and ``kfold`` are free names resolved from the enclosing
    scope; this function does not define them.  ``pairs`` is wrapped in a
    ``pd.Series`` so the index arrays yielded by ``kfold.split`` can pick
    the entries belonging to each fold.

    Args:
        k_folds: Total shown in the progress banner.  The number of
            iterations is whatever ``kfold.split`` yields.

    Returns:
        None.  The last-epoch ``mae``/``val_mae``/``loss``/``val_loss``
        values are appended to local lists only.
    """
    cv_accuracy_train = []
    cv_accuracy_val = []
    cv_loss_train = []
    cv_loss_val = []

    s = pd.Series(pairs)

    for idx, (train_idx, val_idx) in enumerate(kfold.split(s)):
        print("=========================================")
        print("====== K Fold Validation step => %d/%d =======" % (idx, k_folds))
        print("=========================================")

        # kfold.split yields integer *positions*, while the Series is indexed
        # by the dict's keys.  Plain ``s[train_idx]`` is treated as a label
        # lookup and raises KeyError, so the rows are selected with .iloc.
        train_gen = DataGenerator(pairs=s.iloc[train_idx], batch_size=self.param_grid['batch_size'],
                                  nr_files=len(self.Data.all_files), nr_tests=len(self.Data.all_tests),
                                  negative_ratio=self.param_grid['negative_ratio'])
        val_gen = DataGenerator(pairs=s.iloc[val_idx], batch_size=self.param_grid['batch_size'],
                                nr_files=len(self.Data.all_files), nr_tests=len(self.Data.all_tests),
                                negative_ratio=self.param_grid['negative_ratio'])

        # Train
        h = self.model.fit(train_gen,
                           validation_data=val_gen,
                           epochs=self.param_grid['nb_epochs'],
                           verbose=2)

        # Keep only the value reached in the final epoch of this fold.
        cv_accuracy_train.append(np.array(h.history['mae'])[-1])
        cv_accuracy_val.append(np.array(h.history['val_mae'])[-1])
        cv_loss_train.append(np.array(h.history['loss'])[-1])
        cv_loss_val.append(np.array(h.history['val_loss'])[-1])
    # NOTE(review): the four metric lists are neither returned nor stored on
    # self in the visible code - confirm whether callers need them.
回溯:
File "/Users/joaolousada/Documents/5ºAno/Master-Thesis/main/Prioritizer/Prioritizer.py", line 173, in crossValidation
train_gen = DataGenerator(pairs=s[train_idx], batch_size=self.param_grid['batch_size'],
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/series.py", line 908, in __getitem__
return self._get_with(key)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/series.py", line 943, in _get_with
return self.loc[key]
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 879, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1099, in _getitem_axis
return self._getitem_iterable(key, axis=axis)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1037, in _getitem_iterable
keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1254, in _get_listlike_indexer
self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1298, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n ...\n 3257, 3258, 3261, 3262, 3263, 3265, 3266, 3267, 3268, 3269],\n dtype='int64', length=2943)] are in the [index]"
假设有一个值为列表的dict,比如说:
# Toy input: ten string keys, each mapped to a three-element list.
pairs = {
    '0': [1, 2, 3],
    '1': [1, 2, 3],
    '2': [4, 6, 8],
    '3': [2, 1, 9],
    '4': [9, 7, 8],
    '5': [4, 6, 8],
    '6': [9, 7, 8],
    '7': [9, 7, 8],
    '8': [1, 2, 3],
    '9': [4, 6, 8],
}
以下函数将返回索引以按索引拆分dict
def kfold_split(pairs: dict, perc: float, shuffle: bool) -> list:
    """Split the keys of *pairs* into train/validation key lists per fold.

    Each fold holds out ``int(len(pairs) * perc)`` keys for validation and
    uses every other key for training.  ``len(pairs) // fold_size`` folds
    are produced, and their validation sets never overlap.  Folds are
    sliced from the end of the key sequence towards the front, so when the
    key count is not a multiple of the fold size the left-over keys at the
    front are used for training only.

    Args:
        pairs: Mapping whose keys are to be split.
        perc: Fraction of the keys held out in each fold (e.g. ``0.2``).
        shuffle: If true, the keys are randomly permuted once (via
            ``np.random``) before being sliced into folds; otherwise the
            dict's own key order is used.

    Returns:
        A list of ``(train_keys, validation_keys)`` tuples, one per fold.
        Both elements are lists of the original key objects.

    Raises:
        ValueError: If *perc* is so small (or *pairs* so short) that a fold
            would hold no keys.
    """
    keys = list(pairs)
    n_keys = len(keys)
    fold_size = int(n_keys * perc)
    if fold_size < 1:
        raise ValueError(
            "perc=%r yields an empty validation fold for %d keys" % (perc, n_keys)
        )

    if shuffle:
        # Permute positions rather than the keys themselves so the keys keep
        # their original Python type.  Shuffling once and slicing afterwards
        # guarantees that no key is drawn twice and that folds are disjoint.
        keys = [keys[i] for i in np.random.permutation(n_keys)]

    indices = []
    for fold in range(n_keys // fold_size):
        stop = n_keys - fold * fold_size
        fold_keys = keys[stop - fold_size:stop]
        held_out = set(fold_keys)
        # List comprehension (not set difference) keeps a deterministic order.
        other_keys = [k for k in keys if k not in held_out]
        indices.append((other_keys, fold_keys))
    return indices
您可以检索随机索引
kfold_split(pairs, perc=.2, shuffle=True)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['9', '8']),
(['6', '1', '9', '5', '4', '7', '0', '3'], ['8', '2']),
(['2', '1', '8', '9', '5', '4', '7', '3'], ['6', '0']),
(['2', '8', '9', '5', '4', '7', '0', '3'], ['1', '6']),
(['6', '2', '8', '5', '4', '7', '0', '3'], ['9', '1'])]
kfold_split(pairs, perc=.2, shuffle=False)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['8', '9']),
(['2', '1', '8', '9', '5', '4', '0', '3'], ['6', '7']),
(['6', '2', '1', '8', '9', '7', '0', '3'], ['4', '5']),
(['6', '1', '8', '9', '5', '4', '7', '0'], ['2', '3']),
(['6', '2', '8', '9', '5', '4', '7', '3'], ['0', '1'])]
或者按原有顺序划分的索引
kfold_split(pairs, perc=.2, shuffle=True)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['9', '8']),
(['6', '1', '9', '5', '4', '7', '0', '3'], ['8', '2']),
(['2', '1', '8', '9', '5', '4', '7', '3'], ['6', '0']),
(['2', '8', '9', '5', '4', '7', '0', '3'], ['1', '6']),
(['6', '2', '8', '5', '4', '7', '0', '3'], ['9', '1'])]
kfold_split(pairs, perc=.2, shuffle=False)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['8', '9']),
(['2', '1', '8', '9', '5', '4', '0', '3'], ['6', '7']),
(['6', '2', '1', '8', '9', '7', '0', '3'], ['4', '5']),
(['6', '1', '8', '9', '5', '4', '7', '0'], ['2', '3']),
(['6', '2', '8', '9', '5', '4', '7', '3'], ['0', '1'])]
然后,您可以根据这些索引筛选字典,如下所示
# `result` is presumably the list of (train_keys, test_keys) tuples returned
# by kfold_split(pairs, ...) - it is not defined in this snippet.
for train_indices, test_indices in result:
    # Filter dict by indices
    pair_test = {k: v for k, v in pairs.items() if k in test_indices}
    # Train data: keep exactly the keys listed for training.  (Testing
    # `k not in train_indices` would select the held-out keys instead and
    # make the training dict identical to the test dict.)
    pair_train = {k: v for k, v in pairs.items() if k in train_indices}
    # Some other stuff here
我的解决办法是:把所有dict键转成np.array,并把它传给kf.split()。然后,通过得到的索引访问我需要的dict键。不确定是否有更优化/更pythonic的解决方案,但它工作正常:
def crossValidation(self, k_folds=10):
    """Fit ``self.model`` on every fold of a K-fold split of ``self.Data.pairs``.

    The keys of ``self.Data.pairs`` are placed in a NumPy array and split
    with ``KFold(n_splits=k_folds, shuffle=True)``.  For each fold the
    selected keys are turned back into two sub-dicts (train / validation),
    each wrapped in a ``DataGenerator`` and passed to ``self.model.fit``.

    Args:
        k_folds: Number of splits requested from ``KFold``; also shown in
            the progress banner.

    Returns:
        None.  The last-epoch ``accuracy``/``val_accuracy``/``loss``/
        ``val_loss`` values are appended to local lists only.
    """
    banner = "========================================="

    cv_accuracy_train, cv_accuracy_val = [], []
    cv_loss_train, cv_loss_val = [], []
    # (history key, destination list), in the order the values are recorded.
    trackers = (
        ('accuracy', cv_accuracy_train),
        ('val_accuracy', cv_accuracy_val),
        ('loss', cv_loss_train),
        ('val_loss', cv_loss_val),
    )

    def build_generator(subset):
        # Both generators share every setting except the pairs they serve.
        return DataGenerator(pairs=subset, batch_size=self.param_grid['batch_size'],
                             nr_files=len(self.Data.all_files), nr_tests=len(self.Data.all_tests),
                             negative_ratio=self.param_grid['negative_ratio'])

    key_array = np.array(list(self.Data.pairs.keys()))
    splitter = KFold(n_splits=k_folds, shuffle=True)

    for step, (train_idx, val_idx) in enumerate(splitter.split(key_array)):
        print(banner)
        print("====== K Fold Validation step => %d/%d =======" % (step, k_folds))
        print(banner)

        # Map the positional fold indices back to dict keys, then to entries.
        pairs_train = {name: self.Data.pairs[name] for name in key_array[train_idx]}
        pairs_val = {name: self.Data.pairs[name] for name in key_array[val_idx]}

        train_gen = build_generator(pairs_train)
        val_gen = build_generator(pairs_val)

        # Train
        fit_result = self.model.fit(train_gen,
                                    validation_data=val_gen,
                                    epochs=self.param_grid['nb_epochs'],
                                    verbose=2)

        # Record the value reached in the final epoch for each metric.
        for metric, sink in trackers:
            sink.append(np.array(fit_result.history[metric])[-1])
很好的答案,但不是我想要的。我想把整个dict按80/20%拆分,然后在这5份数据之间轮换。
@johnnylousas,我认为当您不想加载sklearn库时,这是一种很好的方法。很抱歉误解了你的问题。
是的,这是正确的!谢谢你的贡献。