sklearn 库常用用法
# sklearn 库常用用法
# train_test_split 方法
使用场景示例:我想要将 trainval 文件分成 train 文件和 valid 文件
# Split one labelled CSV into a training file and a validation file.
#
# NOTE(review): `cfg` is not defined in this snippet. It is expected to be a
# project config object exposing `path_raw_train_csv`, `size_valid`,
# `seed_random` and `path_save_trainval_csv` -- confirm against the project.
import os  # needed for os.path.join below; missing in the original snippet

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

# Load the raw table and give its two columns fixed names.
raw_train_df = pd.read_csv(cfg.path_raw_train_csv)
raw_train_df.columns = ['filename', 'label']

# Shuffle, then hold out `cfg.size_valid` of the rows for validation.
# A fixed random_state keeps the split reproducible between runs.
train_data, valid_data = train_test_split(
    raw_train_df,
    shuffle=True,
    test_size=cfg.size_valid,
    random_state=cfg.seed_random,
)

# Write both subsets next to each other, without the pandas index column.
train_data.to_csv(os.path.join(cfg.path_save_trainval_csv, 'train.csv'), index=False)
valid_data.to_csv(os.path.join(cfg.path_save_trainval_csv, 'valid.csv'), index=False)
print(f'train:{train_data.shape[0]}, valid:{valid_data.shape[0]}')
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
# StratifiedKFold 用法
使用示例:我想要将一个 trainval 文件分成多个 fold,每个 fold 各有一个 train 文件和一个 valid 文件
# Split one labelled CSV into K stratified folds, writing a train/valid CSV
# pair for every fold.
#
# NOTE(review): `cfg` is not defined in this snippet. It is expected to be a
# project config object exposing `path_raw_train_csv`, `num_KFold`,
# `seed_random` and `path_save_trainval_csv` -- confirm against the project.
import os  # needed for os.path.join below; missing in the original snippet

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Load the raw table and give its two columns fixed names.
raw_train_df = pd.read_csv(cfg.path_raw_train_csv)
raw_train_df.columns = ['filename', 'label']

# random_state only takes effect because shuffle=True.
skf = StratifiedKFold(
    n_splits=cfg.num_KFold,
    random_state=cfg.seed_random,
    shuffle=True,
)

# The original called skf.split(x, y) with undefined names. Stratification is
# driven by the second argument, so the label column is passed there.
samples = raw_train_df['filename']
labels = raw_train_df['label']

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(samples, labels)):
    # split() yields positional indices, hence .iloc rather than .loc.
    fold_train = raw_train_df.iloc[train_idx]
    fold_valid = raw_train_df.iloc[val_idx]
    fold_train.to_csv(os.path.join(cfg.path_save_trainval_csv, f'train_fold{fold_idx}.csv'), index=False)
    fold_valid.to_csv(os.path.join(cfg.path_save_trainval_csv, f'valid_fold{fold_idx}.csv'), index=False)
    print(f'train_fold{fold_idx}: {fold_train.shape[0]}, valid_fold{fold_idx}: {fold_valid.shape[0]}')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
上次更新: 2021/08/17, 18:07:06
- 02
- README 美化05-20
- 03
- 常见 Tricks 代码片段05-12