-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdist_train.sh
106 lines (93 loc) · 2.54 KB
/
dist_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env bash
# Driver for MFM pretraining + supervised finetuning of ResNet-50 on ImageNet.
# Stage 1 (train_mfm.py) writes checkpoints to ./res_pretrain; stage 2
# (train_finetune.py) loads model_300.pth from there and writes to ./res_finetune.
#
# Fail fast: without -e, a crashed pretrain would still fall through and launch
# the finetune stage against a missing/stale checkpoint.
set -euo pipefail

# Cap per-process CPU threading; each GPU worker spawns its own thread pool.
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
mkdir -p res_pretrain res_finetune

# -- 4-GPU alternative (scale lr with the global batch size) --
# export CUDA_VISIBLE_DEVICES=0,1,6,7
# NGPUS=4
# LR=0.0006
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NGPUS=8
LR=0.0012
## bs=2048, lr=1.2e-3
# Stage 1 — MFM pretraining: ResNet-50, AdamW, 300 epochs, cosine LR schedule
# with 20-epoch linear warmup, grad-norm clipping at 3.0, AMP + DALI loaders.
# Expansions are quoted (SC2086); values are set by the config section above.
torchrun --nproc_per_node="$NGPUS" train_mfm.py \
  --data-path ./imagenet/ \
  --model resnet50 \
  --epochs 300 \
  --opt adamw \
  --batch-size 256 \
  --lr "$LR" \
  --wd 0.05 \
  --lr-scheduler cosineannealinglr \
  --lr-warmup-epochs 20 \
  --clip-grad-norm 3.0 \
  --lr-warmup-method linear \
  --output-dir ./res_pretrain \
  --amp \
  --use-dali \
  --train-crop-size 224
# To resume an interrupted run, add this flag to the command above:
# --resume res_pretrain/model_60.pth \
# Grace period so rendezvous/NCCL resources are released before stage 2 starts.
sleep 10
# Stage 2 — supervised finetune from the pretrained checkpoint: 100 epochs,
# AdamW, label smoothing + MixUp/CutMix + RandAugment, 160px train crops with
# 235px eval resize.
PORT=45276   # explicit rendezvous port so this job doesn't clash with others
# export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=4,5,6,7
# export CUDA_VISIBLE_DEVICES=0,1,6,7
# NGPUS=4
# LR=0.012
# BS=512
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NGPUS=8
LR=0.012
BS=256
## bs=2048, lr=1.2e-2
# Expansions quoted (SC2086); --weights expects stage 1 to have completed.
torchrun --nproc_per_node="$NGPUS" --master_port "$PORT" train_finetune.py \
  --data-path ./imagenet/ \
  --model resnet50 \
  --batch-size "$BS" \
  --epochs 100 \
  --opt adamw \
  --lr "$LR" \
  --wd 0.02 \
  --label-smoothing 0.1 \
  --mixup-alpha 0.1 \
  --cutmix-alpha 1.0 \
  --lr-scheduler cosineannealinglr \
  --lr-warmup-epochs 5 \
  --lr-warmup-method linear \
  --output-dir ./res_finetune \
  --auto-augment ra_6_10 \
  --weights ./res_pretrain/model_300.pth \
  --amp \
  --val-resize-size 235 \
  --train-crop-size 160
## useless
## NOTE(review): disabled sweep kept for reference — for each listed pretrain
## checkpoint it re-runs the finetune 3 times and copies each resulting
## model_100.pth to a distinct name for stability comparison.
## If ever re-enabled: quote the array expansion — for ckpt in "${ckpts[@]}" —
## (SC2068), and note `seq 1 1 3` could simply be the literal list 1 2 3.
# ckpts=(saved_model_19.pth saved_model_20.pth saved_model_21.pth saved_model_22.pth)
# for ckpt in ${ckpts[@]};
# do
# echo $ckpt
# for i in $(seq 1 1 3);
# do
# torchrun --nproc_per_node=$NGPUS --master_port $PORT train_finetune.py \
# --data-path ./imagenet/ \
# --model resnet50 \
# --batch-size $BS \
# --epochs 100 \
# --opt adamw \
# --lr $LR \
# --wd 0.02 \
# --label-smoothing 0.1 \
# --mixup-alpha 0.1 \
# --cutmix-alpha 1.0 \
# --lr-scheduler cosineannealinglr \
# --lr-warmup-epochs 5 \
# --lr-warmup-method linear \
# --output-dir ./res_finetune \
# --auto-augment ra_6_10 \
# --weights ./res_pretrain/$ckpt \
# --amp \
# --val-resize-size 235 \
# --train-crop-size 160
# cp -riv ./res_finetune/model_100.pth ./res_finetune/${ckpt}_$i
# sleep 60
# done
# done