-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathrecover_burst.sh
33 lines (22 loc) · 1.36 KB
/
recover_burst.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# recover_burst.sh -> a helper shell script written by zac, to improve participant QoL :)
#
# Usage: bash recover_burst.sh <experiment_id>
#
# Looks in /root/exports/<experiment_id>/outputs for the highest-versioned
# CHK<number> checkpoint directory, prints the last line of rank_0.txt so the
# participant can confirm it is the right run, then (after a short pause that
# allows CTRL+C) copies that checkpoint's checkpoint.pt into
# /root/chess-hackathon/recover and the rank 0 log to
# /root/chess-hackathon/latest_logs_rank0.txt.
#
# Exit status: 0 when the checkpoint was copied; 1 on bad usage, when no
# checkpoint is found, or when the recover folder or checkpoint copy fails.

experiment_id=${1-}
if [ "$#" -ne 1 ] || [ -z "$experiment_id" ]; then
  # Just a simple sanity check
  echo "Usage: bash $0 <experiment_id>" >&2
  exit 1
fi

OUT_FOLDER="/root/exports/${experiment_id}/outputs"
RECOVER_FOLDER=/root/chess-hackathon/recover
LOGS_COPY=/root/chess-hackathon/latest_logs_rank0.txt
CANCEL_DELAY_SECS=2 # Keep the announced delay and the actual sleep in sync

# Pick the newest checkpoint: glob for directories named CHK<digit>... (rather
# than parsing `ls`), then version-sort so that e.g. CHK10 ranks above CHK9.
# If nothing matches, the glob stays literal, fails the -d test, and the
# result is empty.
latest_chk_id=$(
  for chk_dir in "$OUT_FOLDER"/CHK[0-9]*/; do
    [ -d "$chk_dir" ] || continue
    chk_dir=${chk_dir%/}
    printf '%s\n' "${chk_dir##*/}"
  done | sort -V | tail -n 1
)
if [ -z "$latest_chk_id" ]; then
  echo "Error: could not find checkpoint for experiment ${experiment_id}! Check the ID and try again." >&2
  exit 1
fi

checkpoint_file="${OUT_FOLDER}/${latest_chk_id}/checkpoint.pt"

echo "Found latest checkpoint: ${latest_chk_id}. Last training log was:"
tail -n 1 "${OUT_FOLDER}/rank_0.txt" # To help participants easily confirm it's the right checkpoint

echo "Copying checkpoint.pt to ${RECOVER_FOLDER} in ${CANCEL_DELAY_SECS} seconds (CTRL+C to cancel; will override if exists)..."
sleep "$CANCEL_DELAY_SECS" # Give a chance to cancel, e.g if the logs were wrong

if ! mkdir -p "$RECOVER_FOLDER"; then
  echo "Error: could not create ${RECOVER_FOLDER}!" >&2
  exit 1
fi

# Copying the logs is best-effort: a failure is reported but does not abort.
cp "${OUT_FOLDER}/rank_0.txt" "$LOGS_COPY" || echo "Error copying logs?" >&2 # Copy logs too :)

# The trailing slash makes cp fail instead of creating a regular file named
# "recover" should the destination somehow not be a directory.
if cp "$checkpoint_file" "${RECOVER_FOLDER}/"; then
  echo "Success! Now, run: 'isc train' as before, to resume burst!"
  exit 0
fi

# If we're here, the copying broke :(
echo "Error copying checkpoint from ${checkpoint_file} to ${RECOVER_FOLDER}!" >&2
exit 1