From 0b27c4e77acd1cdd42dab4bd7d2c42d431e922f4 Mon Sep 17 00:00:00 2001 From: "Illuminatus [CCIO]" Date: Mon, 6 Jan 2025 10:29:57 -0800 Subject: [PATCH 01/14] Include healthcheck logic for helper scripts running as sidecars --- files/docker/node/addons/healthcheck.sh | 155 +++++++++++++++++++----- 1 file changed, 122 insertions(+), 33 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 4de0837b1..7bd9a7127 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -1,43 +1,132 @@ -#!/usr/bin/env bash +#!/bin/bash +# shellcheck source=/dev/null +# +###################################### +# User Variables - Change as desired # +# Common variables set in env file # +###################################### -source /opt/cardano/cnode/scripts/env +ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}" # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set +CPU_THRESHOLD="${CPU_THRESHOLD:-80}" # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80% +RETRIES="${RETRIES:-20}" # The number of retries if tip is not incrementing, or cpu usage is over the threshold -CCLI=$(which cardano-cli) +###################################### +# Do NOT modify code below # +###################################### -if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi +if [[ "${ENTRYPOINT_PROCESS}" == "cnode.sh" ]]; then + source /opt/cardano/cnode/scripts/env +else + # Source in offline mode for sidecar helper scripts + source /opt/cardano/cnode/scripts/env offline +fi -# For querying tip, the seperation of testnet-magic vs mainnet as argument is optional +# Define a mapping of scripts to their corresponding binaries, when defined check the binary is running and its CPU usage instead of the wrapper script. 
+declare -A SCRIPT_TO_BINARY_MAP +SCRIPT_TO_BINARY_MAP=( + ["cncli.sh"]="cncli" + ["mithril-signer.sh"]="mithril-signer" +) -FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) +# Define scripts which may sleep between executions of the binary. +SLEEPING_SCRIPTS=("cncli.sh") -if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then - # when KOIOS is not enabled or KOIOS_API is unset, use default behavior - sleep 60; - SECOND=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) - if [[ "$FIRST" -ge "$SECOND" ]]; then - echo "there is a problem" - exit 1 +# Function to check if a process is running and its CPU usage +check_process() { + local process_name="$1" + local cpu_threshold="$2" + + for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do + # Check CPU usage of the process + CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}') + + # Check if CPU usage exceeds threshold + if (( CPU_USAGE > cpu_threshold )); then + echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)" + sleep 3 # Retry after a pause + continue + fi + + # Check if ENTRYPOINT_PROCESS is in the SLEEPING_SCRIPTS array + if [[ " ${SLEEPING_SCRIPTS[@]} " =~ " ${ENTRYPOINT_PROCESS} " ]]; then + # If the process is in SLEEPING_SCRIPTS, check if either the process or 'sleep' is running + if ! pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then + echo "Error: '$process_name' is not running, and no 'sleep' process found" + return 3 # Return 3 if the process is not running and sleep is not found + fi + else + # If the process is not in SLEEPING_SCRIPTS, only check for the specific process + if ! 
pgrep -x "$process_name" > /dev/null; then + echo "Error: '$process_name' is not running" + return 3 # Return 3 if the process is not running + fi + fi + + echo "We're healthy - $process_name" + return 0 # Return 0 if the process is healthy + done + + echo "Max retries reached for $process_name" + return 1 # Return 1 if retries are exhausted +} + + +# Function to check if the node is running and is on tip +check_node() { + CCLI=$(which cardano-cli) + + # Adjust NETWORK variable if needed + if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi + + FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) + + if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then + sleep 60 + SECOND=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) + if [[ "$FIRST" -ge "$SECOND" ]]; then + echo "There is a problem" + exit 1 + else + echo "We're healthy - node: $FIRST -> node: $SECOND" + fi else - echo "we're healthy - node: $FIRST -> node: $SECOND" + CURL=$(which curl) + JQ=$(which jq) + URL="${KOIOS_API}/tip" + SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no') + + for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do + if [[ "$FIRST" -eq "$SECOND" ]]; then + echo "We're healthy - node: $FIRST == koios: $SECOND" + exit 0 + elif [[ "$FIRST" -lt "$SECOND" ]]; then + sleep 3 + FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) + elif [[ "$FIRST" -gt "$SECOND" ]]; then + sleep 3 + SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no') + fi + done + echo "There is a problem" + exit 1 fi +} + +# MAIN +if [[ "$ENTRYPOINT_PROCESS" == "cnode.sh" ]]; then + # The original health check logic for "cnode.sh" + check_node else - # else leverage koios and only require the node is on tip - CURL=$(which curl) - JQ=$(which jq) - URL="${KOIOS_API}/tip" - SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no') - for (( CHECK=1; CHECK<=20; CHECK++ )); do - if [[ "$FIRST" -eq "$SECOND" ]]; then - echo "we're healthy - node: $FIRST == koios: $SECOND" - exit 0 - elif 
[[ "$FIRST" -lt "$SECOND" ]]; then - sleep 3 - FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) - elif [[ "$FIRST" -gt "$SECOND" ]]; then - sleep 3 - SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no') - fi - done - echo "there is a problem" - exit 1 + # Determine the process name or script to check health + if [[ -n "${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}" ]]; then + process="${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}" + fi + echo "Checking health for process: $process" + check_process "$process" "$CPU_THRESHOLD" + exit $? fi + +# If all checks pass, return healthy status +echo "Container is healthy" +exit 0 + From 257647c8c699ed519b942cc197bf92c4605430da Mon Sep 17 00:00:00 2001 From: illuminatus Date: Tue, 7 Jan 2025 16:18:05 -0800 Subject: [PATCH 02/14] Round CPU_USAGE to an int and if less than 0.5 from threshold cause it to round up. --- files/docker/node/addons/healthcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 7bd9a7127..d91ded7a6 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -38,7 +38,7 @@ check_process() { for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do # Check CPU usage of the process - CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}') + CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print int(s + 0.5)}') # Check if CPU usage exceeds threshold if (( CPU_USAGE > cpu_threshold )); then From 859b0e50f497361713f872b13ee4f175ac05e0eb Mon Sep 17 00:00:00 2001 From: illuminatus Date: Sun, 12 Jan 2025 21:09:54 -0800 Subject: [PATCH 03/14] Improved/Corrected for loop logic Co-authored-by: Adam Matthews <52178922+adamsthws@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh 
b/files/docker/node/addons/healthcheck.sh index d91ded7a6..ac59aa9cc 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -36,7 +36,7 @@ check_process() { local process_name="$1" local cpu_threshold="$2" - for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do + for (( CHECK=0; CHECK<=RETRIES; CHECK++ )); do # Check CPU usage of the process CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print int(s + 0.5)}') From f752b1c3494f534817b836fc1d96fa980609140f Mon Sep 17 00:00:00 2001 From: "Illuminatus [CCIO]" Date: Mon, 13 Jan 2025 22:44:40 -0800 Subject: [PATCH 04/14] Added DB checks for cncli and dbsync. --- files/docker/node/addons/healthcheck.sh | 190 ++++++++++++++++-------- files/docker/node/dockerfile_bin | 8 +- 2 files changed, 130 insertions(+), 68 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index ac59aa9cc..4f966be1a 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -1,73 +1,93 @@ #!/bin/bash # shellcheck source=/dev/null -# +# shellcheck disable=SC2317 ###################################### # User Variables - Change as desired # # Common variables set in env file # ###################################### -ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}" # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set -CPU_THRESHOLD="${CPU_THRESHOLD:-80}" # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80% -RETRIES="${RETRIES:-20}" # The number of retries if tip is not incrementing, or cpu usage is over the threshold +ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}" # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set +HEALTHCHECK_CPU_THRESHOLD="${HEALTHCHECK_CPU_THRESHOLD:-80}" # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80%. 
+HEALTHCHECK_RETRIES="${HEALTHCHECK_RETRIES:-20}" # The number of retries if tip is not incrementing, or cpu usage is over the threshold +DB_SYNC_ALLOWED_DRIFT="${DB_SYNC_ALLOWED_DRIFT:-3600}" # The allowed drift in seconds for the DB to be considered in sync +CNCLI_DB_ALLOWED_DRIFT="${CNCLI_DB_ALLOWED_DRIFT:-300}" # The allowed drift in slots for the CNCLI DB to be considered in sync ###################################### # Do NOT modify code below # ###################################### -if [[ "${ENTRYPOINT_PROCESS}" == "cnode.sh" ]]; then - source /opt/cardano/cnode/scripts/env -else - # Source in offline mode for sidecar helper scripts - source /opt/cardano/cnode/scripts/env offline -fi - -# Define a mapping of scripts to their corresponding binaries, when defined check the binary is running and its CPU usage instead of the wrapper script. -declare -A SCRIPT_TO_BINARY_MAP -SCRIPT_TO_BINARY_MAP=( - ["cncli.sh"]="cncli" - ["mithril-signer.sh"]="mithril-signer" +[[ ${0} != '-bash' ]] && PARENT="$(dirname $0)" || PARENT="$(pwd)" +# Check if env file is missing in current folder (no update checks as will mostly run as daemon), source env if present +[[ ! -f "${PARENT}"/env ]] && echo -e "\nCommon env file missing in \"${PARENT}\", please ensure latest guild-deploy.sh was run and this script is being run from ${CNODE_HOME}/scripts folder! \n" && exit 1 +. "${PARENT}"/env offline + +# Define a mapping of scripts to their corresponding health check functions +declare -A PROCESS_TO_HEALTHCHECK +PROCESS_TO_HEALTHCHECK=( + ["dbsync.sh"]="check_db_sync" + ["cnode.sh"]="check_node" + ["cncli.sh"]="check_cncli" ) -# Define scripts which may sleep between executions of the binary. 
-SLEEPING_SCRIPTS=("cncli.sh") - -# Function to check if a process is running and its CPU usage -check_process() { - local process_name="$1" - local cpu_threshold="$2" - - for (( CHECK=0; CHECK<=RETRIES; CHECK++ )); do - # Check CPU usage of the process - CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print int(s + 0.5)}') +# FUNCTIONS +check_cncli() { + cncli_pid=$(pgrep -f "${ENTRYPOINT_PROCESS}") + cncli_subcmd=$(ps -p "${cncli_pid}" -o cmd= | awk '{print $NF}') - # Check if CPU usage exceeds threshold - if (( CPU_USAGE > cpu_threshold )); then - echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)" - sleep 3 # Retry after a pause - continue - fi - - # Check if ENTRYPOINT_PROCESS is in the SLEEPING_SCRIPTS array - if [[ " ${SLEEPING_SCRIPTS[@]} " =~ " ${ENTRYPOINT_PROCESS} " ]]; then - # If the process is in SLEEPING_SCRIPTS, check if either the process or 'sleep' is running - if ! pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then - echo "Error: '$process_name' is not running, and no 'sleep' process found" - return 3 # Return 3 if the process is not running and sleep is not found - fi + if [[ "${cncli_subcmd}" != "ptsendtip" ]]; then + if check_cncli_db ; then + return 0 else - # If the process is not in SLEEPING_SCRIPTS, only check for the specific process - if ! pgrep -x "$process_name" > /dev/null; then - echo "Error: '$process_name' is not running" - return 3 # Return 3 if the process is not running - fi + return 1 fi + else + # No-op. placeholder for check_cncli_ptsendtip + : + # if ! 
check_cncli_ptsendtip; then + # return 1 + # else + # return 0 + fi +} - echo "We're healthy - $process_name" - return 0 # Return 0 if the process is healthy - done - echo "Max retries reached for $process_name" - return 1 # Return 1 if retries are exhausted +check_cncli_db() { + CCLI=$(which cardano-cli) + SQLITE=$(which sqlite3) + # Check if the DB is in sync + CNCLI_SLOT=$(${SQLITE} "${CNODE_HOME}/guild-db/cncli/cncli.db" 'select slot_number from chain order by id desc limit 1;') + NODE_SLOT=$(${CCLI} query tip --testnet-magic "${NWMAGIC}" | jq .slot) + if check_tip "${NODE_SLOT}" "${CNCLI_SLOT}" "${CNCLI_DB_ALLOWED_DRIFT}" ; then + echo "We're healthy - DB is in sync" + return 0 + else + echo "Error: DB is not in sync" + return 1 + fi +} + + +check_db_sync() { + # Check if the DB is in sync + [[ -z "${PGPASSFILE}" ]] && PGPASSFILE="${CNODE_HOME}/priv/.pgpass" + if [[ ! -f "${PGPASSFILE}" ]]; then + echo "ERROR: The PGPASSFILE (${PGPASSFILE}) not found, please ensure you've followed the instructions on guild-operators website!" 
&& exit 1 + return 1 + else + # parse the password from the pgpass file + IFS=':' read -r PGHOST PGPORT _ PGUSER PGPASSWORD < "${PGPASSFILE}" + PGDATABASE=cexplorer + export PGHOST PGPORT PGDATABASE PGUSER PGPASSWORD + fi + CURRENT_TIME=$(date +%s) + LATEST_BLOCK_TIME=$(date --date="$(psql -qt -c 'select time from block order by id desc limit 1;')" +%s) + if check_tip "${CURRENT_TIME}""${LATEST_BLOCK_TIME}" "${DB_SYNC_ALLOWED_DRIFT}"; then + echo "We're healthy - DB is in sync" + return 0 + else + echo "Error: DB is not in sync" + return 1 + fi } @@ -85,9 +105,10 @@ check_node() { SECOND=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) if [[ "$FIRST" -ge "$SECOND" ]]; then echo "There is a problem" - exit 1 + return 1 else echo "We're healthy - node: $FIRST -> node: $SECOND" + return 0 fi else CURL=$(which curl) @@ -95,10 +116,10 @@ check_node() { URL="${KOIOS_API}/tip" SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no') - for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do + for (( CHECK=1; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do if [[ "$FIRST" -eq "$SECOND" ]]; then echo "We're healthy - node: $FIRST == koios: $SECOND" - exit 0 + return 0 elif [[ "$FIRST" -lt "$SECOND" ]]; then sleep 3 FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) @@ -108,25 +129,66 @@ check_node() { fi done echo "There is a problem" - exit 1 + return 1 + fi +} + +# Function to check if a process is running and its CPU usage +check_process() { + local process_name="$1" + local cpu_threshold="$2" + + for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do + # Check CPU usage of the process + CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}') + + # Check if CPU usage exceeds threshold + if (( CPU_USAGE > cpu_threshold )); then + echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)" + sleep 3 # Retry after a pause + continue + fi + + # Check if ENTRYPOINT_PROCESS is in the SLEEPING_SCRIPTS array + if ! 
pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then + echo "Error: '$process_name' is not running, and no 'sleep' process found" + return 3 # Return 3 if the process is not running and sleep is not found + fi + + echo "We're healthy - $process_name" + return 0 # Return 0 if the process is healthy + done + + echo "Max retries reached for $process_name" + return 1 # Return 1 if retries are exhausted +} + + +check_tip() { + TIP=$1 + DB_TIP=$2 + ALLOWED_DRIFT=$3 + + if [[ $(( TIP - DB_TIP )) -lt ${ALLOWED_DRIFT} ]]; then + return 0 + else + return 1 fi } + # MAIN -if [[ "$ENTRYPOINT_PROCESS" == "cnode.sh" ]]; then - # The original health check logic for "cnode.sh" - check_node +if [[ -n "${PROCESS_TO_HEALTHCHECK[$ENTRYPOINT_PROCESS]}" ]]; then + echo "Checking health for $ENTRYPOINT_PROCESS" + eval "${PROCESS_TO_HEALTHCHECK[$ENTRYPOINT_PROCESS]}" + exit $? else + # When # Determine the process name or script to check health if [[ -n "${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}" ]]; then process="${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}" fi echo "Checking health for process: $process" - check_process "$process" "$CPU_THRESHOLD" + check_process "$process" "$HEALTHCHECK_CPU_THRESHOLD" exit $? 
fi - -# If all checks pass, return healthy status -echo "Container is healthy" -exit 0 - diff --git a/files/docker/node/dockerfile_bin b/files/docker/node/dockerfile_bin index 16a3fe00d..6b00b8492 100644 --- a/files/docker/node/dockerfile_bin +++ b/files/docker/node/dockerfile_bin @@ -94,9 +94,9 @@ RUN curl -sL -H "Accept: application/vnd.github.everest-preview+json" -H "Conte # ENTRY SCRIPT ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/banner.txt \ - https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/block_watcher.sh \ - https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/healthcheck.sh /home/guild/.scripts/ -ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/guild-deploy.sh \ + https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/block_watcher.sh /home/guild/.scripts/ +ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/healthcheck.sh \ + https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/guild-deploy.sh \ https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-client.sh \ https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-signer.sh \ https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-relay.sh /opt/cardano/cnode/scripts/ @@ -105,7 +105,7 @@ ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLO RUN sudo chmod -R a+rx /home/guild/.scripts/*.sh /opt/cardano/cnode/scripts/*.sh 
/home/guild/entrypoint.sh /conf \ && sudo chown -R guild:guild /home/guild/.* $CNODE_HOME /conf -HEALTHCHECK --start-period=5m --interval=5m --timeout=100s CMD /home/guild/.scripts/healthcheck.sh +HEALTHCHECK --start-period=5m --interval=5m --timeout=100s CMD /opt/cardano/cnode/scripts/healthcheck.sh ENTRYPOINT ["./entrypoint.sh"] From 74aa459ccb7dce2fa30bb2508e19f3604de8189f Mon Sep 17 00:00:00 2001 From: "Illuminatus [CCIO]" Date: Mon, 13 Jan 2025 23:00:12 -0800 Subject: [PATCH 05/14] Remove reference to old SLEEPING_SCRIPTS array --- files/docker/node/addons/healthcheck.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 4f966be1a..facd636a8 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -149,7 +149,6 @@ check_process() { continue fi - # Check if ENTRYPOINT_PROCESS is in the SLEEPING_SCRIPTS array if ! pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then echo "Error: '$process_name' is not running, and no 'sleep' process found" return 3 # Return 3 if the process is not running and sleep is not found From 1eda07b3a979ca1608265a085a8d9de47e3c90e4 Mon Sep 17 00:00:00 2001 From: illuminatus Date: Tue, 14 Jan 2025 08:50:02 -0800 Subject: [PATCH 06/14] Restore the correct loop for check/retries. 
Co-authored-by: Adam Matthews <52178922+adamsthws@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index facd636a8..82d1af137 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -116,7 +116,7 @@ check_node() { URL="${KOIOS_API}/tip" SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no') - for (( CHECK=1; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do + for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do if [[ "$FIRST" -eq "$SECOND" ]]; then echo "We're healthy - node: $FIRST == koios: $SECOND" return 0 From a7d9a188950072ce4bb4b25327f6c4b0b370637d Mon Sep 17 00:00:00 2001 From: "Illuminatus [CCIO]" Date: Tue, 14 Jan 2025 09:26:04 -0800 Subject: [PATCH 07/14] Apply Review changes Co-authored-by: Adam Matthews <52178922+adamsthws@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 82d1af137..da4d16ba5 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -9,6 +9,7 @@ ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}" # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set HEALTHCHECK_CPU_THRESHOLD="${HEALTHCHECK_CPU_THRESHOLD:-80}" # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80%. 
HEALTHCHECK_RETRIES="${HEALTHCHECK_RETRIES:-20}" # The number of retries if tip is not incrementing, or cpu usage is over the threshold +HEALTHCHECK_RETRY_WAIT="${HEALTHCHECK_RETRY_WAIT:-3}" # The time (in seconds) to wait between retries DB_SYNC_ALLOWED_DRIFT="${DB_SYNC_ALLOWED_DRIFT:-3600}" # The allowed drift in seconds for the DB to be considered in sync CNCLI_DB_ALLOWED_DRIFT="${CNCLI_DB_ALLOWED_DRIFT:-300}" # The allowed drift in slots for the CNCLI DB to be considered in sync @@ -121,10 +122,10 @@ check_node() { echo "We're healthy - node: $FIRST == koios: $SECOND" return 0 elif [[ "$FIRST" -lt "$SECOND" ]]; then - sleep 3 + sleep "$HEALTHCHECK_RETRY_WAIT" FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) elif [[ "$FIRST" -gt "$SECOND" ]]; then - sleep 3 + sleep "$HEALTHCHECK_RETRY_WAIT" SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no') fi done @@ -145,7 +146,7 @@ check_process() { # Check if CPU usage exceeds threshold if (( CPU_USAGE > cpu_threshold )); then echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)" - sleep 3 # Retry after a pause + sleep "$HEALTHCHECK_RETRY_WAIT" # Retry after a pause continue fi From bd6d3dd2024d8c61f96b1b0ce15f378ecb13d4bc Mon Sep 17 00:00:00 2001 From: RdLrT <3169068+rdlrt@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:10:27 +1100 Subject: [PATCH 08/14] Apply suggestions from code review Co-authored-by: Adam Matthews <52178922+adamsthws@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 59 ++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index da4d16ba5..db9c5a8b0 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -42,12 +42,11 @@ check_cncli() { return 1 fi else - # No-op. placeholder for check_cncli_ptsendtip - : - # if ! 
check_cncli_ptsendtip; then - # return 1 - # else - # return 0 + if check_cncli_send_tip; then + return 0 + else + return 1 + fi fi } @@ -68,6 +67,54 @@ check_cncli_db() { } +# Function to check if the tip is successfully being sent to Pooltool +check_cncli_send_tip() { + # Timeout in seconds for capturing the log entry + log_entry_timeout=60 + + # Get the process ID of cncli + process_id=$(pgrep -of cncli) || { + echo "Error: cncli process not found." + return 1 # Return 1 if the process is not found + } + + # Loop through the retries + for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do + + # Define the error suffix message for retries + if [ "$HEALTHCHECK_RETRIES" -ne 0 ]; then + error_message_suffix="Attempt $((CHECK + 1)). Retrying in $HEALTHCHECK_RETRY_WAIT seconds." + else error_message_suffix="Retries disabled (HEALTHCHECK_RETRIES=0)" + fi + + # Capture the next output from cncli that is related to Pooltool + pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep -i --line-buffered "pooltool" | head -n 1) + if [ -z "$pt_log_entry" ]; then + echo "Unable to capture cncli output within $log_entry_timeout seconds. $error_message_suffix" + sleep $HEALTHCHECK_RETRY_WAIT # Wait n seconds then retry + continue # Retry if the output capture fails + fi + + # Define the success message to check for + success_status='.*"success":true.*' + failure_status='.*"success":false.*' + + # Check if the success message exists in the captured log + if echo "$pt_log_entry" | grep -q $success_status; then + echo "Healthy: Tip is being sent to Pooltool." + return 0 # Return 0 if the success message is found + elif echo "$pt_log_entry" | grep -q $failure_status; then + failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') + echo "Failed to send tip. $failure_message" + return 1 # Return 1 if the success message is not found + fi + done + + echo "Error: Max retries reached." 
+ return 1 # Return 1 if retries are exhausted +} + + check_db_sync() { # Check if the DB is in sync [[ -z "${PGPASSFILE}" ]] && PGPASSFILE="${CNODE_HOME}/priv/.pgpass" From 29aa4ce56042faaa2d9246f4603e8874ba377a5c Mon Sep 17 00:00:00 2001 From: Adam Matthews <52178922+adamsthws@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:08:05 +0000 Subject: [PATCH 09/14] Update healthcheck.sh Removed retry logic from function: check_cncli_send_tip() --- files/docker/node/addons/healthcheck.sh | 55 ++++++++++--------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index db9c5a8b0..c418d7799 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -78,40 +78,29 @@ check_cncli_send_tip() { return 1 # Return 1 if the process is not found } - # Loop through the retries - for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do - - # Define the error suffix message for retries - if [ "$HEALTHCHECK_RETRIES" -ne 0 ]; then - error_message_suffix="Attempt $((CHECK + 1)). Retrying in $HEALTHCHECK_RETRY_WAIT seconds." - else error_message_suffix="Retries disabled (HEALTHCHECK_RETRIES=0)" - fi - - # Capture the next output from cncli that is related to Pooltool - pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep -i --line-buffered "pooltool" | head -n 1) - if [ -z "$pt_log_entry" ]; then - echo "Unable to capture cncli output within $log_entry_timeout seconds. $error_message_suffix" - sleep $HEALTHCHECK_RETRY_WAIT # Wait n seconds then retry - continue # Retry if the output capture fails - fi - - # Define the success message to check for - success_status='.*"success":true.*' - failure_status='.*"success":false.*' - - # Check if the success message exists in the captured log - if echo "$pt_log_entry" | grep -q $success_status; then - echo "Healthy: Tip is being sent to Pooltool." 
- return 0 # Return 0 if the success message is found - elif echo "$pt_log_entry" | grep -q $failure_status; then - failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') - echo "Failed to send tip. $failure_message" - return 1 # Return 1 if the success message is not found - fi - done + # Capture the next output from cncli that is related to Pooltool + pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep -i --line-buffered "pooltool" | head -n 1) + if [ -z "$pt_log_entry" ]; then + echo "Unable to capture cncli output within $log_entry_timeout seconds." + return 1 # Return 1 if the output capture fails + fi - echo "Error: Max retries reached." - return 1 # Return 1 if retries are exhausted + # Define the success message to check for + success_status='.*"success":true.*' + failure_status='.*"success":false.*' + + # Check if the success message exists in the captured log + if echo "$pt_log_entry" | grep -q $success_status; then + echo "Healthy: Tip is being sent to Pooltool." + return 0 # Return 0 if the success message is found + elif echo "$pt_log_entry" | grep -q $failure_status; then + failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') + echo "Failed to send tip. $failure_message" + return 1 # Return 1 if the failure message is found + else + echo "Failed to send tip. Unknown reason." 
+ return 1 # Return 1 if it fails for any other reason + fi } From 505487aac3466d81cb7aba8027089e479caecf6e Mon Sep 17 00:00:00 2001 From: illuminatus Date: Wed, 15 Jan 2025 09:28:52 -0800 Subject: [PATCH 10/14] Use URL not KOIOS_URL --- files/docker/node/addons/healthcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index db9c5a8b0..ea452d496 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -173,7 +173,7 @@ check_node() { FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block) elif [[ "$FIRST" -gt "$SECOND" ]]; then sleep "$HEALTHCHECK_RETRY_WAIT" - SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no') + SECOND=$($CURL "${URL}" | $JQ '.[0].block_no') fi done echo "There is a problem" From 338639c8dd9b80184111cb86004495471f4951bc Mon Sep 17 00:00:00 2001 From: Adam Matthews <52178922+adamsthws@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:37:25 +0000 Subject: [PATCH 11/14] Update healthcheck.sh - Increased accuracy of "Pooltool" log scrape pattern. - Included ($pt_log_entry) in output for improved logging/debugging. - Improved variable names for clarification. 
--- files/docker/node/addons/healthcheck.sh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index c418d7799..54bf7d917 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -79,26 +79,28 @@ check_cncli_send_tip() { } # Capture the next output from cncli that is related to Pooltool - pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep -i --line-buffered "pooltool" | head -n 1) + pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep --line-buffered "Pooltool" | head -n 1) if [ -z "$pt_log_entry" ]; then echo "Unable to capture cncli output within $log_entry_timeout seconds." return 1 # Return 1 if the output capture fails fi - # Define the success message to check for - success_status='.*"success":true.*' - failure_status='.*"success":false.*' + # Define the json success message to check for + json_success_status='.*"success":true.*' + json_failure_status='.*"success":false.*' - # Check if the success message exists in the captured log - if echo "$pt_log_entry" | grep -q $success_status; then + # Check if the json success message exists in the captured log + if echo "$pt_log_entry" | grep -q $json_success_status; then echo "Healthy: Tip is being sent to Pooltool." return 0 # Return 0 if the success message is found - elif echo "$pt_log_entry" | grep -q $failure_status; then + # Check if the json failure message exists in the captured log + elif echo "$pt_log_entry" | grep -q $json_failure_status; then failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') echo "Failed to send tip. $failure_message" return 1 # Return 1 if the failure message is found + # If the log entry does not contain a json success or failure message else - echo "Failed to send tip. Unknown reason." + echo "Failed to send tip. 
$pt_log_entry" return 1 # Return 1 if it fails for any other reason fi } From 742432b560876a66d7cba5f2ad81039888c32bbb Mon Sep 17 00:00:00 2001 From: Adam Matthews <52178922+adamsthws@users.noreply.github.com> Date: Thu, 20 Feb 2025 16:01:09 +0000 Subject: [PATCH 12/14] Update function: check_cncli_send_tip(), within: healthcheck.sh (#2) * Update healthcheck.sh Update function: check_cncli_send_tip() - Added checks for the tip progressing to reduce false positives when the timeout is reached but the tip hasn't moved. - Increased the log_entry_timeout to 99, to be just within the container HEALTHCHECK timeout of 100. - Improved log output to include current tip, providing more info to the operator. Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Update healthcheck.sh Removed superfluous '-e' from 'echo -e' --------- Co-authored-by: illuminatus <9167887+TrevorBenson@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 957d9cbca..11c660732 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -70,7 +70,7 @@ check_cncli_db() { # Function to check if the tip is successfully being sent to Pooltool check_cncli_send_tip() { # Timeout in seconds for capturing the log entry - log_entry_timeout=60 + log_entry_timeout=99 # Get the process ID of cncli process_id=$(pgrep -of cncli) || { @@ -78,11 +78,20 @@ check_cncli_send_tip() { return 1 # Return 1 if the process is not found } + # Get the current tip from the node + FIRST_TIP=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) # Capture the next output from cncli that is related to Pooltool pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep --line-buffered "Pooltool" | head -n 1) + # Get the current tip again + SECOND_TIP=$($CCLI
query tip --testnet-magic ${NWMAGIC} | jq .block) if [ -z "$pt_log_entry" ]; then - echo "Unable to capture cncli output within $log_entry_timeout seconds." - return 1 # Return 1 if the output capture fails + if [[ "$FIRST_TIP" -eq "$SECOND_TIP" ]]; then + echo "Unable to capture cncli output within $log_entry_timeout seconds, but node has not moved tip. (Current tip = $SECOND_TIP)." + return 0 + else + echo "Unable to capture cncli output within $log_entry_timeout seconds. (Current tip = $SECOND_TIP)." + return 1 # Return 1 if the output capture fails + fi fi # Define the json success message to check for @@ -91,16 +100,17 @@ check_cncli_send_tip() { # Check if the json success message exists in the captured log if echo "$pt_log_entry" | grep -q $json_success_status; then - echo "Healthy: Tip is being sent to Pooltool." + echo "Healthy: Tip sent to Pooltool. (Current tip = $SECOND_TIP)." return 0 # Return 0 if the success message is found # Check if the json failure message exists in the captured log elif echo "$pt_log_entry" | grep -q $json_failure_status; then failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') - echo "Failed to send tip. $failure_message" + echo "Failed to send tip. (Current tip = $SECOND_TIP). $failure_message" return 1 # Return 1 if the failure message is found # If the log entry does not contain a json success or failure message else - echo "Failed to send tip. $pt_log_entry" + # Log the raw output if no json message is found + echo "Failed to send tip. (Current tip = $SECOND_TIP). $pt_log_entry" return 1 # Return 1 if it fails for any other reason fi } From d69eca7b38b744ba0e22aec3f27f903559624bb9 Mon Sep 17 00:00:00 2001 From: illuminatus Date: Tue, 25 Feb 2025 12:01:09 -0800 Subject: [PATCH 13/14] check_tip less than or equal to. 
--- files/docker/node/addons/healthcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 11c660732..9e8809167 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -217,7 +217,7 @@ check_tip() { DB_TIP=$2 ALLOWED_DRIFT=$3 - if [[ $(( TIP - DB_TIP )) -lt ${ALLOWED_DRIFT} ]]; then + if [[ $(( TIP - DB_TIP )) -le ${ALLOWED_DRIFT} ]]; then return 0 else return 1 From 1e4aafbd431a13a056e48065570a40d683ada97e Mon Sep 17 00:00:00 2001 From: Adam Matthews <52178922+adamsthws@users.noreply.github.com> Date: Thu, 27 Feb 2025 22:31:32 +0000 Subject: [PATCH 14/14] Update function: check_cncli_send_tip(), within: healthcheck.sh (#3) * Update healthcheck.sh Update function: check_cncli_send_tip() - Added checks for the tip progressing to reduce false positives when the timeout is reached but the tip hasn't moved. - Increased the log_entry_timeout to 99, to be just within the container HEALTHCHECK timeout of 100. - Improved log output to include current tip, providing more info to the operator. Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Update healthcheck.sh Removed superfluous '-e' from 'echo -e' * Update healthcheck.sh renamed uppercase variable to lowercase to conform to conventions as it's a locally scoped variable. * Update healthcheck.sh Added an allowable drift on the tip check as not every tip change is sent to pooltool. * Update healthcheck.sh Improved log output to make it a little clearer. * Update healthcheck.sh Moved variable 'log_entry_timeout' to be closer to where it's called. It doesn't need to be at the top of the function as it's not intended for the operator to change it.
* Update healthcheck.sh updated comments * Update healthcheck.sh Renamed variable so it better represents what it relates to * Renamed function to 'check_cncli_sendtip' instead of 'check_cncli_send_tip' to align with upstream naming. * Local variable > user variable Changed local variable to a user variable. 'log_entry_timeout' > 'CNCLI_SENDTIP_LOG_TIMEOUT' * Local variable > user variable Changed local variable to a user variable. 'tip_allowed_difference' > 'CNCLI_SENDTIP_ALLOWED_DIFF' * Changed container healthcheck params Reduced container healthcheck interval and timeout to 120s to improve observability. Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Renamed variable Renamed variable to better align with prior naming conventions. Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Make use of check_tip() Make use of check_tip() function to reduce code repetitiveness. Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Rename 'TIP' variables Rename check_tip() variables to make them more generic, allowing the check_tip() function to be used more broadly Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> * Fix function name Fix incorrect name of function. The function used to be named with an underscore. It was updated but this was missed prior.
* Update variable names Continuation of updating TIP variable names within check_tip() Co-Authored-By: illuminatus <9167887+TrevorBenson@users.noreply.github.com> --------- Co-authored-by: illuminatus <9167887+TrevorBenson@users.noreply.github.com> --- files/docker/node/addons/healthcheck.sh | 38 ++++++++++++------------- files/docker/node/dockerfile_bin | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh index 9e8809167..c6b15850b 100755 --- a/files/docker/node/addons/healthcheck.sh +++ b/files/docker/node/addons/healthcheck.sh @@ -12,6 +12,8 @@ HEALTHCHECK_RETRIES="${HEALTHCHECK_RETRIES:-20}" # The number of HEALTHCHECK_RETRY_WAIT="${HEALTHCHECK_RETRY_WAIT:-3}" # The time (in seconds) to wait between retries DB_SYNC_ALLOWED_DRIFT="${DB_SYNC_ALLOWED_DRIFT:-3600}" # The allowed drift in seconds for the DB to be considered in sync CNCLI_DB_ALLOWED_DRIFT="${CNCLI_DB_ALLOWED_DRIFT:-300}" # The allowed drift in slots for the CNCLI DB to be considered in sync +CNCLI_SENDTIP_LOG_TIMEOUT="${CNCLI_SENDTIP_LOG_TIMEOUT:-119}" # log capturing timeout (should be one second lower than the container healthcheck '--timeout', which defaults to 120) +CNCLI_SENDTIP_ALLOWED_DRIFT="${CNCLI_SENDTIP_ALLOWED_DRIFT:-3}" # The allowable difference of the tip moving before it's sent to Pooltool. (Not every tip progression is sent to Pooltool) ###################################### # Do NOT modify code below # @@ -42,7 +44,7 @@ check_cncli() { return 1 fi else - if check_cncli_send_tip; then + if check_cncli_sendtip; then return 0 else return 1 @@ -68,10 +70,7 @@ check_cncli_db() { # Function to check if the tip is successfully being sent to Pooltool -check_cncli_send_tip() { - # Timeout in seconds for capturing the log entry - log_entry_timeout=99 - +check_cncli_sendtip() { # Get the process ID of cncli process_id=$(pgrep -of cncli) || { echo "Error: cncli process not found."
@@ -79,18 +78,19 @@ check_cncli_send_tip() { } # Get the current tip from the node - FIRST_TIP=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) + first_tip=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) # Capture the next output from cncli that is related to Pooltool - pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep --line-buffered "Pooltool" | head -n 1) + pt_log_entry=$(timeout $CNCLI_SENDTIP_LOG_TIMEOUT cat /proc/$process_id/fd/1 | grep --line-buffered "Pooltool" | head -n 1) # Get the current tip again - SECOND_TIP=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) + second_tip=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block) + # If no output was captured... if [ -z "$pt_log_entry" ]; then - if [[ "$FIRST_TIP" -eq "$SECOND_TIP" ]]; then - echo "Unable to capture cncli output within $log_entry_timeout seconds, but node has not moved tip. (Current tip = $SECOND_TIP)." - return 0 + if check_tip "$second_tip" "$first_tip" "$CNCLI_SENDTIP_ALLOWED_DRIFT"; then + echo "Node tip didn't move before the healthcheck timeout was reached. (Current tip = $second_tip)." + return 0 # Return 0 if the tip didn't move else - echo "Unable to capture cncli output within $log_entry_timeout seconds. (Current tip = $SECOND_TIP)." - return 1 # Return 1 if the output capture fails + echo "Unable to capture cncli output before the healthcheck timeout was reached. (Current tip = $second_tip)." + return 1 # Return 1 if the tip did move fi fi @@ -100,17 +100,17 @@ check_cncli_send_tip() { # Check if the json success message exists in the captured log if echo "$pt_log_entry" | grep -q $json_success_status; then - echo "Healthy: Tip sent to Pooltool. (Current tip = $SECOND_TIP)." + echo "Healthy: Tip sent to Pooltool. (Current tip = $second_tip)." 
return 0 # Return 0 if the success message is found # Check if the json failure message exists in the captured log elif echo "$pt_log_entry" | grep -q $json_failure_status; then failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+') - echo "Failed to send tip. (Current tip = $SECOND_TIP). $failure_message" + echo "Failed to send tip. (Current tip = $second_tip). $failure_message" return 1 # Return 1 if the failure message is found # If the log entry does not contain a json success or failure message else # Log the raw output if no json message is found - echo "Failed to send tip. (Current tip = $SECOND_TIP). $pt_log_entry" + echo "Failed to send tip. (Current tip = $second_tip). $pt_log_entry" return 1 # Return 1 if it fails for any other reason fi } @@ -213,11 +213,11 @@ check_process() { check_tip() { - TIP=$1 - DB_TIP=$2 + TIP_X=$1 + TIP_Y=$2 ALLOWED_DRIFT=$3 - if [[ $(( TIP - DB_TIP )) -le ${ALLOWED_DRIFT} ]]; then + if [[ $(( TIP_X - TIP_Y )) -le ${ALLOWED_DRIFT} ]]; then return 0 else return 1 diff --git a/files/docker/node/dockerfile_bin b/files/docker/node/dockerfile_bin index 6b00b8492..3570ff1b2 100644 --- a/files/docker/node/dockerfile_bin +++ b/files/docker/node/dockerfile_bin @@ -105,7 +105,7 @@ ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLO RUN sudo chmod -R a+rx /home/guild/.scripts/*.sh /opt/cardano/cnode/scripts/*.sh /home/guild/entrypoint.sh /conf \ && sudo chown -R guild:guild /home/guild/.* $CNODE_HOME /conf -HEALTHCHECK --start-period=5m --interval=5m --timeout=100s CMD /opt/cardano/cnode/scripts/healthcheck.sh +HEALTHCHECK --start-period=5m --interval=120s --timeout=120s CMD /opt/cardano/cnode/scripts/healthcheck.sh ENTRYPOINT ["./entrypoint.sh"]