#!/bin/bash
# health-check.sh
# Health check script

# Resolve the directory that contains this script and work from there, so the
# relative paths below do not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" || exit

# The rest of this script reads APP_HEALTH_URL, HEALTH_CHECK_INTERVAL,
# MAX_HEALTH_FAILURES and SERVICE_NAME without defining them, so they are
# expected to come from config.sh. Stop here if the file cannot be read:
# carrying on with empty settings would leave the check loop running without a
# URL, a failure threshold or a sleep interval.
if [[ ! -r "${SCRIPT_DIR}/config.sh" ]]; then
    echo "health-check: cannot read ${SCRIPT_DIR}/config.sh" >&2
    exit 1
fi
# shellcheck source=/dev/null
source "${SCRIPT_DIR}/config.sh"

LOG_FILE="${SCRIPT_DIR}/../logs/health-check.log"

# Create the log directory when it is absent; otherwise the PID file write
# below and every later append to LOG_FILE would fail. This is best-effort:
# monitoring still proceeds if the directory cannot be created.
mkdir -p "${SCRIPT_DIR}/../logs" \
    || echo "health-check: cannot create ${SCRIPT_DIR}/../logs" >&2

# Record this process's PID next to the log file.
echo "$$" > "${SCRIPT_DIR}/../logs/health-check.pid"

# Append one timestamped entry to the health-check log.
# Globals:   LOG_FILE (read) - file the entry is appended to
# Arguments: $1 - message text
# Outputs:   a line of the form "YYYY-MM-DD HH:MM:SS - <message>" in LOG_FILE
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# SIGTERM handler: note the shutdown in the log, delete the PID file, then
# end the script with exit status 0.
# Globals: SCRIPT_DIR (read) - used to locate ../logs/health-check.pid
term_handler() {
    local pid_file="${SCRIPT_DIR}/../logs/health-check.pid"

    log "Health check stopped by systemd"
    rm -f "$pid_file"
    exit 0
}

# Register the SIGTERM handler.
trap 'term_handler' SIGTERM

log "Health check will started for $APP_HEALTH_URL after 120 seconds"
# Wait 120 seconds for the service to come up before the first check.
# The sleep runs in the background and is awaited with `wait` because bash
# postpones a trap until the current foreground command has finished, whereas
# `wait` returns as soon as a trapped signal arrives. That way a SIGTERM sent
# during the startup delay is handled immediately instead of up to 120 seconds
# later.
sleep 120 &
wait "$!"
log "Configuration: Interval=${HEALTH_CHECK_INTERVAL}s, MaxFailures=${MAX_HEALTH_FAILURES}"

# Number of consecutive failed checks seen so far.
failure_count=0

# Check loop: probe the health endpoint forever and restart the service once
# the number of consecutive failed checks reaches MAX_HEALTH_FAILURES.
while true; do
    # Probe the health endpoint with curl. -s hides the progress meter and -S
    # keeps the error message on failure, so the stderr redirect really does
    # record why a request failed. The HTTP status code written by -w is the
    # only thing on stdout and is captured in HTTP_RESPONSE.
    HTTP_RESPONSE=$(curl -sS -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 10 "$APP_HEALTH_URL" 2>> "$LOG_FILE")
    CURL_EXIT_CODE=$?

    if [[ $CURL_EXIT_CODE -eq 0 && $HTTP_RESPONSE == 200 ]]; then
        # Healthy: clear the failure counter if it was non-zero.
        if (( failure_count > 0 )); then
            log "Health restored after $failure_count failures. Resetting counter."
            failure_count=0
        else
            log "Health check success (Curl: $CURL_EXIT_CODE, HTTP: $HTTP_RESPONSE)."
        fi
    else
        # Unhealthy: count the failure.
        failure_count=$((failure_count + 1))
        log "Health check FAILED (Curl: $CURL_EXIT_CODE, HTTP: $HTTP_RESPONSE). Failure count: $failure_count/$MAX_HEALTH_FAILURES"

        # Restart once the threshold is reached. `[` is used on purpose: with
        # an empty or non-numeric threshold it reports an error and evaluates
        # false, rather than treating the value as 0 and restarting on every
        # failure.
        if [ "$failure_count" -ge "$MAX_HEALTH_FAILURES" ]; then
            log "CRITICAL: Maximum failure count ($MAX_HEALTH_FAILURES) reached. Restarting service."
            if ! sudo systemctl restart "$SERVICE_NAME"; then
                log "ERROR: 'systemctl restart $SERVICE_NAME' failed."
            fi
            # NOTE(review): the counter is only reset by a successful check,
            # so after a restart every further failure triggers another
            # restart. Confirm this is the intended policy.

            # Give the service 120 seconds to start before checking again.
            # Background sleep + wait keeps SIGTERM handling immediate (bash
            # defers traps while a foreground command is running).
            sleep 120 &
            wait "$!"
        fi
    fi

    # Interruptible pause between checks.
    sleep "$HEALTH_CHECK_INTERVAL" &
    wait "$!"
done