health-check.sh
2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash
# Health check script.
#
# Start-up section: resolves the script's own directory, loads config.sh from
# that directory, and records this process's PID under ../logs.

# Absolute path of the directory holding this script, regardless of the
# caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" || exit

# The settings referenced further down (APP_HEALTH_URL, HEALTH_CHECK_INTERVAL,
# MAX_HEALTH_FAILURES, SERVICE_NAME) are presumably defined by config.sh, so
# stop instead of running with empty values if it cannot be loaded.
if ! source "${SCRIPT_DIR}/config.sh"; then
  echo "health-check: unable to load ${SCRIPT_DIR}/config.sh" >&2
  exit 1
fi

LOG_FILE="${SCRIPT_DIR}/../logs/health-check.log"

# Both the log file and the pid file live in ../logs; create the directory
# first so the redirections below do not fail on a fresh checkout.
mkdir -p "${SCRIPT_DIR}/../logs"

# Record our PID so the running checker can be identified.
echo $$ > "${SCRIPT_DIR}/../logs/health-check.pid"
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the file that is appended to
# Arguments: $1 - message text
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
# SIGTERM handler: logs the stop, removes the pid file and exits with status 0.
# Globals: SCRIPT_DIR (read) - used to locate ../logs/health-check.pid
term_handler() {
  local pid_path="${SCRIPT_DIR}/../logs/health-check.pid"

  log "Health check stopped by systemd"
  rm -f "$pid_path"
  exit 0
}
# Register the SIGTERM handler so a stop request runs term_handler.
trap 'term_handler' SIGTERM

log "Health check will started for $APP_HEALTH_URL after 120 seconds"

# Wait 120 seconds for the service to come up before the first probe.
# The sleep runs in the background and is awaited with `wait`: bash defers a
# trap until the current foreground command finishes, whereas `wait` returns
# as soon as a trapped signal arrives, so SIGTERM is handled promptly here too.
sleep 120 &
wait $!

log "Configuration: Interval=${HEALTH_CHECK_INTERVAL}s, MaxFailures=${MAX_HEALTH_FAILURES}"

# Number of consecutive failed probes; the check loop resets it on success.
failure_count=0
# Main probe loop: runs until the process is terminated.
while true; do
  # Probe the health endpoint with curl. Only the HTTP status code is captured
  # (the response body is discarded); anything curl writes to stderr is
  # appended to the log file.
  HTTP_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 10 "$APP_HEALTH_URL" 2>> "$LOG_FILE")
  CURL_EXIT_CODE=$?

  if [[ "$CURL_EXIT_CODE" -eq 0 && "$HTTP_RESPONSE" == "200" ]]; then
    # Probe succeeded: reset the failure counter if it was non-zero.
    if (( failure_count > 0 )); then
      log "Health restored after $failure_count failures. Resetting counter."
      failure_count=0
    else
      log "Health check success (Curl: $CURL_EXIT_CODE, HTTP: $HTTP_RESPONSE)."
    fi
  else
    # Probe failed.
    failure_count=$((failure_count + 1))
    log "Health check FAILED (Curl: $CURL_EXIT_CODE, HTTP: $HTTP_RESPONSE). Failure count: $failure_count/$MAX_HEALTH_FAILURES"

    # Restart the service once the maximum number of failures is reached.
    if [ "$failure_count" -ge "$MAX_HEALTH_FAILURES" ]; then
      log "CRITICAL: Maximum failure count ($MAX_HEALTH_FAILURES) reached. Restarting service."
      sudo systemctl restart "$SERVICE_NAME"
      restart_status=$?
      if (( restart_status != 0 )); then
        # Record the failure instead of silently ignoring it.
        log "ERROR: systemctl restart $SERVICE_NAME exited with status $restart_status"
      fi

      # Wait 120 seconds for the service to start before probing again.
      # Backgrounded + `wait` so a SIGTERM during this pause is handled at
      # once. failure_count is not reset here, so another failed probe after
      # the pause triggers a further restart.
      sleep 120 &
      wait $!
    fi
  fi

  # Interruptible sleep between probes.
  sleep "$HEALTH_CHECK_INTERVAL" &
  wait $!
done