merge error_handler

This commit is contained in:
CanbiZ (MickLesk)
2026-02-24 09:36:33 +01:00
parent 2be37f5e00
commit 81e39c0680

View File

@@ -37,25 +37,79 @@ if ! declare -f explain_exit_code &>/dev/null; then
case "$code" in case "$code" in
1) echo "General error / Operation not permitted" ;; 1) echo "General error / Operation not permitted" ;;
2) echo "Misuse of shell builtins (e.g. syntax error)" ;; 2) echo "Misuse of shell builtins (e.g. syntax error)" ;;
3) echo "General syntax or argument error" ;;
10) echo "Docker / privileged mode required (unsupported environment)" ;;
4) echo "curl: Feature not supported or protocol error" ;;
5) echo "curl: Could not resolve proxy" ;;
6) echo "curl: DNS resolution failed (could not resolve host)" ;; 6) echo "curl: DNS resolution failed (could not resolve host)" ;;
7) echo "curl: Failed to connect (network unreachable / host down)" ;; 7) echo "curl: Failed to connect (network unreachable / host down)" ;;
8) echo "curl: Server reply error (FTP/SFTP or apk untrusted key)" ;;
16) echo "curl: HTTP/2 framing layer error" ;;
18) echo "curl: Partial file (transfer not completed)" ;;
22) echo "curl: HTTP error returned (404, 429, 500+)" ;; 22) echo "curl: HTTP error returned (404, 429, 500+)" ;;
23) echo "curl: Write error (disk full or permissions)" ;;
24) echo "curl: Write to local file failed" ;;
25) echo "curl: Upload failed" ;;
26) echo "curl: Read error on local file (I/O)" ;;
27) echo "curl: Out of memory (memory allocation failed)" ;;
28) echo "curl: Operation timeout (network slow or server not responding)" ;; 28) echo "curl: Operation timeout (network slow or server not responding)" ;;
30) echo "curl: FTP port command failed" ;;
32) echo "curl: FTP SIZE command failed" ;;
33) echo "curl: HTTP range error" ;;
34) echo "curl: HTTP post error" ;;
35) echo "curl: SSL/TLS handshake failed (certificate error)" ;; 35) echo "curl: SSL/TLS handshake failed (certificate error)" ;;
36) echo "curl: FTP bad download resume" ;;
39) echo "curl: LDAP search failed" ;;
44) echo "curl: Internal error (bad function call order)" ;;
45) echo "curl: Interface error (failed to bind to specified interface)" ;;
46) echo "curl: Bad password entered" ;;
47) echo "curl: Too many redirects" ;;
48) echo "curl: Unknown command line option specified" ;;
51) echo "curl: SSL peer certificate or SSH host key verification failed" ;;
52) echo "curl: Empty reply from server (got nothing)" ;;
55) echo "curl: Failed sending network data" ;;
56) echo "curl: Receive error (connection reset by peer)" ;;
57) echo "curl: Unrecoverable poll/select error (system I/O failure)" ;;
59) echo "curl: Couldn't use specified SSL cipher" ;;
61) echo "curl: Bad/unrecognized transfer encoding" ;;
63) echo "curl: Maximum file size exceeded" ;;
75) echo "Temporary failure (retry later)" ;;
78) echo "curl: Remote file not found (404 on FTP/file)" ;;
79) echo "curl: SSH session error (key exchange/auth failed)" ;;
92) echo "curl: HTTP/2 stream error (protocol violation)" ;;
95) echo "curl: HTTP/3 layer error" ;;
64) echo "Usage error (wrong arguments)" ;;
65) echo "Data format error (bad input data)" ;;
66) echo "Input file not found (cannot open input)" ;;
67) echo "User not found (addressee unknown)" ;;
68) echo "Host not found (hostname unknown)" ;;
69) echo "Service unavailable" ;;
70) echo "Internal software error" ;;
71) echo "System error (OS-level failure)" ;;
72) echo "Critical OS file missing" ;;
73) echo "Cannot create output file" ;;
74) echo "I/O error" ;;
76) echo "Remote protocol error" ;;
77) echo "Permission denied" ;;
100) echo "APT: Package manager error (broken packages / dependency problems)" ;; 100) echo "APT: Package manager error (broken packages / dependency problems)" ;;
101) echo "APT: Configuration error (bad sources.list, malformed config)" ;; 101) echo "APT: Configuration error (bad sources.list, malformed config)" ;;
102) echo "APT: Lock held by another process (dpkg/apt still running)" ;; 102) echo "APT: Lock held by another process (dpkg/apt still running)" ;;
124) echo "Command timed out (timeout command)" ;; 124) echo "Command timed out (timeout command)" ;;
125) echo "Command failed to start (Docker daemon or execution error)" ;;
126) echo "Command invoked cannot execute (permission problem?)" ;; 126) echo "Command invoked cannot execute (permission problem?)" ;;
127) echo "Command not found" ;; 127) echo "Command not found" ;;
128) echo "Invalid argument to exit" ;; 128) echo "Invalid argument to exit" ;;
129) echo "Killed by SIGHUP (terminal closed / hangup)" ;; 129) echo "Killed by SIGHUP (terminal closed / hangup)" ;;
130) echo "Terminated by Ctrl+C (SIGINT)" ;; 130) echo "Aborted by user (SIGINT)" ;;
131) echo "Killed by SIGQUIT (core dumped)" ;;
132) echo "Killed by SIGILL (illegal CPU instruction)" ;;
134) echo "Process aborted (SIGABRT - possibly Node.js heap overflow)" ;; 134) echo "Process aborted (SIGABRT - possibly Node.js heap overflow)" ;;
137) echo "Killed (SIGKILL / Out of memory?)" ;; 137) echo "Killed (SIGKILL / Out of memory?)" ;;
139) echo "Segmentation fault (core dumped)" ;; 139) echo "Segmentation fault (core dumped)" ;;
141) echo "Broken pipe (SIGPIPE - output closed prematurely)" ;; 141) echo "Broken pipe (SIGPIPE - output closed prematurely)" ;;
143) echo "Terminated (SIGTERM)" ;; 143) echo "Terminated (SIGTERM)" ;;
144) echo "Killed by signal 16 (SIGUSR1 / SIGSTKFLT)" ;;
146) echo "Killed by signal 18 (SIGTSTP)" ;;
150) echo "Systemd: Service failed to start" ;; 150) echo "Systemd: Service failed to start" ;;
151) echo "Systemd: Service unit not found" ;; 151) echo "Systemd: Service unit not found" ;;
152) echo "Permission denied (EACCES)" ;; 152) echo "Permission denied (EACCES)" ;;
@@ -101,6 +155,7 @@ if ! declare -f explain_exit_code &>/dev/null; then
224) echo "Proxmox: PBS storage is for backups only" ;; 224) echo "Proxmox: PBS storage is for backups only" ;;
225) echo "Proxmox: No template available for OS/Version" ;; 225) echo "Proxmox: No template available for OS/Version" ;;
231) echo "Proxmox: LXC stack upgrade failed" ;; 231) echo "Proxmox: LXC stack upgrade failed" ;;
239) echo "npm/Node.js: Unexpected runtime error or dependency failure" ;;
243) echo "Node.js: Out of memory (JavaScript heap out of memory)" ;; 243) echo "Node.js: Out of memory (JavaScript heap out of memory)" ;;
245) echo "Node.js: Invalid command-line option" ;; 245) echo "Node.js: Invalid command-line option" ;;
246) echo "Node.js: Internal JavaScript Parse Error" ;; 246) echo "Node.js: Internal JavaScript Parse Error" ;;
@@ -149,6 +204,16 @@ error_handler() {
printf "\e[?25h" printf "\e[?25h"
# ALWAYS report failure to API immediately - don't wait for container checks
# This ensures we capture failures that occur before/after container exists
if declare -f post_update_to_api &>/dev/null; then
post_update_to_api "failed" "$exit_code" 2>/dev/null || true
else
# Container context: post_update_to_api not available (api.func not sourced)
# Send status directly via curl so container failures are never lost
_send_abort_telemetry "$exit_code" 2>/dev/null || true
fi
# Use msg_error if available, fallback to echo # Use msg_error if available, fallback to echo
if declare -f msg_error >/dev/null 2>&1; then if declare -f msg_error >/dev/null 2>&1; then
msg_error "in line ${line_number}: exit code ${exit_code} (${explanation}): while executing command ${command}" msg_error "in line ${line_number}: exit code ${exit_code} (${explanation}): while executing command ${command}"
@@ -175,55 +240,92 @@ error_handler() {
active_log="$SILENT_LOGFILE" active_log="$SILENT_LOGFILE"
fi fi
# If active_log points to a container-internal path that doesn't exist on host,
# fall back to BUILD_LOG (host-side log)
if [[ -n "$active_log" && ! -s "$active_log" && -n "${BUILD_LOG:-}" && -s "${BUILD_LOG}" ]]; then
active_log="$BUILD_LOG"
fi
# Show last log lines if available
if [[ -n "$active_log" && -s "$active_log" ]]; then if [[ -n "$active_log" && -s "$active_log" ]]; then
echo "--- Last 20 lines of silent log ---" echo -e "\n${TAB}--- Last 20 lines of log ---"
tail -n 20 "$active_log" tail -n 20 "$active_log"
echo "-----------------------------------" echo -e "${TAB}-----------------------------------\n"
fi
# Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG) # Detect context: Container (INSTALL_LOG set + inside container /root) vs Host
if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then if [[ -n "${INSTALL_LOG:-}" && -f "${INSTALL_LOG:-}" && -d /root ]]; then
# CONTAINER CONTEXT: Copy log and create flag file for host # CONTAINER CONTEXT: Copy log and create flag file for host
local container_log="/root/.install-${SESSION_ID:-error}.log" local container_log="/root/.install-${SESSION_ID:-error}.log"
cp "$active_log" "$container_log" 2>/dev/null || true cp "${INSTALL_LOG}" "$container_log" 2>/dev/null || true
# Create error flag file with exit code for host detection # Create error flag file with exit code for host detection
echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true
# Log path is shown by host as combined log - no need to show container path # Log path is shown by host as combined log - no need to show container path
else else
# HOST CONTEXT: Show local log path and offer container cleanup # HOST CONTEXT: Show local log path and offer container cleanup
if [[ -n "$active_log" && -s "$active_log" ]]; then
if declare -f msg_custom >/dev/null 2>&1; then if declare -f msg_custom >/dev/null 2>&1; then
msg_custom "📋" "${YW}" "Full log: ${active_log}" msg_custom "📋" "${YW}" "Full log: ${active_log}"
else else
echo -e "${YW}Full log:${CL} ${BL}${active_log}${CL}" echo -e "${YW}Full log:${CL} ${BL}${active_log}${CL}"
fi fi
fi
# Offer to remove container if it exists (build errors after container creation) # Offer to remove container if it exists (build errors after container creation)
if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then
# Report failure to API before container cleanup echo ""
if declare -f post_update_to_api &>/dev/null; then if declare -f msg_custom >/dev/null 2>&1; then
post_update_to_api "failed" "$exit_code" echo -en "${TAB}${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
fi else
echo ""
echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}" echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
fi
if read -t 60 -r response; then if read -t 60 -r response; then
if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
echo -e "\n${YW}Removing container ${CTID}${CL}" echo ""
pct stop "$CTID" &>/dev/null || true if declare -f msg_info >/dev/null 2>&1; then
pct destroy "$CTID" &>/dev/null || true msg_info "Removing container ${CTID}"
echo -e "${GN}${CL} Container ${CTID} removed" else
elif [[ "$response" =~ ^[Nn]$ ]]; then echo -e "${YW}Removing container ${CTID}${CL}"
echo -e "\n${YW}Container ${CTID} kept for debugging${CL}"
fi fi
else
# Timeout - auto-remove
echo -e "\n${YW}No response - auto-removing container${CL}"
pct stop "$CTID" &>/dev/null || true pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true
if declare -f msg_ok >/dev/null 2>&1; then
msg_ok "Container ${CTID} removed"
else
echo -e "${GN}${CL} Container ${CTID} removed"
fi
elif [[ "$response" =~ ^[Nn]$ ]]; then
echo ""
if declare -f msg_warn >/dev/null 2>&1; then
msg_warn "Container ${CTID} kept for debugging"
else
echo -e "${YW}Container ${CTID} kept for debugging${CL}"
fi
fi
else
# Timeout - auto-remove
echo ""
if declare -f msg_info >/dev/null 2>&1; then
msg_info "No response - removing container ${CTID}"
else
echo -e "${YW}No response - removing container ${CTID}${CL}"
fi
pct stop "$CTID" &>/dev/null || true
pct destroy "$CTID" &>/dev/null || true
if declare -f msg_ok >/dev/null 2>&1; then
msg_ok "Container ${CTID} removed"
else
echo -e "${GN}${CL} Container ${CTID} removed" echo -e "${GN}${CL} Container ${CTID} removed"
fi fi
fi fi
# Force one final status update attempt after cleanup
# This ensures status is updated even if the first attempt failed (e.g., HTTP 400)
if declare -f post_update_to_api &>/dev/null; then
post_update_to_api "failed" "$exit_code" "force"
fi
fi fi
fi fi
@@ -231,19 +333,97 @@ error_handler() {
} }
# ============================================================================== # ==============================================================================
# SECTION 3: SIGNAL HANDLERS # SECTION 3: TELEMETRY & CLEANUP HELPERS FOR SIGNAL HANDLERS
# ==============================================================================
# ------------------------------------------------------------------------------
# _send_abort_telemetry()
#
# - Sends failure/abort status to telemetry API
# - Works in BOTH host context (post_update_to_api available) and
# container context (only curl available, api.func not sourced)
# - Container context is critical: without this, container-side failures
# and signal exits are never reported, leaving records stuck in
# "installing" or "configuring" forever
# - Arguments: $1 = exit_code
# ------------------------------------------------------------------------------
_send_abort_telemetry() {
  # Report a "failed" status for the current run to the telemetry API.
  #
  # Arguments:
  #   $1 - exit code to report (defaults to 1 when missing or empty)
  # Globals read:
  #   DIAGNOSTICS, RANDOM_UUID, EXECUTION_ID, TELEMETRY_TYPE, TELEMETRY_URL,
  #   NSAPP, app
  # Returns:
  #   Always 0 - reporting is best-effort and must never fail the caller.
  local exit_code="${1:-1}"

  # Preferred path: delegate to post_update_to_api when it is defined.
  if declare -f post_update_to_api &>/dev/null; then
    post_update_to_api "failed" "$exit_code" 2>/dev/null || true
    return 0
  fi

  # Fallback path: talk to the endpoint directly with curl. Skip silently when
  # curl is missing, diagnostics are disabled, or there is no run identifier.
  if ! command -v curl &>/dev/null; then
    return 0
  fi
  if [[ "${DIAGNOSTICS:-no}" == "no" ]]; then
    return 0
  fi
  if [[ -z "${RANDOM_UUID:-}" ]]; then
    return 0
  fi

  # exit_code is emitted as a bare JSON number, so it has to be a plain decimal
  # integer: anything non-numeric falls back to 1, and leading zeros (not valid
  # in JSON numbers) are stripped by forcing base 10.
  if [[ "$exit_code" =~ ^[0-9]+$ ]]; then
    exit_code=$((10#$exit_code))
  else
    exit_code=1
  fi

  # -f: treat HTTP errors as failure, -sS: quiet but keep errors, -m 5: hard
  # 5 second cap so a dead endpoint cannot stall the exit path.
  curl -fsS -m 5 -X POST "${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}" \
    -H "Content-Type: application/json" \
    -d "{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}}" &>/dev/null || true
}
# ------------------------------------------------------------------------------
# _stop_container_if_installing()
#
# - Stops the LXC container if we're in the install phase
# - Prevents orphaned container processes when the host exits due to a signal
# (SSH disconnect, Ctrl+C, SIGTERM) — without this, the container keeps
# running and may send "configuring" status AFTER the host already sent
# "failed", leaving records permanently stuck in "configuring"
# - Only acts when:
# * CONTAINER_INSTALLING flag is set (during lxc-attach in build_container)
# * CTID is set (container was created)
# * pct command is available (we're on the Proxmox host, not inside a container)
# - Does NOT destroy the container — just stops it for potential debugging
# ------------------------------------------------------------------------------
_stop_container_if_installing() {
  # Stop (never destroy) the container identified by CTID, but only while the
  # CONTAINER_INSTALLING flag is "true". Always returns 0.

  # Nothing to do unless the install-phase flag is set and a container ID is known.
  if [[ "${CONTAINER_INSTALLING:-}" != "true" || -z "${CTID:-}" ]]; then
    return 0
  fi

  # Without the pct tool there is no way to stop anything from here.
  if ! command -v pct &>/dev/null; then
    return 0
  fi

  # Best effort: a failing stop must not abort the surrounding exit path.
  pct stop "$CTID" 2>/dev/null || true
}
# ==============================================================================
# SECTION 4: SIGNAL HANDLERS
# ============================================================================== # ==============================================================================
# ------------------------------------------------------------------------------
# on_exit()
#
# - EXIT trap handler — always runs on script termination (success or failure)
# - Catches orphaned "installing"/"configuring" records:
#   * If post_to_api sent "installing" but post_update_to_api never ran
#   * Reports final status to prevent records stuck forever
# - Best-effort log collection for failed installs
# - Stops orphaned container processes on failure
# - Cleans up lock files if the lockfile variable is set
# - Exits with the captured exit code
# ------------------------------------------------------------------------------
on_exit() {
  # EXIT trap handler: final telemetry, log collection, container stop and
  # lock-file cleanup, then exit with the status the script was ending with.
  #
  # Globals read: POST_TO_API_DONE, POST_UPDATE_DONE, lockfile
  #
  # Must stay the first statement: $? is overwritten by any later command.
  local exit_code=$?

  # A start was reported (POST_TO_API_DONE) but no final status followed
  # (POST_UPDATE_DONE): send one now so the record is not left open.
  if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
    if [[ $exit_code -ne 0 ]]; then
      _send_abort_telemetry "$exit_code"
    elif declare -f post_update_to_api >/dev/null 2>&1; then
      post_update_to_api "done" "0" 2>/dev/null || true
    fi
  fi

  # Best-effort log collection on failure; only when the helper is defined.
  if [[ $exit_code -ne 0 ]] && declare -f ensure_log_on_host >/dev/null 2>&1; then
    ensure_log_on_host 2>/dev/null || true
  fi

  # On failure, stop a container that is still in its install phase.
  if [[ $exit_code -ne 0 ]]; then
    _stop_container_if_installing
  fi

  # Remove the lock file if one was registered and still exists.
  if [[ -n "${lockfile:-}" && -e "$lockfile" ]]; then
    rm -f -- "$lockfile"
  fi

  exit "$exit_code"
}
@@ -252,14 +432,17 @@ on_exit() {
# on_interrupt()
#
# - SIGINT (Ctrl+C) trap handler
# - Reports status FIRST (time-critical: container may be dying)
# - Stops orphaned container to prevent "configuring" ghost records
# - Displays "Interrupted by user" message
# - Exits with code 130 (128 + SIGINT=2)
# ------------------------------------------------------------------------------
on_interrupt() {
  # SIGINT handler: report the abort, stop an installing container, tell the
  # user, then exit with 130 (128 + signal number 2).

  # Telemetry and container stop come before any output.
  _send_abort_telemetry "130"
  _stop_container_if_installing

  # Message output is best-effort; a failed write must not prevent the exit.
  if declare -f msg_error >/dev/null 2>&1; then
    msg_error "Interrupted by user (SIGINT)" 2>/dev/null || true
  else
    # Colour variables may be unset; default them so `set -u` cannot abort here.
    echo -e "\n${RD:-}Interrupted by user (SIGINT)${CL:-}" 2>/dev/null || true
  fi

  exit 130
}
@@ -268,21 +451,40 @@ on_interrupt() {
# on_terminate()
#
# - SIGTERM trap handler
# - Triggered by external process termination
# - Reports status FIRST (time-critical: process being killed)
# - Stops orphaned container to prevent "configuring" ghost records
# - Displays "Terminated by signal" message
# - Exits with code 143 (128 + SIGTERM=15)
# ------------------------------------------------------------------------------
on_terminate() {
  # SIGTERM handler: report the abort, stop an installing container, tell the
  # user, then exit with 143 (128 + signal number 15).

  # Telemetry and container stop come before any output.
  _send_abort_telemetry "143"
  _stop_container_if_installing

  # Message output is best-effort; a failed write must not prevent the exit.
  if declare -f msg_error >/dev/null 2>&1; then
    msg_error "Terminated by signal (SIGTERM)" 2>/dev/null || true
  else
    # Colour variables may be unset; default them so `set -u` cannot abort here.
    echo -e "\n${RD:-}Terminated by signal (SIGTERM)${CL:-}" 2>/dev/null || true
  fi

  exit 143
}
# ------------------------------------------------------------------------------
# on_hangup()
#
# - SIGHUP trap handler (SSH disconnect, terminal closed)
# - CRITICAL: This was previously MISSING from catch_errors(), causing
# container processes to become orphans on SSH disconnect — the #1 cause
# of records stuck in "installing" and "configuring" states
# - Reports status via direct curl (terminal is already closed, no output)
# - Stops orphaned container to prevent ghost records
# - Exits with code 129 (128 + SIGHUP=1)
# ------------------------------------------------------------------------------
on_hangup() {
  # SIGHUP handler: report the abort, stop an installing container, and exit
  # with 129 (128 + signal number 1). Prints no message of its own.
  local -r hangup_status=129

  _send_abort_telemetry "$hangup_status"
  _stop_container_if_installing

  exit "$hangup_status"
}
# ============================================================================== # ==============================================================================
# SECTION 4: INITIALIZATION # SECTION 5: INITIALIZATION
# ============================================================================== # ==============================================================================
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
@@ -294,10 +496,11 @@ on_terminate() {
# * set -o pipefail: Pipeline fails if any command fails # * set -o pipefail: Pipeline fails if any command fails
# * set -u: (optional) Exit on undefined variable (if STRICT_UNSET=1) # * set -u: (optional) Exit on undefined variable (if STRICT_UNSET=1)
# - Sets up traps: # - Sets up traps:
# * ERR → error_handler # * ERR → error_handler (script errors)
# * EXIT → on_exit # * EXIT → on_exit (any termination — cleanup + orphan detection)
# * INT → on_interrupt # * INT → on_interrupt (Ctrl+C)
# * TERM → on_terminate # * TERM → on_terminate (kill / systemd stop)
# * HUP → on_hangup (SSH disconnect / terminal closed)
# - Call this function early in every script # - Call this function early in every script
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
catch_errors() { catch_errors() {
@@ -310,4 +513,5 @@ catch_errors() {
trap on_exit EXIT trap on_exit EXIT
trap on_interrupt INT trap on_interrupt INT
trap on_terminate TERM trap on_terminate TERM
trap on_hangup HUP
} }