From 81e39c06801ceeecdc8e33fdca9fc589577772d7 Mon Sep 17 00:00:00 2001 From: "CanbiZ (MickLesk)" <47820557+MickLesk@users.noreply.github.com> Date: Tue, 24 Feb 2026 09:36:33 +0100 Subject: [PATCH] merge error_handler --- misc/error_handler.func | 302 +++++++++++++++++++++++++++++++++------- 1 file changed, 253 insertions(+), 49 deletions(-) diff --git a/misc/error_handler.func b/misc/error_handler.func index bd6d9892..cea4639a 100644 --- a/misc/error_handler.func +++ b/misc/error_handler.func @@ -37,25 +37,79 @@ if ! declare -f explain_exit_code &>/dev/null; then case "$code" in 1) echo "General error / Operation not permitted" ;; 2) echo "Misuse of shell builtins (e.g. syntax error)" ;; + 3) echo "General syntax or argument error" ;; + 10) echo "Docker / privileged mode required (unsupported environment)" ;; + 4) echo "curl: Feature not supported or protocol error" ;; + 5) echo "curl: Could not resolve proxy" ;; 6) echo "curl: DNS resolution failed (could not resolve host)" ;; 7) echo "curl: Failed to connect (network unreachable / host down)" ;; + 8) echo "curl: Server reply error (FTP/SFTP or apk untrusted key)" ;; + 16) echo "curl: HTTP/2 framing layer error" ;; + 18) echo "curl: Partial file (transfer not completed)" ;; 22) echo "curl: HTTP error returned (404, 429, 500+)" ;; + 23) echo "curl: Write error (disk full or permissions)" ;; + 24) echo "curl: Write to local file failed" ;; + 25) echo "curl: Upload failed" ;; + 26) echo "curl: Read error on local file (I/O)" ;; + 27) echo "curl: Out of memory (memory allocation failed)" ;; 28) echo "curl: Operation timeout (network slow or server not responding)" ;; + 30) echo "curl: FTP port command failed" ;; + 32) echo "curl: FTP SIZE command failed" ;; + 33) echo "curl: HTTP range error" ;; + 34) echo "curl: HTTP post error" ;; 35) echo "curl: SSL/TLS handshake failed (certificate error)" ;; + 36) echo "curl: FTP bad download resume" ;; + 39) echo "curl: LDAP search failed" ;; + 44) echo "curl: Internal error (bad function call order)" ;; + 45) echo "curl: Interface error (failed to bind to specified interface)" ;; + 46) echo "curl: Bad password entered" ;; + 47) echo "curl: Too many redirects" ;; + 48) echo "curl: Unknown command line option specified" ;; + 51) echo "curl: SSL peer certificate or SSH host key verification failed" ;; + 52) echo "curl: Empty reply from server (got nothing)" ;; + 55) echo "curl: Failed sending network data" ;; + 56) echo "curl: Receive error (connection reset by peer)" ;; + 57) echo "curl: Unrecoverable poll/select error (system I/O failure)" ;; + 59) echo "curl: Couldn't use specified SSL cipher" ;; + 61) echo "curl: Bad/unrecognized transfer encoding" ;; + 63) echo "curl: Maximum file size exceeded" ;; + 75) echo "Temporary failure (retry later)" ;; + 78) echo "curl: Remote file not found (404 on FTP/file)" ;; + 79) echo "curl: SSH session error (key exchange/auth failed)" ;; + 92) echo "curl: HTTP/2 stream error (protocol violation)" ;; + 95) echo "curl: HTTP/3 layer error" ;; + 64) echo "Usage error (wrong arguments)" ;; + 65) echo "Data format error (bad input data)" ;; + 66) echo "Input file not found (cannot open input)" ;; + 67) echo "User not found (addressee unknown)" ;; + 68) echo "Host not found (hostname unknown)" ;; + 69) echo "Service unavailable" ;; + 70) echo "Internal software error" ;; + 71) echo "System error (OS-level failure)" ;; + 72) echo "Critical OS file missing" ;; + 73) echo "Cannot create output file" ;; + 74) echo "I/O error" ;; + 76) echo "Remote protocol error" ;; + 77) echo "Permission denied" ;; 100) echo "APT: Package manager error (broken packages / dependency problems)" ;; 101) echo "APT: Configuration error (bad sources.list, malformed config)" ;; 102) echo "APT: Lock held by another process (dpkg/apt still running)" ;; 124) echo "Command timed out (timeout command)" ;; + 125) echo "Command failed to start (Docker daemon or execution error)" ;; 126) echo "Command invoked cannot execute (permission problem?)" ;; 127) echo "Command not found" ;; 128) echo "Invalid argument to exit" ;; 129) echo "Killed by SIGHUP (terminal closed / hangup)" ;; - 130) echo "Terminated by Ctrl+C (SIGINT)" ;; + 130) echo "Aborted by user (SIGINT)" ;; + 131) echo "Killed by SIGQUIT (core dumped)" ;; + 132) echo "Killed by SIGILL (illegal CPU instruction)" ;; 134) echo "Process aborted (SIGABRT - possibly Node.js heap overflow)" ;; 137) echo "Killed (SIGKILL / Out of memory?)" ;; 139) echo "Segmentation fault (core dumped)" ;; 141) echo "Broken pipe (SIGPIPE - output closed prematurely)" ;; 143) echo "Terminated (SIGTERM)" ;; + 144) echo "Killed by signal 16 (SIGUSR1 / SIGSTKFLT)" ;; + 146) echo "Killed by signal 18 (SIGTSTP)" ;; 150) echo "Systemd: Service failed to start" ;; 151) echo "Systemd: Service unit not found" ;; 152) echo "Permission denied (EACCES)" ;; @@ -101,6 +155,7 @@ if ! declare -f explain_exit_code &>/dev/null; then 224) echo "Proxmox: PBS storage is for backups only" ;; 225) echo "Proxmox: No template available for OS/Version" ;; 231) echo "Proxmox: LXC stack upgrade failed" ;; + 239) echo "npm/Node.js: Unexpected runtime error or dependency failure" ;; 243) echo "Node.js: Out of memory (JavaScript heap out of memory)" ;; 245) echo "Node.js: Invalid command-line option" ;; 246) echo "Node.js: Internal JavaScript Parse Error" ;; @@ -149,6 +204,16 @@ error_handler() { printf "\e[?25h" + # ALWAYS report failure to API immediately - don't wait for container checks + # This ensures we capture failures that occur before/after container exists + if declare -f post_update_to_api &>/dev/null; then + post_update_to_api "failed" "$exit_code" 2>/dev/null || true + else + # Container context: post_update_to_api not available (api.func not sourced) + # Send status directly via curl so container failures are never lost + _send_abort_telemetry "$exit_code" 2>/dev/null || true + fi + # Use msg_error if available, fallback to echo if declare -f msg_error >/dev/null 2>&1; then msg_error "in line ${line_number}: exit code ${exit_code} (${explanation}): while executing command ${command}" @@ -175,55 +240,92 @@ error_handler() { active_log="$SILENT_LOGFILE" fi + # If active_log points to a container-internal path that doesn't exist on host, + # fall back to BUILD_LOG (host-side log) + if [[ -n "$active_log" && ! -s "$active_log" && -n "${BUILD_LOG:-}" && -s "${BUILD_LOG}" ]]; then + active_log="$BUILD_LOG" + fi + + # Show last log lines if available if [[ -n "$active_log" && -s "$active_log" ]]; then - echo "--- Last 20 lines of silent log ---" + echo -e "\n${TAB}--- Last 20 lines of log ---" tail -n 20 "$active_log" - echo "-----------------------------------" + echo -e "${TAB}-----------------------------------\n" + fi - # Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG) - if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then - # CONTAINER CONTEXT: Copy log and create flag file for host - local container_log="/root/.install-${SESSION_ID:-error}.log" - cp "$active_log" "$container_log" 2>/dev/null || true + # Detect context: Container (INSTALL_LOG set + inside container /root) vs Host + if [[ -n "${INSTALL_LOG:-}" && -f "${INSTALL_LOG:-}" && -d /root ]]; then + # CONTAINER CONTEXT: Copy log and create flag file for host + local container_log="/root/.install-${SESSION_ID:-error}.log" + cp "${INSTALL_LOG}" "$container_log" 2>/dev/null || true - # Create error flag file with exit code for host detection - echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true - # Log path is shown by host as combined log - no need to show container path - else - # HOST CONTEXT: Show local log path and offer container cleanup + # Create error flag file with exit code for host detection + echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true + # Log path is shown by host as combined log - no need to show container path + else + # HOST CONTEXT: Show local log path and offer container cleanup + if [[ -n "$active_log" && -s "$active_log" ]]; then if declare -f msg_custom >/dev/null 2>&1; then msg_custom "📋" "${YW}" "Full log: ${active_log}" else echo -e "${YW}Full log:${CL} ${BL}${active_log}${CL}" fi + fi - # Offer to remove container if it exists (build errors after container creation) - if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then - # Report failure to API before container cleanup - if declare -f post_update_to_api &>/dev/null; then - post_update_to_api "failed" "$exit_code" - fi - - echo "" + # Offer to remove container if it exists (build errors after container creation) + if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then + echo "" + if declare -f msg_custom >/dev/null 2>&1; then + echo -en "${TAB}❓${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}" + else echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}" + fi - if read -t 60 -r response; then - if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then - echo -e "\n${YW}Removing container ${CTID}${CL}" - pct stop "$CTID" &>/dev/null || true - pct destroy "$CTID" &>/dev/null || true - echo -e "${GN}✔${CL} Container ${CTID} removed" - elif [[ "$response" =~ ^[Nn]$ ]]; then - echo -e "\n${YW}Container ${CTID} kept for debugging${CL}" + if read -t 60 -r response; then + if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then + echo "" + if declare -f msg_info >/dev/null 2>&1; then + msg_info "Removing container ${CTID}" + else + echo -e "${YW}Removing container ${CTID}${CL}" fi - else - # Timeout - auto-remove - echo -e "\n${YW}No response - auto-removing container${CL}" pct stop "$CTID" &>/dev/null || true pct destroy "$CTID" &>/dev/null || true + if declare -f msg_ok >/dev/null 2>&1; then + msg_ok "Container ${CTID} removed" + else + echo -e "${GN}✔${CL} Container ${CTID} removed" + fi + elif [[ "$response" =~ ^[Nn]$ ]]; then + echo "" + if declare -f msg_warn >/dev/null 2>&1; then + msg_warn "Container ${CTID} kept for debugging" + else + echo -e "${YW}Container ${CTID} kept for debugging${CL}" + fi + fi + else + # Timeout - auto-remove + echo "" + if declare -f msg_info >/dev/null 2>&1; then + msg_info "No response - removing container ${CTID}" + else + echo -e "${YW}No response - removing container ${CTID}${CL}" + fi + pct stop "$CTID" &>/dev/null || true + pct destroy "$CTID" &>/dev/null || true + if declare -f msg_ok >/dev/null 2>&1; then + msg_ok "Container ${CTID} removed" + else echo -e "${GN}✔${CL} Container ${CTID} removed" fi fi + + # Force one final status update attempt after cleanup + # This ensures status is updated even if the first attempt failed (e.g., HTTP 400) + if declare -f post_update_to_api &>/dev/null; then + post_update_to_api "failed" "$exit_code" "force" + fi fi fi @@ -231,19 +333,97 @@ error_handler() { } # ============================================================================== -# SECTION 3: SIGNAL HANDLERS +# SECTION 3: TELEMETRY & CLEANUP HELPERS FOR SIGNAL HANDLERS +# ============================================================================== + +# ------------------------------------------------------------------------------ +# _send_abort_telemetry() +# +# - Sends failure/abort status to telemetry API +# - Works in BOTH host context (post_update_to_api available) and +# container context (only curl available, api.func not sourced) +# - Container context is critical: without this, container-side failures +# and signal exits are never reported, leaving records stuck in +# "installing" or "configuring" forever +# - Arguments: $1 = exit_code +# ------------------------------------------------------------------------------ +_send_abort_telemetry() { + local exit_code="${1:-1}" + # Try full API function first (host context - api.func sourced) + if declare -f post_update_to_api &>/dev/null; then + post_update_to_api "failed" "$exit_code" 2>/dev/null || true + return + fi + # Fallback: direct curl (container context - api.func NOT sourced) + # This is the ONLY way containers can report failures to telemetry + command -v curl &>/dev/null || return 0 + [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0 + [[ -z "${RANDOM_UUID:-}" ]] && return 0 + curl -fsS -m 5 -X POST "${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}" \ + -H "Content-Type: application/json" \ + -d "{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}}" &>/dev/null || true +} + +# ------------------------------------------------------------------------------ +# _stop_container_if_installing() +# +# - Stops the LXC container if we're in the install phase +# - Prevents orphaned container processes when the host exits due to a signal +# (SSH disconnect, Ctrl+C, SIGTERM) — without this, the container keeps +# running and may send "configuring" status AFTER the host already sent +# "failed", leaving records permanently stuck in "configuring" +# - Only acts when: +# * CONTAINER_INSTALLING flag is set (during lxc-attach in build_container) +# * CTID is set (container was created) +# * pct command is available (we're on the Proxmox host, not inside a container) +# - Does NOT destroy the container — just stops it for potential debugging +# ------------------------------------------------------------------------------ +_stop_container_if_installing() { + [[ "${CONTAINER_INSTALLING:-}" == "true" ]] || return 0 + [[ -n "${CTID:-}" ]] || return 0 + command -v pct &>/dev/null || return 0 + pct stop "$CTID" 2>/dev/null || true +} + +# ============================================================================== +# SECTION 4: SIGNAL HANDLERS # ============================================================================== # ------------------------------------------------------------------------------ # on_exit() # -# - EXIT trap handler -# - Cleans up lock files if lockfile variable is set -# - Exits with captured exit code -# - Always runs on script termination (success or failure) +# - EXIT trap handler — runs on EVERY script termination +# - Catches orphaned "installing"/"configuring" records: +# * If post_to_api sent "installing" but post_update_to_api never ran +# * Reports final status to prevent records stuck forever +# - Best-effort log collection for failed installs +# - Stops orphaned container processes on failure +# - Cleans up lock files # ------------------------------------------------------------------------------ on_exit() { local exit_code=$? + + # Report orphaned "installing" records to telemetry API + # Catches ALL exit paths: errors, signals, AND clean exits where + # post_to_api was called but post_update_to_api was never called + if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then + if [[ $exit_code -ne 0 ]]; then + _send_abort_telemetry "$exit_code" + elif declare -f post_update_to_api >/dev/null 2>&1; then + post_update_to_api "done" "0" 2>/dev/null || true + fi + fi + + # Best-effort log collection on failure (non-critical, telemetry already sent) + if [[ $exit_code -ne 0 ]] && declare -f ensure_log_on_host >/dev/null 2>&1; then + ensure_log_on_host 2>/dev/null || true + fi + + # Stop orphaned container if we're in the install phase and exiting with error + if [[ $exit_code -ne 0 ]]; then + _stop_container_if_installing + fi + [[ -n "${lockfile:-}" && -e "$lockfile" ]] && rm -f "$lockfile" exit "$exit_code" } @@ -252,14 +432,17 @@ on_exit() { # on_interrupt() # # - SIGINT (Ctrl+C) trap handler -# - Displays "Interrupted by user" message +# - Reports status FIRST (time-critical: container may be dying) +# - Stops orphaned container to prevent "configuring" ghost records # - Exits with code 130 (128 + SIGINT=2) # ------------------------------------------------------------------------------ on_interrupt() { + _send_abort_telemetry "130" + _stop_container_if_installing if declare -f msg_error >/dev/null 2>&1; then - msg_error "Interrupted by user (SIGINT)" + msg_error "Interrupted by user (SIGINT)" 2>/dev/null || true else - echo -e "\n${RD}Interrupted by user (SIGINT)${CL}" + echo -e "\n${RD}Interrupted by user (SIGINT)${CL}" 2>/dev/null || true fi exit 130 } @@ -268,21 +451,40 @@ on_interrupt() { # on_terminate() # # - SIGTERM trap handler -# - Displays "Terminated by signal" message +# - Reports status FIRST (time-critical: process being killed) +# - Stops orphaned container to prevent "configuring" ghost records # - Exits with code 143 (128 + SIGTERM=15) -# - Triggered by external process termination # ------------------------------------------------------------------------------ on_terminate() { + _send_abort_telemetry "143" + _stop_container_if_installing if declare -f msg_error >/dev/null 2>&1; then - msg_error "Terminated by signal (SIGTERM)" + msg_error "Terminated by signal (SIGTERM)" 2>/dev/null || true else - echo -e "\n${RD}Terminated by signal (SIGTERM)${CL}" + echo -e "\n${RD}Terminated by signal (SIGTERM)${CL}" 2>/dev/null || true fi exit 143 } +# ------------------------------------------------------------------------------ +# on_hangup() +# +# - SIGHUP trap handler (SSH disconnect, terminal closed) +# - CRITICAL: This was previously MISSING from catch_errors(), causing +# container processes to become orphans on SSH disconnect — the #1 cause +# of records stuck in "installing" and "configuring" states +# - Reports status via direct curl (terminal is already closed, no output) +# - Stops orphaned container to prevent ghost records +# - Exits with code 129 (128 + SIGHUP=1) +# ------------------------------------------------------------------------------ +on_hangup() { + _send_abort_telemetry "129" + _stop_container_if_installing + exit 129 +} + # ============================================================================== -# SECTION 4: INITIALIZATION +# SECTION 5: INITIALIZATION # ============================================================================== # ------------------------------------------------------------------------------ @@ -294,10 +496,11 @@ on_terminate() { # * set -o pipefail: Pipeline fails if any command fails # * set -u: (optional) Exit on undefined variable (if STRICT_UNSET=1) # - Sets up traps: -# * ERR → error_handler -# * EXIT → on_exit -# * INT → on_interrupt -# * TERM → on_terminate +# * ERR → error_handler (script errors) +# * EXIT → on_exit (any termination — cleanup + orphan detection) +# * INT → on_interrupt (Ctrl+C) +# * TERM → on_terminate (kill / systemd stop) +# * HUP → on_hangup (SSH disconnect / terminal closed) # - Call this function early in every script # ------------------------------------------------------------------------------ catch_errors() { @@ -310,4 +513,5 @@ catch_errors() { trap on_exit EXIT trap on_interrupt INT trap on_terminate TERM + trap on_hangup HUP }