From 81e39c06801ceeecdc8e33fdca9fc589577772d7 Mon Sep 17 00:00:00 2001
From: "CanbiZ (MickLesk)" <47820557+MickLesk@users.noreply.github.com>
Date: Tue, 24 Feb 2026 09:36:33 +0100
Subject: [PATCH] merge error_handler

---
 misc/error_handler.func | 302 +++++++++++++++++++++++++++++++++-------
 1 file changed, 253 insertions(+), 49 deletions(-)

diff --git a/misc/error_handler.func b/misc/error_handler.func
index bd6d9892..cea4639a 100644
--- a/misc/error_handler.func
+++ b/misc/error_handler.func
@@ -37,25 +37,79 @@ if ! declare -f explain_exit_code &>/dev/null; then
     case "$code" in
     1) echo "General error / Operation not permitted" ;;
     2) echo "Misuse of shell builtins (e.g. syntax error)" ;;
+    3) echo "General syntax or argument error" ;;
+    10) echo "Docker / privileged mode required (unsupported environment)" ;;
+    4) echo "curl: Feature not supported or protocol error" ;;
+    5) echo "curl: Could not resolve proxy" ;;
     6) echo "curl: DNS resolution failed (could not resolve host)" ;;
     7) echo "curl: Failed to connect (network unreachable / host down)" ;;
+    8) echo "curl: Server reply error (FTP/SFTP or apk untrusted key)" ;;
+    16) echo "curl: HTTP/2 framing layer error" ;;
+    18) echo "curl: Partial file (transfer not completed)" ;;
     22) echo "curl: HTTP error returned (404, 429, 500+)" ;;
+    23) echo "curl: Write error (disk full or permissions)" ;;
+    24) echo "curl: Write to local file failed" ;;
+    25) echo "curl: Upload failed" ;;
+    26) echo "curl: Read error on local file (I/O)" ;;
+    27) echo "curl: Out of memory (memory allocation failed)" ;;
     28) echo "curl: Operation timeout (network slow or server not responding)" ;;
+    30) echo "curl: FTP port command failed" ;;
+    32) echo "curl: FTP SIZE command failed" ;;
+    33) echo "curl: HTTP range error" ;;
+    34) echo "curl: HTTP post error" ;;
     35) echo "curl: SSL/TLS handshake failed (certificate error)" ;;
+    36) echo "curl: FTP bad download resume" ;;
+    39) echo "curl: LDAP search failed" ;;
+    44) echo "curl: Internal error (bad function call order)" ;;
+    45) echo "curl: Interface error (failed to bind to specified interface)" ;;
+    46) echo "curl: Bad password entered" ;;
+    47) echo "curl: Too many redirects" ;;
+    48) echo "curl: Unknown command line option specified" ;;
+    51) echo "curl: SSL peer certificate or SSH host key verification failed" ;;
+    52) echo "curl: Empty reply from server (got nothing)" ;;
+    55) echo "curl: Failed sending network data" ;;
+    56) echo "curl: Receive error (connection reset by peer)" ;;
+    57) echo "curl: Unrecoverable poll/select error (system I/O failure)" ;;
+    59) echo "curl: Couldn't use specified SSL cipher" ;;
+    61) echo "curl: Bad/unrecognized transfer encoding" ;;
+    63) echo "curl: Maximum file size exceeded" ;;
+    75) echo "Temporary failure (retry later)" ;;
+    78) echo "curl: Remote file not found (404 on FTP/file)" ;;
+    79) echo "curl: SSH session error (key exchange/auth failed)" ;;
+    92) echo "curl: HTTP/2 stream error (protocol violation)" ;;
+    95) echo "curl: HTTP/3 layer error" ;;
+    64) echo "Usage error (wrong arguments)" ;;
+    65) echo "Data format error (bad input data)" ;;
+    66) echo "Input file not found (cannot open input)" ;;
+    67) echo "User not found (addressee unknown)" ;;
+    68) echo "Host not found (hostname unknown)" ;;
+    69) echo "Service unavailable" ;;
+    70) echo "Internal software error" ;;
+    71) echo "System error (OS-level failure)" ;;
+    72) echo "Critical OS file missing" ;;
+    73) echo "Cannot create output file" ;;
+    74) echo "I/O error" ;;
+    76) echo "Remote protocol error" ;;
+    77) echo "Permission denied" ;;
     100) echo "APT: Package manager error (broken packages / dependency problems)" ;;
     101) echo "APT: Configuration error (bad sources.list, malformed config)" ;;
     102) echo "APT: Lock held by another process (dpkg/apt still running)" ;;
     124) echo "Command timed out (timeout command)" ;;
+    125) echo "Command failed to start (Docker daemon or execution error)" ;;
     126) echo "Command invoked cannot execute (permission problem?)" ;;
     127) echo "Command not found" ;;
     128) echo "Invalid argument to exit" ;;
     129) echo "Killed by SIGHUP (terminal closed / hangup)" ;;
-    130) echo "Terminated by Ctrl+C (SIGINT)" ;;
+    130) echo "Aborted by user (SIGINT)" ;;
+    131) echo "Killed by SIGQUIT (core dumped)" ;;
+    132) echo "Killed by SIGILL (illegal CPU instruction)" ;;
     134) echo "Process aborted (SIGABRT - possibly Node.js heap overflow)" ;;
     137) echo "Killed (SIGKILL / Out of memory?)" ;;
     139) echo "Segmentation fault (core dumped)" ;;
     141) echo "Broken pipe (SIGPIPE - output closed prematurely)" ;;
     143) echo "Terminated (SIGTERM)" ;;
+    144) echo "Killed by signal 16 (SIGUSR1 / SIGSTKFLT)" ;;
+    146) echo "Killed by signal 18 (SIGTSTP)" ;;
     150) echo "Systemd: Service failed to start" ;;
     151) echo "Systemd: Service unit not found" ;;
     152) echo "Permission denied (EACCES)" ;;
@@ -101,6 +155,7 @@ if ! declare -f explain_exit_code &>/dev/null; then
     224) echo "Proxmox: PBS storage is for backups only" ;;
     225) echo "Proxmox: No template available for OS/Version" ;;
     231) echo "Proxmox: LXC stack upgrade failed" ;;
+    239) echo "npm/Node.js: Unexpected runtime error or dependency failure" ;;
     243) echo "Node.js: Out of memory (JavaScript heap out of memory)" ;;
     245) echo "Node.js: Invalid command-line option" ;;
     246) echo "Node.js: Internal JavaScript Parse Error" ;;
@@ -149,6 +204,16 @@ error_handler() {
 
   printf "\e[?25h"
 
+  # ALWAYS report failure to API immediately - don't wait for container checks
+  # This ensures we capture failures that occur before/after container exists
+  if declare -f post_update_to_api &>/dev/null; then
+    post_update_to_api "failed" "$exit_code" 2>/dev/null || true
+  else
+    # Container context: post_update_to_api not available (api.func not sourced)
+    # Send status directly via curl so container failures are never lost
+    _send_abort_telemetry "$exit_code" 2>/dev/null || true
+  fi
+
   # Use msg_error if available, fallback to echo
   if declare -f msg_error >/dev/null 2>&1; then
     msg_error "in line ${line_number}: exit code ${exit_code} (${explanation}): while executing command ${command}"
@@ -175,55 +240,92 @@ error_handler() {
     active_log="$SILENT_LOGFILE"
   fi
 
+  # If active_log is missing or empty here (e.g. a container-internal path seen
+  # from the host), fall back to BUILD_LOG (host-side log) when that has content
+  if [[ -n "$active_log" && ! -s "$active_log" && -n "${BUILD_LOG:-}" && -s "${BUILD_LOG}" ]]; then
+    active_log="$BUILD_LOG"
+  fi
+
+  # Show last log lines if available
   if [[ -n "$active_log" && -s "$active_log" ]]; then
-    echo "--- Last 20 lines of silent log ---"
+    echo -e "\n${TAB}--- Last 20 lines of log ---"
     tail -n 20 "$active_log"
-    echo "-----------------------------------"
+    echo -e "${TAB}-----------------------------------\n"
+  fi
 
-    # Detect context: Container (INSTALL_LOG set + /root exists) vs Host (BUILD_LOG)
-    if [[ -n "${INSTALL_LOG:-}" && -d /root ]]; then
-      # CONTAINER CONTEXT: Copy log and create flag file for host
-      local container_log="/root/.install-${SESSION_ID:-error}.log"
-      cp "$active_log" "$container_log" 2>/dev/null || true
+  # Detect context: Container (INSTALL_LOG set, is an existing file, /root exists) vs Host
+  if [[ -n "${INSTALL_LOG:-}" && -f "${INSTALL_LOG:-}" && -d /root ]]; then
+    # CONTAINER CONTEXT: Copy log and create flag file for host
+    local container_log="/root/.install-${SESSION_ID:-error}.log"
+    cp "${INSTALL_LOG}" "$container_log" 2>/dev/null || true
 
-      # Create error flag file with exit code for host detection
-      echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true
-      # Log path is shown by host as combined log - no need to show container path
-    else
-      # HOST CONTEXT: Show local log path and offer container cleanup
+    # Create error flag file with exit code for host detection
+    echo "$exit_code" >"/root/.install-${SESSION_ID:-error}.failed" 2>/dev/null || true
+    # Log path is shown by host as combined log - no need to show container path
+  else
+    # HOST CONTEXT: Show local log path and offer container cleanup
+    if [[ -n "$active_log" && -s "$active_log" ]]; then
       if declare -f msg_custom >/dev/null 2>&1; then
         msg_custom "📋" "${YW}" "Full log: ${active_log}"
       else
         echo -e "${YW}Full log:${CL} ${BL}${active_log}${CL}"
       fi
+    fi
 
-      # Offer to remove container if it exists (build errors after container creation)
-      if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then
-        # Report failure to API before container cleanup
-        if declare -f post_update_to_api &>/dev/null; then
-          post_update_to_api "failed" "$exit_code"
-        fi
-
-        echo ""
+    # Offer to remove container if it exists (build errors after container creation)
+    if [[ -n "${CTID:-}" ]] && command -v pct &>/dev/null && pct status "$CTID" &>/dev/null; then
+      echo ""
+      if declare -f msg_custom >/dev/null 2>&1; then
+        echo -en "${TAB}❓${TAB}${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
+      else
         echo -en "${YW}Remove broken container ${CTID}? (Y/n) [auto-remove in 60s]: ${CL}"
+      fi
 
-        if read -t 60 -r response; then
-          if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
-            echo -e "\n${YW}Removing container ${CTID}${CL}"
-            pct stop "$CTID" &>/dev/null || true
-            pct destroy "$CTID" &>/dev/null || true
-            echo -e "${GN}✔${CL} Container ${CTID} removed"
-          elif [[ "$response" =~ ^[Nn]$ ]]; then
-            echo -e "\n${YW}Container ${CTID} kept for debugging${CL}"
+      if read -t 60 -r response; then
+        if [[ -z "$response" || "$response" =~ ^[Yy]$ ]]; then
+          echo ""
+          if declare -f msg_info >/dev/null 2>&1; then
+            msg_info "Removing container ${CTID}"
+          else
+            echo -e "${YW}Removing container ${CTID}${CL}"
           fi
-        else
-          # Timeout - auto-remove
-          echo -e "\n${YW}No response - auto-removing container${CL}"
           pct stop "$CTID" &>/dev/null || true
           pct destroy "$CTID" &>/dev/null || true
+          if declare -f msg_ok >/dev/null 2>&1; then
+            msg_ok "Container ${CTID} removed"
+          else
+            echo -e "${GN}✔${CL} Container ${CTID} removed"
+          fi
+        elif [[ "$response" =~ ^[Nn]$ ]]; then
+          echo ""
+          if declare -f msg_warn >/dev/null 2>&1; then
+            msg_warn "Container ${CTID} kept for debugging"
+          else
+            echo -e "${YW}Container ${CTID} kept for debugging${CL}"
+          fi
+        fi
+      else
+        # Timeout - auto-remove
+        echo ""
+        if declare -f msg_info >/dev/null 2>&1; then
+          msg_info "No response - removing container ${CTID}"
+        else
+          echo -e "${YW}No response - removing container ${CTID}${CL}"
+        fi
+        pct stop "$CTID" &>/dev/null || true
+        pct destroy "$CTID" &>/dev/null || true
+        if declare -f msg_ok >/dev/null 2>&1; then
+          msg_ok "Container ${CTID} removed"
+        else
           echo -e "${GN}✔${CL} Container ${CTID} removed"
         fi
       fi
+
+      # Force one final status update attempt after cleanup
+      # This ensures status is updated even if the first attempt failed (e.g., HTTP 400)
+      if declare -f post_update_to_api &>/dev/null; then
+        post_update_to_api "failed" "$exit_code" "force"
+      fi
     fi
   fi
 
@@ -231,19 +333,97 @@ error_handler() {
 }
 
 # ==============================================================================
-# SECTION 3: SIGNAL HANDLERS
+# SECTION 3: TELEMETRY & CLEANUP HELPERS FOR SIGNAL HANDLERS
+# ==============================================================================
+
+# ------------------------------------------------------------------------------
+# _send_abort_telemetry()
+#
+# - Sends failure/abort status to telemetry API so records do not stay stuck
+#   in "installing"/"configuring" after a failure or signal exit
+# - Host context: delegates to post_update_to_api when that function is defined
+# - Container context (api.func not sourced): posts directly with curl; skipped
+#   when curl is missing, DIAGNOSTICS is "no"/unset or RANDOM_UUID is empty
+# - Best-effort: failures of the report itself are ignored
+# - Arguments: $1 = exit_code (default 1; non-numeric values are replaced by 1)
+# ------------------------------------------------------------------------------
+_send_abort_telemetry() {
+  local exit_code="${1:-1}"
+  [[ "$exit_code" =~ ^[0-9]+$ ]] || exit_code=1 # goes unquoted into the JSON below
+  # Try full API function first (host context - api.func sourced)
+  if declare -f post_update_to_api &>/dev/null; then
+    post_update_to_api "failed" "$exit_code" 2>/dev/null || true
+    return
+  fi
+  # Fallback: direct curl (container context - api.func NOT sourced)
+  command -v curl &>/dev/null || return 0
+  [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0
+  [[ -z "${RANDOM_UUID:-}" ]] && return 0
+  curl -fsS -m 5 -X POST "${TELEMETRY_URL:-https://telemetry.community-scripts.org/telemetry}" \
+    -H "Content-Type: application/json" \
+    -d "{\"random_id\":\"${RANDOM_UUID}\",\"execution_id\":\"${EXECUTION_ID:-${RANDOM_UUID}}\",\"type\":\"${TELEMETRY_TYPE:-lxc}\",\"nsapp\":\"${NSAPP:-${app:-unknown}}\",\"status\":\"failed\",\"exit_code\":${exit_code}}" &>/dev/null || true
+}
+
+# ------------------------------------------------------------------------------
+# _stop_container_if_installing()
+#
+# - Stops the LXC container if we're in the install phase
+# - Prevents orphaned container processes when the host exits due to a signal
+#   (SSH disconnect, Ctrl+C, SIGTERM) — without this, the container keeps
+#   running and may send "configuring" status AFTER the host already sent
+#   "failed", leaving records permanently stuck in "configuring"
+# - Only acts when:
+#   * CONTAINER_INSTALLING flag is set (during lxc-attach in build_container)
+#   * CTID is set (container was created)
+#   * pct command is available (we're on the Proxmox host, not inside a container)
+# - Does NOT destroy the container — just stops it for potential debugging
+# ------------------------------------------------------------------------------
+_stop_container_if_installing() {
+  [[ "${CONTAINER_INSTALLING:-}" == "true" ]] || return 0
+  [[ -n "${CTID:-}" ]] || return 0
+  command -v pct &>/dev/null || return 0
+  pct stop "$CTID" 2>/dev/null || true
+}
+
+# ==============================================================================
+# SECTION 4: SIGNAL HANDLERS
 # ==============================================================================
 
 # ------------------------------------------------------------------------------
 # on_exit()
 #
-# - EXIT trap handler
-# - Cleans up lock files if lockfile variable is set
-# - Exits with captured exit code
-# - Always runs on script termination (success or failure)
+# - EXIT trap handler — runs on EVERY script termination
+# - Catches orphaned "installing"/"configuring" records:
+#   * If post_to_api sent "installing" but post_update_to_api never ran
+#   * Reports final status to prevent records stuck forever
+# - Best-effort log collection for failed installs
+# - Stops orphaned container processes on failure
+# - Cleans up lock files
 # ------------------------------------------------------------------------------
 on_exit() {
   local exit_code=$?
+
+  # Close out a record that was opened but never finalized
+  # (POST_TO_API_DONE=true while POST_UPDATE_DONE is not): a non-zero exit is
+  # reported as failed, a clean exit as "done" if post_update_to_api is defined
+  if [[ "${POST_TO_API_DONE:-}" == "true" && "${POST_UPDATE_DONE:-}" != "true" ]]; then
+    if [[ $exit_code -ne 0 ]]; then
+      _send_abort_telemetry "$exit_code"
+    elif declare -f post_update_to_api >/dev/null 2>&1; then
+      post_update_to_api "done" "0" 2>/dev/null || true
+    fi
+  fi
+
+  # On failure, call ensure_log_on_host if it is defined; its errors are ignored
+  if [[ $exit_code -ne 0 ]] && declare -f ensure_log_on_host >/dev/null 2>&1; then
+    ensure_log_on_host 2>/dev/null || true
+  fi
+
+  # On failure, stop the container if the install phase was still running
+  if [[ $exit_code -ne 0 ]]; then
+    _stop_container_if_installing
+  fi
+
   [[ -n "${lockfile:-}" && -e "$lockfile" ]] && rm -f "$lockfile"
   exit "$exit_code"
 }
@@ -252,14 +432,17 @@ on_exit() {
 # on_interrupt()
 #
 # - SIGINT (Ctrl+C) trap handler
-# - Displays "Interrupted by user" message
+# - Reports status FIRST (time-critical: container may be dying)
+# - Stops orphaned container to prevent "configuring" ghost records
 # - Exits with code 130 (128 + SIGINT=2)
 # ------------------------------------------------------------------------------
 on_interrupt() {
+  _send_abort_telemetry "130" # report before printing anything
+  _stop_container_if_installing # no-op unless CONTAINER_INSTALLING=true, CTID set, pct present
   if declare -f msg_error >/dev/null 2>&1; then
-    msg_error "Interrupted by user (SIGINT)"
+    msg_error "Interrupted by user (SIGINT)" 2>/dev/null || true # message is best-effort
   else
-    echo -e "\n${RD}Interrupted by user (SIGINT)${CL}"
+    echo -e "\n${RD}Interrupted by user (SIGINT)${CL}" 2>/dev/null || true
   fi
   exit 130
 }
@@ -268,21 +451,40 @@ on_interrupt() {
 # on_terminate()
 #
 # - SIGTERM trap handler
-# - Displays "Terminated by signal" message
+# - Reports status FIRST (time-critical: process being killed)
+# - Stops orphaned container to prevent "configuring" ghost records
 # - Exits with code 143 (128 + SIGTERM=15)
-# - Triggered by external process termination
 # ------------------------------------------------------------------------------
 on_terminate() {
+  _send_abort_telemetry "143" # report before printing anything
+  _stop_container_if_installing # no-op unless CONTAINER_INSTALLING=true, CTID set, pct present
   if declare -f msg_error >/dev/null 2>&1; then
-    msg_error "Terminated by signal (SIGTERM)"
+    msg_error "Terminated by signal (SIGTERM)" 2>/dev/null || true # message is best-effort
   else
-    echo -e "\n${RD}Terminated by signal (SIGTERM)${CL}"
+    echo -e "\n${RD}Terminated by signal (SIGTERM)${CL}" 2>/dev/null || true
   fi
   exit 143
 }
 
+# ------------------------------------------------------------------------------
+# on_hangup()
+#
+# - SIGHUP trap handler (SSH disconnect, terminal closed)
+# - Added because a missing HUP trap left container processes orphaned on
+#   SSH disconnect, with records stuck in "installing"/"configuring"
+# - Prints nothing; reports "failed" through _send_abort_telemetry
+#   (post_update_to_api if defined, otherwise direct curl)
+# - Stops the container if the install phase was still running, then exits 129
+# ------------------------------------------------------------------------------
+on_hangup() {
+  local -r hangup_status=129 # 128 + SIGHUP(1)
+  _send_abort_telemetry "$hangup_status"
+  _stop_container_if_installing
+  exit "$hangup_status"
+}
+
 # ==============================================================================
-# SECTION 4: INITIALIZATION
+# SECTION 5: INITIALIZATION
 # ==============================================================================
 
 # ------------------------------------------------------------------------------
@@ -294,10 +496,11 @@ on_terminate() {
 #   * set -o pipefail: Pipeline fails if any command fails
 #   * set -u: (optional) Exit on undefined variable (if STRICT_UNSET=1)
 # - Sets up traps:
-#   * ERR → error_handler
-#   * EXIT → on_exit
-#   * INT → on_interrupt
-#   * TERM → on_terminate
+#   * ERR  → error_handler (script errors)
+#   * EXIT → on_exit      (any termination — cleanup + orphan detection)
+#   * INT  → on_interrupt  (Ctrl+C)
+#   * TERM → on_terminate  (kill / systemd stop)
+#   * HUP  → on_hangup     (SSH disconnect / terminal closed)
 # - Call this function early in every script
 # ------------------------------------------------------------------------------
 catch_errors() {
@@ -310,4 +513,5 @@ catch_errors() {
   trap on_exit EXIT
   trap on_interrupt INT
   trap on_terminate TERM
+  trap on_hangup HUP
 }