From 887a899f2448f851d6405568d580c3ebcf57069f Mon Sep 17 00:00:00 2001 From: "CanbiZ (MickLesk)" <47820557+MickLesk@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:14:45 +0100 Subject: [PATCH] Extend telemetry reporting and ingestion Add extended telemetry functions and server-side support: misc/api.func gains helpers (categorize_error, install timer, detect_gpu) and new reporters for tools, addons, and an extended post_update_to_api with duration, GPU and error_category. misc/data/service.go updated to accept and validate new fields (type: tool/addon, tool_name, parent_ct, gpu_vendor, gpu_passthrough, install_duration, error_category), expand allowed enums, include new fields in UpsertTelemetry and mapping, and add input sanitization. Also add telemetry-ingest.exe binary. These changes enable richer telemetry (tool/addon events, GPU info, durations and categorized errors) and server ingestion/validation for them. --- misc/api.func | 316 +++++++++++++++++++++++++++++++++++++++++ misc/data/dashboard.go | 39 +++++ misc/data/service.go | 132 +++++++++++++---- 3 files changed, 460 insertions(+), 27 deletions(-) diff --git a/misc/api.func b/misc/api.func index 1d3d1bcd7..a1f6802dd 100644 --- a/misc/api.func +++ b/misc/api.func @@ -407,3 +407,319 @@ EOF POST_UPDATE_DONE=true } + +# ============================================================================== +# SECTION 3: EXTENDED TELEMETRY FUNCTIONS +# ============================================================================== + +# ------------------------------------------------------------------------------ +# categorize_error() +# +# - Maps exit codes to error categories for better analytics +# - Categories: network, storage, dependency, permission, timeout, config, resource, unknown +# - Used to group errors in dashboard +# ------------------------------------------------------------------------------ +categorize_error() { + local code="$1" + case "$code" in + # Network errors + 6|7|22|28|35) echo "network" 
;; + + # Storage errors + 214|217|219) echo "storage" ;; + + # Dependency/Package errors + 100|101|102|127|160|161|162) echo "dependency" ;; + + # Permission errors + 126|152) echo "permission" ;; + + # Timeout errors + 124|28|211) echo "timeout" ;; + + # Configuration errors + 203|204|205|206|207|208) echo "config" ;; + + # Resource errors (OOM, etc) + 137|134) echo "resource" ;; + + # Default + *) echo "unknown" ;; + esac +} + +# ------------------------------------------------------------------------------ +# start_install_timer() +# +# - Captures start time for installation duration tracking +# - Call at the beginning of installation +# - Sets INSTALL_START_TIME global variable +# ------------------------------------------------------------------------------ +start_install_timer() { + INSTALL_START_TIME=$(date +%s) + export INSTALL_START_TIME +} + +# ------------------------------------------------------------------------------ +# get_install_duration() +# +# - Returns elapsed seconds since start_install_timer() was called +# - Returns 0 if timer was not started +# ------------------------------------------------------------------------------ +get_install_duration() { + if [[ -z "${INSTALL_START_TIME:-}" ]]; then + echo "0" + return + fi + local now=$(date +%s) + echo $((now - INSTALL_START_TIME)) +} + +# ------------------------------------------------------------------------------ +# detect_gpu() +# +# - Detects GPU vendor and passthrough type +# - Sets GPU_VENDOR and GPU_PASSTHROUGH globals +# - Used for GPU analytics +# ------------------------------------------------------------------------------ +detect_gpu() { + GPU_VENDOR="" + GPU_PASSTHROUGH="none" + + # Detect Intel GPU + if lspci 2>/dev/null | grep -qi "VGA.*Intel"; then + GPU_VENDOR="intel" + GPU_PASSTHROUGH="igpu" + fi + + # Detect AMD GPU + if lspci 2>/dev/null | grep -qi "VGA.*AMD\|VGA.*ATI"; then + GPU_VENDOR="amd" + # Check if discrete + if lspci 2>/dev/null | grep -qi "AMD.*Radeon"; then + 
GPU_PASSTHROUGH="dgpu" + else + GPU_PASSTHROUGH="igpu" + fi + fi + + # Detect NVIDIA GPU + if lspci 2>/dev/null | grep -qi "VGA.*NVIDIA\|3D.*NVIDIA"; then + GPU_VENDOR="nvidia" + GPU_PASSTHROUGH="dgpu" + fi + + export GPU_VENDOR GPU_PASSTHROUGH +} + +# ------------------------------------------------------------------------------ +# post_tool_to_api() +# +# - Reports tool usage to telemetry +# - Arguments: +# * $1: tool_name (e.g., "microcode", "lxc-update", "post-pve-install") +# * $2: status ("success" or "failed") +# * $3: exit_code (optional, default: 0; a non-numeric value is replaced by 1 when status is "failed") +# - For PVE host tools, not container installations +# ------------------------------------------------------------------------------ +post_tool_to_api() { + command -v curl &>/dev/null || return 0 + [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0 + + local tool_name="${1:-unknown}" + local status="${2:-success}" + local exit_code="${3:-0}" + local error="" error_category="" + local uuid duration + + # Generate UUID for this tool execution + uuid=$(cat /proc/sys/kernel/random/uuid 2>/dev/null || uuidgen 2>/dev/null || echo "tool-$(date +%s)") + duration=$(get_install_duration) + + # Map status + [[ "$status" == "done" ]] && status="success" + + if [[ "$status" == "failed" ]]; then + [[ !
"$exit_code" =~ ^[0-9]+$ ]] && exit_code=1 + error=$(explain_exit_code "$exit_code") + error_category=$(categorize_error "$exit_code") + fi + + local pve_version="" + if command -v pveversion &>/dev/null; then + pve_version=$(pveversion 2>/dev/null | awk -F'[/ ]' '{print $2}') || true + fi + + local JSON_PAYLOAD + JSON_PAYLOAD=$(cat </dev/null || true +} + +# ------------------------------------------------------------------------------ +# post_addon_to_api() +# +# - Reports addon installation to telemetry +# - Arguments: +# * $1: addon_name (e.g., "filebrowser", "netdata") +# * $2: status ("success" or "failed") +# * $3: parent_ct (optional, name of parent container) +# * $4: exit_code (optional) +# - For addons installed inside containers +# ------------------------------------------------------------------------------ +post_addon_to_api() { + command -v curl &>/dev/null || return 0 + [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0 + + local addon_name="${1:-unknown}" + local status="${2:-success}" + local parent_ct="${3:-}" + local exit_code="${4:-0}" + local error="" error_category="" + local uuid duration + + # Generate UUID for this addon installation + uuid=$(cat /proc/sys/kernel/random/uuid 2>/dev/null || uuidgen 2>/dev/null || echo "addon-$(date +%s)") + duration=$(get_install_duration) + + # Map status + [[ "$status" == "done" ]] && status="success" + + if [[ "$status" == "failed" ]]; then + [[ ! 
"$exit_code" =~ ^[0-9]+$ ]] && exit_code=1 + error=$(explain_exit_code "$exit_code") + error_category=$(categorize_error "$exit_code") + fi + + # Detect OS info + local os_type="" os_version="" + if [[ -f /etc/os-release ]]; then + os_type=$(grep "^ID=" /etc/os-release | cut -d= -f2 | tr -d '"') + os_version=$(grep "^VERSION_ID=" /etc/os-release | cut -d= -f2 | tr -d '"') + fi + + local JSON_PAYLOAD + JSON_PAYLOAD=$(cat </dev/null || true +} + +# ------------------------------------------------------------------------------ +# post_update_to_api_extended() +# +# - Extended version of post_update_to_api with duration, GPU, and error category +# - Same arguments as post_update_to_api: +# * $1: status ("done" or "failed") +# * $2: exit_code (numeric) +# - Automatically includes: +# * Install duration (if start_install_timer was called) +# * Error category (for failed status) +# * GPU info (if detect_gpu was called) +# ------------------------------------------------------------------------------ +post_update_to_api_extended() { + # Silent fail - telemetry should never break scripts + command -v curl &>/dev/null || return 0 + + # Prevent duplicate submissions + POST_UPDATE_DONE=${POST_UPDATE_DONE:-false} + [[ "$POST_UPDATE_DONE" == "true" ]] && return 0 + + [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0 + [[ -z "${RANDOM_UUID:-}" ]] && return 0 + + local status="${1:-failed}" + local raw_exit_code="${2:-1}" + local exit_code=0 error="" pb_status error_category="" + local duration gpu_vendor gpu_passthrough + + # Get duration + duration=$(get_install_duration) + + # Get GPU info (if detected) + gpu_vendor="${GPU_VENDOR:-}" + gpu_passthrough="${GPU_PASSTHROUGH:-}" + + # Map status to telemetry values + case "$status" in + done | success) + pb_status="success" + exit_code=0 + error="" + error_category="" + ;; + failed) + pb_status="failed" + ;; + *) + pb_status="unknown" + ;; + esac + + # For failed/unknown status, resolve exit code and error description + if [[ 
"$pb_status" == "failed" ]] || [[ "$pb_status" == "unknown" ]]; then + if [[ "$raw_exit_code" =~ ^[0-9]+$ ]]; then + exit_code="$raw_exit_code" + else + exit_code=1 + fi + error=$(explain_exit_code "$exit_code") + error_category=$(categorize_error "$exit_code") + [[ -z "$error" ]] && error="Unknown error" + fi + + local JSON_PAYLOAD + JSON_PAYLOAD=$(cat </dev/null || true + + POST_UPDATE_DONE=true +} \ No newline at end of file diff --git a/misc/data/dashboard.go b/misc/data/dashboard.go index 16746e81b..acba2a89d 100644 --- a/misc/data/dashboard.go +++ b/misc/data/dashboard.go @@ -26,6 +26,15 @@ type DashboardData struct { FailedApps []AppFailure `json:"failed_apps"` RecentRecords []TelemetryRecord `json:"recent_records"` DailyStats []DailyStat `json:"daily_stats"` + + // Extended metrics + GPUStats []GPUCount `json:"gpu_stats"` + ErrorCategories []ErrorCatCount `json:"error_categories"` + TopTools []ToolCount `json:"top_tools"` + TopAddons []AddonCount `json:"top_addons"` + AvgInstallDuration float64 `json:"avg_install_duration"` // seconds + TotalTools int `json:"total_tools"` + TotalAddons int `json:"total_addons"` } type AppCount struct { @@ -72,6 +81,29 @@ type DailyStat struct { Failed int `json:"failed"` } +// Extended metric types +type GPUCount struct { + Vendor string `json:"vendor"` + Passthrough string `json:"passthrough"` + Count int `json:"count"` +} + +type ErrorCatCount struct { + Category string `json:"category"` + Count int `json:"count"` +} + +type ToolCount struct { + Tool string `json:"tool"` + Count int `json:"count"` +} + +type AddonCount struct { + Addon string `json:"addon"` + ParentCT string `json:"parent_ct"` + Count int `json:"count"` +} + // FetchDashboardData retrieves aggregated data from PocketBase func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*DashboardData, error) { if err := p.ensureAuth(ctx); err != nil { @@ -101,6 +133,13 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard 
dailySuccess := make(map[string]int) dailyFailed := make(map[string]int) + // Extended metrics maps + gpuCounts := make(map[string]int) // "vendor|passthrough" -> count + errorCatCounts := make(map[string]int) // category -> count + toolCounts := make(map[string]int) // tool_name -> count + addonCounts := make(map[string]int) // addon_name -> count + var totalDuration, durationCount int + for _, r := range records { data.TotalInstalls++ diff --git a/misc/data/service.go b/misc/data/service.go index 3a9034ba2..95ea0f390 100644 --- a/misc/data/service.go +++ b/misc/data/service.go @@ -62,7 +62,7 @@ type Config struct { type TelemetryIn struct { // Required RandomID string `json:"random_id"` // Session UUID - Type string `json:"type"` // "lxc" or "vm" + Type string `json:"type"` // "lxc", "vm", "tool", "addon" NSAPP string `json:"nsapp"` // Application name (e.g., "jellyfin") Status string `json:"status"` // "installing", "success", "failed", "unknown" @@ -81,6 +81,24 @@ type TelemetryIn struct { Method string `json:"method,omitempty"` // "default", "advanced" Error string `json:"error,omitempty"` // Error description (max 120 chars) ExitCode int `json:"exit_code,omitempty"` // 0-255 + + // === NEW FIELDS === + + // Tool telemetry (type="tool") + ToolName string `json:"tool_name,omitempty"` // "microcode", "lxc-update", "post-pve-install", etc. 
+ + // Addon telemetry (type="addon") + ParentCT string `json:"parent_ct,omitempty"` // Parent container name (e.g., "jellyfin") + + // GPU Passthrough stats + GPUVendor string `json:"gpu_vendor,omitempty"` // "intel", "amd", "nvidia" + GPUPassthrough string `json:"gpu_passthrough,omitempty"` // "igpu", "dgpu", "vgpu", "none" + + // Performance metrics + InstallDuration int `json:"install_duration,omitempty"` // Seconds + + // Error categorization + ErrorCategory string `json:"error_category,omitempty"` // "network", "storage", "dependency", "permission", "timeout", "config", "resource", "unknown" } // TelemetryOut is sent to PocketBase (matches _dev_telemetry_data collection) @@ -99,13 +117,25 @@ type TelemetryOut struct { Method string `json:"method,omitempty"` Error string `json:"error,omitempty"` ExitCode int `json:"exit_code,omitempty"` + + // Extended fields + ToolName string `json:"tool_name,omitempty"` + ParentCT string `json:"parent_ct,omitempty"` + GPUVendor string `json:"gpu_vendor,omitempty"` + GPUPassthrough string `json:"gpu_passthrough,omitempty"` + InstallDuration int `json:"install_duration,omitempty"` + ErrorCategory string `json:"error_category,omitempty"` } // TelemetryStatusUpdate contains only fields needed for status updates type TelemetryStatusUpdate struct { - Status string `json:"status"` - Error string `json:"error,omitempty"` - ExitCode int `json:"exit_code"` + Status string `json:"status"` + Error string `json:"error,omitempty"` + ExitCode int `json:"exit_code"` + InstallDuration int `json:"install_duration,omitempty"` + ErrorCategory string `json:"error_category,omitempty"` + GPUVendor string `json:"gpu_vendor,omitempty"` + GPUPassthrough string `json:"gpu_passthrough,omitempty"` } type PBClient struct { @@ -332,11 +362,15 @@ func (p *PBClient) UpsertTelemetry(ctx context.Context, payload TelemetryOut) er return p.CreateTelemetry(ctx, payload) } - // Update only status, error, and exit_code + // Update only status, error, exit_code, and new metrics fields
update := TelemetryStatusUpdate{ - Status: payload.Status, - Error: payload.Error, - ExitCode: payload.ExitCode, + Status: payload.Status, + Error: payload.Error, + ExitCode: payload.ExitCode, + InstallDuration: payload.InstallDuration, + ErrorCategory: payload.ErrorCategory, + GPUVendor: payload.GPUVendor, + GPUPassthrough: payload.GPUPassthrough, } return p.UpdateTelemetryStatus(ctx, recordID, update) } @@ -491,7 +525,7 @@ func getClientIP(r *http.Request, pt *ProxyTrust) net.IP { var ( // Allowed values for 'type' field - allowedType = map[string]bool{"lxc": true, "vm": true} + allowedType = map[string]bool{"lxc": true, "vm": true, "tool": true, "addon": true} // Allowed values for 'status' field allowedStatus = map[string]bool{"installing": true, "success": true, "failed": true, "unknown": true} @@ -502,6 +536,18 @@ var ( "fedora": true, "rocky": true, "alma": true, "centos": true, "opensuse": true, "gentoo": true, "openeuler": true, } + + // Allowed values for 'gpu_vendor' field + allowedGPUVendor = map[string]bool{"intel": true, "amd": true, "nvidia": true, "": true} + + // Allowed values for 'gpu_passthrough' field + allowedGPUPassthrough = map[string]bool{"igpu": true, "dgpu": true, "vgpu": true, "none": true, "": true} + + // Allowed values for 'error_category' field + allowedErrorCategory = map[string]bool{ + "network": true, "storage": true, "dependency": true, "permission": true, + "timeout": true, "config": true, "resource": true, "unknown": true, "": true, + } ) func sanitizeShort(s string, max int) string { @@ -529,6 +575,13 @@ func validate(in *TelemetryIn) error { in.PveVer = sanitizeShort(in.PveVer, 32) in.Method = sanitizeShort(in.Method, 32) + // Sanitize new fields + in.ToolName = sanitizeShort(in.ToolName, 64) + in.ParentCT = sanitizeShort(in.ParentCT, 64) + in.GPUVendor = strings.ToLower(sanitizeShort(in.GPUVendor, 16)) + in.GPUPassthrough = strings.ToLower(sanitizeShort(in.GPUPassthrough, 16)) + in.ErrorCategory = 
strings.ToLower(sanitizeShort(in.ErrorCategory, 32)) + // IMPORTANT: "error" must be short and not contain identifiers/logs in.Error = sanitizeShort(in.Error, 120) @@ -537,20 +590,36 @@ func validate(in *TelemetryIn) error { return errors.New("missing required fields: random_id, type, nsapp, status") } + // Normalize common typos for backwards compatibility + if in.Status == "sucess" { + in.Status = "success" + } + // Validate enums if !allowedType[in.Type] { - return errors.New("invalid type (must be 'lxc' or 'vm')") + return errors.New("invalid type (must be 'lxc', 'vm', 'tool', or 'addon')") } if !allowedStatus[in.Status] { return errors.New("invalid status") } + // Validate new enum fields + if !allowedGPUVendor[in.GPUVendor] { + return errors.New("invalid gpu_vendor (must be 'intel', 'amd', 'nvidia', or empty)") + } + if !allowedGPUPassthrough[in.GPUPassthrough] { + return errors.New("invalid gpu_passthrough (must be 'igpu', 'dgpu', 'vgpu', 'none', or empty)") + } + if !allowedErrorCategory[in.ErrorCategory] { + return errors.New("invalid error_category") + } + // For status updates (not installing), skip numeric field validation // These are only required for initial creation isUpdate := in.Status != "installing" - // os_type is optional but if provided must be valid - if in.OsType != "" && !allowedOsType[in.OsType] { + // os_type is optional but if provided must be valid (only for lxc/vm) + if (in.Type == "lxc" || in.Type == "vm") && in.OsType != "" && !allowedOsType[in.OsType] { return errors.New("invalid os_type") } @@ -558,7 +627,7 @@ func validate(in *TelemetryIn) error { // Values like "default", "advanced", "mydefaults-global", "mydefaults-app" are all valid // Validate numeric ranges (only strict for new records) - if !isUpdate { + if !isUpdate && (in.Type == "lxc" || in.Type == "vm") { if in.CTType < 0 || in.CTType > 2 { return errors.New("invalid ct_type (must be 0, 1, or 2)") } @@ -575,6 +644,9 @@ func validate(in *TelemetryIn) error { if 
in.ExitCode < 0 || in.ExitCode > 255 { return errors.New("invalid exit_code") } + if in.InstallDuration < 0 || in.InstallDuration > 86400 { + return errors.New("invalid install_duration (max 24h)") + } return nil } @@ -897,20 +969,26 @@ func main() { // Map input to PocketBase schema out := TelemetryOut{ - RandomID: in.RandomID, - Type: in.Type, - NSAPP: in.NSAPP, - Status: in.Status, - CTType: in.CTType, - DiskSize: in.DiskSize, - CoreCount: in.CoreCount, - RAMSize: in.RAMSize, - OsType: in.OsType, - OsVersion: in.OsVersion, - PveVer: in.PveVer, - Method: in.Method, - Error: in.Error, - ExitCode: in.ExitCode, + RandomID: in.RandomID, + Type: in.Type, + NSAPP: in.NSAPP, + Status: in.Status, + CTType: in.CTType, + DiskSize: in.DiskSize, + CoreCount: in.CoreCount, + RAMSize: in.RAMSize, + OsType: in.OsType, + OsVersion: in.OsVersion, + PveVer: in.PveVer, + Method: in.Method, + Error: in.Error, + ExitCode: in.ExitCode, + ToolName: in.ToolName, + ParentCT: in.ParentCT, + GPUVendor: in.GPUVendor, + GPUPassthrough: in.GPUPassthrough, + InstallDuration: in.InstallDuration, + ErrorCategory: in.ErrorCategory, } _ = computeHash(out) // For future deduplication