Extend telemetry reporting and ingestion

Add extended telemetry functions and server-side support. misc/api.func gains helper functions (categorize_error, start_install_timer/get_install_duration, detect_gpu), new reporters for tools and addons (post_tool_to_api, post_addon_to_api), and post_update_to_api_extended, an extended variant of post_update_to_api that also reports install duration, GPU info, and error_category. misc/data/service.go is updated to accept and validate the new fields (type: tool/addon, tool_name, parent_ct, gpu_vendor, gpu_passthrough, install_duration, error_category), expand the allowed enums, include the new fields in UpsertTelemetry and in the input-to-output mapping, and sanitize the new inputs. Also add the telemetry-ingest.exe binary. These changes enable richer telemetry (tool/addon events, GPU info, durations, and categorized errors) and server-side ingestion and validation for it.
This commit is contained in:
CanbiZ (MickLesk) 2026-02-10 08:14:45 +01:00
parent 6f747ed36d
commit 887a899f24
3 changed files with 460 additions and 27 deletions

View File

@ -407,3 +407,319 @@ EOF
POST_UPDATE_DONE=true
}
# ==============================================================================
# SECTION 3: EXTENDED TELEMETRY FUNCTIONS
# ==============================================================================
# ------------------------------------------------------------------------------
# categorize_error()
#
# - Maps an exit code to a coarse error category for better analytics
# - Arguments:
#     * $1: exit code (a missing or unrecognized code yields "unknown")
# - Prints one of: network, storage, dependency, permission, timeout, config,
#   resource, unknown
# - Exit code 28 is reported as "timeout": it used to be listed under both
#   "network" and "timeout", and since case takes the first matching arm the
#   timeout arm could never see it
# - Used to group errors in dashboard
# ------------------------------------------------------------------------------
categorize_error() {
  local code="${1:-}"
  case "$code" in
  # Network errors
  6 | 7 | 22 | 35) echo "network" ;;
  # Storage errors
  214 | 217 | 219) echo "storage" ;;
  # Dependency/Package errors
  100 | 101 | 102 | 127 | 160 | 161 | 162) echo "dependency" ;;
  # Permission errors
  126 | 152) echo "permission" ;;
  # Timeout errors
  28 | 124 | 211) echo "timeout" ;;
  # Configuration errors
  203 | 204 | 205 | 206 | 207 | 208) echo "config" ;;
  # Resource errors (OOM, etc)
  134 | 137) echo "resource" ;;
  # Default
  *) echo "unknown" ;;
  esac
}
# ------------------------------------------------------------------------------
# start_install_timer()
#
# - Records the current Unix timestamp (date +%s) as the installation start
# - Stores it in the global INSTALL_START_TIME and exports it to child processes
# - Call once at the beginning of an installation; get_install_duration()
#   reads the value back
# ------------------------------------------------------------------------------
start_install_timer() {
  # Flag the variable for export first; the value assigned below inherits it
  export INSTALL_START_TIME
  INSTALL_START_TIME="$(date +%s)"
  return 0
}
# ------------------------------------------------------------------------------
# get_install_duration()
#
# - Prints elapsed seconds since start_install_timer() was called
# - Prints 0 if the timer was not started or INSTALL_START_TIME is not a plain
#   non-negative integer
# - Never prints a negative number (e.g. after the system clock stepped back),
#   because the result is sent as install_duration
# ------------------------------------------------------------------------------
get_install_duration() {
  local started="${INSTALL_START_TIME:-}"

  # INSTALL_START_TIME is an exported variable; validate it before it reaches
  # arithmetic expansion, which would otherwise evaluate arbitrary expressions
  if [[ ! "$started" =~ ^[0-9]+$ ]]; then
    echo "0"
    return
  fi

  local now elapsed
  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal
  elapsed=$((now - 10#$started))
  if ((elapsed < 0)); then
    elapsed=0
  fi
  echo "$elapsed"
}
# ------------------------------------------------------------------------------
# detect_gpu()
#
# - Detects GPU vendor and passthrough type from lspci output
# - Sets and exports the GPU_VENDOR ("intel", "amd", "nvidia" or "") and
#   GPU_PASSTHROUGH ("igpu", "dgpu" or "none") globals
# - Checks run in the order Intel, AMD, NVIDIA and later matches overwrite
#   earlier ones, so with several GPUs present NVIDIA wins over AMD over Intel
# - AMD/ATI is matched as whole words only: a plain case-insensitive "ATI"
#   also matches the "ati" inside "VGA compatible controller", which made every
#   VGA device look like an AMD GPU
# - Used for GPU analytics
# ------------------------------------------------------------------------------
detect_gpu() {
  GPU_VENDOR=""
  GPU_PASSTHROUGH="none"

  # Query the PCI bus once; a missing or failing lspci simply yields no matches
  local pci_devices vga_devices
  pci_devices=$(lspci 2>/dev/null || true)
  vga_devices=$(grep -i "VGA" <<<"$pci_devices" || true)

  # Detect Intel GPU
  if grep -qi "VGA.*Intel" <<<"$pci_devices"; then
    GPU_VENDOR="intel"
    GPU_PASSTHROUGH="igpu"
  fi

  # Detect AMD GPU (-w: whole-word match, see header)
  if grep -qiwE "AMD|ATI|Advanced Micro Devices" <<<"$vga_devices"; then
    GPU_VENDOR="amd"
    # Heuristic: any AMD device advertising "Radeon" is treated as discrete
    if grep -qi "AMD.*Radeon" <<<"$pci_devices"; then
      GPU_PASSTHROUGH="dgpu"
    else
      GPU_PASSTHROUGH="igpu"
    fi
  fi

  # Detect NVIDIA GPU (listed either as a VGA or as a 3D controller)
  if grep -qi "VGA.*NVIDIA\|3D.*NVIDIA" <<<"$pci_devices"; then
    GPU_VENDOR="nvidia"
    GPU_PASSTHROUGH="dgpu"
  fi

  export GPU_VENDOR GPU_PASSTHROUGH
}
# ------------------------------------------------------------------------------
# post_tool_to_api()
#
# - Reports tool usage to telemetry
# - Arguments:
#     * $1: tool_name (e.g., "microcode", "lxc-update", "post-pve-install")
#     * $2: status ("success" or "failed"; "done" is mapped to "success")
#     * $3: exit_code (optional, default: 0 for success, 1 for failed; a
#           non-numeric value falls back to the same defaults)
# - For PVE host tools, not container installations
# - Does nothing when curl is missing or DIAGNOSTICS is "no"; a failed upload is
#   ignored so telemetry never breaks the calling script
# - Backslashes and double quotes in string fields are escaped before they are
#   placed into the JSON payload
# ------------------------------------------------------------------------------
post_tool_to_api() {
  command -v curl &>/dev/null || return 0
  [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0

  local tool_name="${1:-unknown}"
  local status="${2:-success}"
  local exit_code="${3:-}"
  local error="" error_category=""
  local uuid duration

  # Generate UUID for this tool execution
  uuid=$(cat /proc/sys/kernel/random/uuid 2>/dev/null || uuidgen 2>/dev/null || echo "tool-$(date +%s)")
  duration=$(get_install_duration)

  # Map status
  [[ "$status" == "done" ]] && status="success"

  # exit_code is emitted as a bare JSON number, so it has to be numeric for
  # every status, not only for failures
  if [[ "$exit_code" =~ ^[0-9]+$ ]]; then
    exit_code=$((10#$exit_code)) # drop leading zeros, which JSON does not allow
  elif [[ "$status" == "failed" ]]; then
    exit_code=1
  else
    exit_code=0
  fi

  if [[ "$status" == "failed" ]]; then
    error=$(explain_exit_code "$exit_code")
    error_category=$(categorize_error "$exit_code")
  fi

  local pve_version=""
  if command -v pveversion &>/dev/null; then
    pve_version=$(pveversion 2>/dev/null | awk -F'[/ ]' '{print $2}') || true
  fi

  # Escape backslashes and double quotes so free-form text cannot break the
  # JSON document
  local field value
  for field in tool_name error pve_version; do
    value=${!field}
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    printf -v "$field" '%s' "$value"
  done

  local JSON_PAYLOAD
  JSON_PAYLOAD=$(
    cat <<EOF
{
"random_id": "${uuid}",
"type": "tool",
"nsapp": "${tool_name}",
"tool_name": "${tool_name}",
"status": "${status}",
"exit_code": ${exit_code},
"error": "${error}",
"error_category": "${error_category}",
"install_duration": ${duration:-0},
"pve_version": "${pve_version}"
}
EOF
  )

  curl -fsS -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
    -H "Content-Type: application/json" \
    -d "$JSON_PAYLOAD" &>/dev/null || true
}
# ------------------------------------------------------------------------------
# post_addon_to_api()
#
# - Reports addon installation to telemetry
# - Arguments:
#     * $1: addon_name (e.g., "filebrowser", "netdata")
#     * $2: status ("success" or "failed"; "done" is mapped to "success")
#     * $3: parent_ct (optional, name of parent container)
#     * $4: exit_code (optional, default 0; a non-numeric value becomes 1 for
#           failed and 0 otherwise)
# - For addons installed inside containers
# - Does nothing when curl is missing or DIAGNOSTICS is "no"; a failed upload is
#   ignored so telemetry never breaks the calling script
# - Backslashes and double quotes in string fields are escaped before they are
#   placed into the JSON payload
# ------------------------------------------------------------------------------
post_addon_to_api() {
  command -v curl &>/dev/null || return 0
  [[ "${DIAGNOSTICS:-no}" == "no" ]] && return 0

  local addon_name="${1:-unknown}"
  local status="${2:-success}"
  local parent_ct="${3:-}"
  local exit_code="${4:-0}"
  local error="" error_category=""
  local uuid duration

  # Generate UUID for this addon installation
  uuid=$(cat /proc/sys/kernel/random/uuid 2>/dev/null || uuidgen 2>/dev/null || echo "addon-$(date +%s)")
  duration=$(get_install_duration)

  # Map status
  [[ "$status" == "done" ]] && status="success"

  # exit_code is emitted as a bare JSON number, so it has to be numeric for
  # every status, not only for failures
  if [[ "$exit_code" =~ ^[0-9]+$ ]]; then
    exit_code=$((10#$exit_code)) # drop leading zeros, which JSON does not allow
  elif [[ "$status" == "failed" ]]; then
    exit_code=1
  else
    exit_code=0
  fi

  if [[ "$status" == "failed" ]]; then
    error=$(explain_exit_code "$exit_code")
    error_category=$(categorize_error "$exit_code")
  fi

  # Detect OS info; "|| true" keeps a missing key from aborting callers that
  # run with errexit and pipefail
  local os_type="" os_version=""
  if [[ -f /etc/os-release ]]; then
    os_type=$(grep "^ID=" /etc/os-release | cut -d= -f2 | tr -d '"') || true
    os_version=$(grep "^VERSION_ID=" /etc/os-release | cut -d= -f2 | tr -d '"') || true
  fi

  # Escape backslashes and double quotes so free-form text cannot break the
  # JSON document
  local field value
  for field in addon_name parent_ct error os_type os_version; do
    value=${!field}
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    printf -v "$field" '%s' "$value"
  done

  local JSON_PAYLOAD
  JSON_PAYLOAD=$(
    cat <<EOF
{
"random_id": "${uuid}",
"type": "addon",
"nsapp": "${addon_name}",
"status": "${status}",
"parent_ct": "${parent_ct}",
"exit_code": ${exit_code},
"error": "${error}",
"error_category": "${error_category}",
"install_duration": ${duration:-0},
"os_type": "${os_type}",
"os_version": "${os_version}"
}
EOF
  )

  curl -fsS -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
    -H "Content-Type: application/json" \
    -d "$JSON_PAYLOAD" &>/dev/null || true
}
# ------------------------------------------------------------------------------
# post_update_to_api_extended()
#
# - Final status report that additionally carries install duration, GPU info
#   and an error category
# - Arguments (same as post_update_to_api):
#     * $1: status ("done"/"success" or "failed"; anything else is sent as
#           "unknown"; default "failed")
#     * $2: exit_code (numeric; default 1, non-numeric values become 1)
# - Payload additions:
#     * install_duration from get_install_duration()
#     * error / error_category for failed and unknown status
#     * gpu_vendor / gpu_passthrough from GPU_VENDOR / GPU_PASSTHROUGH
# - Skipped when curl is missing, a report was already sent (POST_UPDATE_DONE),
#   DIAGNOSTICS is "no", or RANDOM_UUID is empty
# ------------------------------------------------------------------------------
post_update_to_api_extended() {
  # Telemetry is best effort: every guard leaves quietly with success
  if ! command -v curl &>/dev/null; then
    return 0
  fi

  # One report per run, only with diagnostics enabled and a session id present
  POST_UPDATE_DONE=${POST_UPDATE_DONE:-false}
  if [[ "$POST_UPDATE_DONE" == "true" || "${DIAGNOSTICS:-no}" == "no" || -z "${RANDOM_UUID:-}" ]]; then
    return 0
  fi

  local status="${1:-failed}"
  local raw_exit_code="${2:-1}"
  local pb_status exit_code=0 error="" error_category=""
  local duration
  local gpu_vendor="${GPU_VENDOR:-}" gpu_passthrough="${GPU_PASSTHROUGH:-}"

  duration=$(get_install_duration)

  if [[ "$status" == "done" || "$status" == "success" ]]; then
    # Success always reports exit code 0 and no error text
    pb_status="success"
  else
    if [[ "$status" == "failed" ]]; then
      pb_status="failed"
    else
      pb_status="unknown"
    fi

    # Resolve exit code and error description; fall back to 1 for anything
    # that is not a plain number
    exit_code=1
    [[ "$raw_exit_code" =~ ^[0-9]+$ ]] && exit_code="$raw_exit_code"
    error=$(explain_exit_code "$exit_code")
    error_category=$(categorize_error "$exit_code")
    error="${error:-Unknown error}"
  fi

  local JSON_PAYLOAD
  JSON_PAYLOAD=$(
    cat <<EOF
{
"random_id": "${RANDOM_UUID}",
"type": "${TELEMETRY_TYPE:-lxc}",
"nsapp": "${NSAPP:-unknown}",
"status": "${pb_status}",
"exit_code": ${exit_code},
"error": "${error}",
"error_category": "${error_category}",
"install_duration": ${duration:-0},
"gpu_vendor": "${gpu_vendor}",
"gpu_passthrough": "${gpu_passthrough}"
}
EOF
  )

  curl -fsS -m "${TELEMETRY_TIMEOUT}" -X POST "${TELEMETRY_URL}" \
    -H "Content-Type: application/json" \
    -d "$JSON_PAYLOAD" &>/dev/null || true

  POST_UPDATE_DONE=true
}

View File

@ -26,6 +26,15 @@ type DashboardData struct {
FailedApps []AppFailure `json:"failed_apps"`
RecentRecords []TelemetryRecord `json:"recent_records"`
DailyStats []DailyStat `json:"daily_stats"`
// Extended metrics
GPUStats []GPUCount `json:"gpu_stats"`
ErrorCategories []ErrorCatCount `json:"error_categories"`
TopTools []ToolCount `json:"top_tools"`
TopAddons []AddonCount `json:"top_addons"`
AvgInstallDuration float64 `json:"avg_install_duration"` // seconds
TotalTools int `json:"total_tools"`
TotalAddons int `json:"total_addons"`
}
type AppCount struct {
@ -72,6 +81,29 @@ type DailyStat struct {
Failed int `json:"failed"`
}
// Extended metric types: aggregate rows exposed through the extended-metrics
// fields of DashboardData.
type (
	// GPUCount is the number of records seen for one GPU vendor and
	// passthrough combination.
	GPUCount struct {
		Vendor      string `json:"vendor"`
		Passthrough string `json:"passthrough"`
		Count       int    `json:"count"`
	}

	// ErrorCatCount is the number of records seen for one error category.
	ErrorCatCount struct {
		Category string `json:"category"`
		Count    int    `json:"count"`
	}

	// ToolCount is the number of records seen for one tool.
	ToolCount struct {
		Tool  string `json:"tool"`
		Count int    `json:"count"`
	}

	// AddonCount is the number of records seen for one addon, together with
	// the parent container it is associated with.
	AddonCount struct {
		Addon    string `json:"addon"`
		ParentCT string `json:"parent_ct"`
		Count    int    `json:"count"`
	}
)
// FetchDashboardData retrieves aggregated data from PocketBase
func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*DashboardData, error) {
if err := p.ensureAuth(ctx); err != nil {
@ -101,6 +133,13 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard
dailySuccess := make(map[string]int)
dailyFailed := make(map[string]int)
// Extended metrics maps
gpuCounts := make(map[string]int) // "vendor|passthrough" -> count
errorCatCounts := make(map[string]int) // category -> count
toolCounts := make(map[string]int) // tool_name -> count
addonCounts := make(map[string]int) // addon_name -> count
var totalDuration, durationCount int
for _, r := range records {
data.TotalInstalls++

View File

@ -62,7 +62,7 @@ type Config struct {
type TelemetryIn struct {
// Required
RandomID string `json:"random_id"` // Session UUID
Type string `json:"type"` // "lxc" or "vm"
Type string `json:"type"` // "lxc", "vm", "tool", "addon"
NSAPP string `json:"nsapp"` // Application name (e.g., "jellyfin")
Status string `json:"status"` // "installing", "success", "failed", "unknown"
@ -81,6 +81,24 @@ type TelemetryIn struct {
Method string `json:"method,omitempty"` // "default", "advanced"
Error string `json:"error,omitempty"` // Error description (max 120 chars)
ExitCode int `json:"exit_code,omitempty"` // 0-255
// === NEW FIELDS ===
// Tool telemetry (type="tool")
ToolName string `json:"tool_name,omitempty"` // "microcode", "lxc-update", "post-pve-install", etc.
// Addon telemetry (type="addon")
ParentCT string `json:"parent_ct,omitempty"` // Parent container name (e.g., "jellyfin")
// GPU Passthrough stats
GPUVendor string `json:"gpu_vendor,omitempty"` // "intel", "amd", "nvidia"
GPUPassthrough string `json:"gpu_passthrough,omitempty"` // "igpu", "dgpu", "vgpu", "none"
// Performance metrics
InstallDuration int `json:"install_duration,omitempty"` // Seconds
// Error categorization
ErrorCategory string `json:"error_category,omitempty"` // "network", "storage", "dependency", "permission", "timeout", "unknown"
}
// TelemetryOut is sent to PocketBase (matches _dev_telemetry_data collection)
@ -99,13 +117,25 @@ type TelemetryOut struct {
Method string `json:"method,omitempty"`
Error string `json:"error,omitempty"`
ExitCode int `json:"exit_code,omitempty"`
// Extended fields
ToolName string `json:"tool_name,omitempty"`
ParentCT string `json:"parent_ct,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
GPUPassthrough string `json:"gpu_passthrough,omitempty"`
InstallDuration int `json:"install_duration,omitempty"`
ErrorCategory string `json:"error_category,omitempty"`
}
// TelemetryStatusUpdate contains only fields needed for status updates
type TelemetryStatusUpdate struct {
Status string `json:"status"`
Error string `json:"error,omitempty"`
ExitCode int `json:"exit_code"`
Status string `json:"status"`
Error string `json:"error,omitempty"`
ExitCode int `json:"exit_code"`
InstallDuration int `json:"install_duration,omitempty"`
ErrorCategory string `json:"error_category,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
GPUPassthrough string `json:"gpu_passthrough,omitempty"`
}
type PBClient struct {
@ -332,11 +362,15 @@ func (p *PBClient) UpsertTelemetry(ctx context.Context, payload TelemetryOut) er
return p.CreateTelemetry(ctx, payload)
}
// Update only status, error, and exit_code
// Update only status, error, exit_code, and new metrics fields
update := TelemetryStatusUpdate{
Status: payload.Status,
Error: payload.Error,
ExitCode: payload.ExitCode,
Status: payload.Status,
Error: payload.Error,
ExitCode: payload.ExitCode,
InstallDuration: payload.InstallDuration,
ErrorCategory: payload.ErrorCategory,
GPUVendor: payload.GPUVendor,
GPUPassthrough: payload.GPUPassthrough,
}
return p.UpdateTelemetryStatus(ctx, recordID, update)
}
@ -491,7 +525,7 @@ func getClientIP(r *http.Request, pt *ProxyTrust) net.IP {
var (
// Allowed values for 'type' field
allowedType = map[string]bool{"lxc": true, "vm": true}
allowedType = map[string]bool{"lxc": true, "vm": true, "tool": true, "addon": true}
// Allowed values for 'status' field
allowedStatus = map[string]bool{"installing": true, "success": true, "failed": true, "unknown": true}
@ -502,6 +536,18 @@ var (
"fedora": true, "rocky": true, "alma": true, "centos": true,
"opensuse": true, "gentoo": true, "openeuler": true,
}
// Allowed values for 'gpu_vendor' field
allowedGPUVendor = map[string]bool{"intel": true, "amd": true, "nvidia": true, "": true}
// Allowed values for 'gpu_passthrough' field
allowedGPUPassthrough = map[string]bool{"igpu": true, "dgpu": true, "vgpu": true, "none": true, "": true}
// Allowed values for 'error_category' field
allowedErrorCategory = map[string]bool{
"network": true, "storage": true, "dependency": true, "permission": true,
"timeout": true, "config": true, "resource": true, "unknown": true, "": true,
}
)
func sanitizeShort(s string, max int) string {
@ -529,6 +575,13 @@ func validate(in *TelemetryIn) error {
in.PveVer = sanitizeShort(in.PveVer, 32)
in.Method = sanitizeShort(in.Method, 32)
// Sanitize new fields
in.ToolName = sanitizeShort(in.ToolName, 64)
in.ParentCT = sanitizeShort(in.ParentCT, 64)
in.GPUVendor = strings.ToLower(sanitizeShort(in.GPUVendor, 16))
in.GPUPassthrough = strings.ToLower(sanitizeShort(in.GPUPassthrough, 16))
in.ErrorCategory = strings.ToLower(sanitizeShort(in.ErrorCategory, 32))
// IMPORTANT: "error" must be short and not contain identifiers/logs
in.Error = sanitizeShort(in.Error, 120)
@ -537,20 +590,36 @@ func validate(in *TelemetryIn) error {
return errors.New("missing required fields: random_id, type, nsapp, status")
}
// Normalize common typos for backwards compatibility
if in.Status == "sucess" {
in.Status = "success"
}
// Validate enums
if !allowedType[in.Type] {
return errors.New("invalid type (must be 'lxc' or 'vm')")
return errors.New("invalid type (must be 'lxc', 'vm', 'tool', or 'addon')")
}
if !allowedStatus[in.Status] {
return errors.New("invalid status")
}
// Validate new enum fields
if !allowedGPUVendor[in.GPUVendor] {
return errors.New("invalid gpu_vendor (must be 'intel', 'amd', 'nvidia', or empty)")
}
if !allowedGPUPassthrough[in.GPUPassthrough] {
return errors.New("invalid gpu_passthrough (must be 'igpu', 'dgpu', 'vgpu', 'none', or empty)")
}
if !allowedErrorCategory[in.ErrorCategory] {
return errors.New("invalid error_category")
}
// For status updates (not installing), skip numeric field validation
// These are only required for initial creation
isUpdate := in.Status != "installing"
// os_type is optional but if provided must be valid
if in.OsType != "" && !allowedOsType[in.OsType] {
// os_type is optional but if provided must be valid (only for lxc/vm)
if (in.Type == "lxc" || in.Type == "vm") && in.OsType != "" && !allowedOsType[in.OsType] {
return errors.New("invalid os_type")
}
@ -558,7 +627,7 @@ func validate(in *TelemetryIn) error {
// Values like "default", "advanced", "mydefaults-global", "mydefaults-app" are all valid
// Validate numeric ranges (only strict for new records)
if !isUpdate {
if !isUpdate && (in.Type == "lxc" || in.Type == "vm") {
if in.CTType < 0 || in.CTType > 2 {
return errors.New("invalid ct_type (must be 0, 1, or 2)")
}
@ -575,6 +644,9 @@ func validate(in *TelemetryIn) error {
if in.ExitCode < 0 || in.ExitCode > 255 {
return errors.New("invalid exit_code")
}
if in.InstallDuration < 0 || in.InstallDuration > 86400 {
return errors.New("invalid install_duration (max 24h)")
}
return nil
}
@ -897,20 +969,26 @@ func main() {
// Map input to PocketBase schema
out := TelemetryOut{
RandomID: in.RandomID,
Type: in.Type,
NSAPP: in.NSAPP,
Status: in.Status,
CTType: in.CTType,
DiskSize: in.DiskSize,
CoreCount: in.CoreCount,
RAMSize: in.RAMSize,
OsType: in.OsType,
OsVersion: in.OsVersion,
PveVer: in.PveVer,
Method: in.Method,
Error: in.Error,
ExitCode: in.ExitCode,
RandomID: in.RandomID,
Type: in.Type,
NSAPP: in.NSAPP,
Status: in.Status,
CTType: in.CTType,
DiskSize: in.DiskSize,
CoreCount: in.CoreCount,
RAMSize: in.RAMSize,
OsType: in.OsType,
OsVersion: in.OsVersion,
PveVer: in.PveVer,
Method: in.Method,
Error: in.Error,
ExitCode: in.ExitCode,
ToolName: in.ToolName,
ParentCT: in.ParentCT,
GPUVendor: in.GPUVendor,
GPUPassthrough: in.GPUPassthrough,
InstallDuration: in.InstallDuration,
ErrorCategory: in.ErrorCategory,
}
_ = computeHash(out) // For future deduplication