From 0226a043b8fa2c4a4d259275f4e99cb1f7d5fa17 Mon Sep 17 00:00:00 2001
From: MickLesk
Date: Mon, 9 Feb 2026 18:33:33 +0100
Subject: [PATCH] feat(telemetry): add caching, alerts, migration & dashboard improvements

- Add Redis/in-memory caching layer (cache.go)
- Add SMTP alerting for high failure rates (alerts.go)
- Add data migration script from old API (migrate.go)
- Add docker-compose.yml for easy deployment
- Move dashboard to / with redirect from /dashboard
- Add dark/light mode toggle
- Add error analysis and failed apps statistics
- Add PVE version and LXC/VM type stats
- Add /metrics Prometheus endpoint
- Add /api/records pagination endpoint
- Add CSV export functionality
- Enhance healthcheck with PB connection status

New ENV vars:
- Cache: ENABLE_CACHE, CACHE_TTL_SECONDS, ENABLE_REDIS, REDIS_URL
- Alerts: ALERT_ENABLED, SMTP_*, ALERT_FAILURE_THRESHOLD, etc.
- Migration: RUN_MIGRATION, MIGRATION_REQUIRED, MIGRATION_SOURCE_URL
---
 misc/data/Dockerfile         |  44 ++-
 misc/data/alerts.go          | 267 ++++++++++++++
 misc/data/cache.go           | 158 ++++++++
 misc/data/dashboard.go       | 682 +++++++++++++++++++++++++++++++++--
 misc/data/docker-compose.yml |  74 ++++
 misc/data/entrypoint.sh      |  50 +++
 misc/data/go.mod             |   7 +
 misc/data/go.sum             |  10 +
 misc/data/migrate.go         | 265 ++++++++++++++
 misc/data/migrate.sh         |  67 ++++
 misc/data/service.go         | 279 +++++++++++++-
 misc/data/telemetry-service  | Bin 0 -> 10448005 bytes
 12 files changed, 1874 insertions(+), 29 deletions(-)
 create mode 100644 misc/data/alerts.go
 create mode 100644 misc/data/cache.go
 create mode 100644 misc/data/docker-compose.yml
 create mode 100644 misc/data/entrypoint.sh
 create mode 100644 misc/data/go.sum
 create mode 100644 misc/data/migrate.go
 create mode 100755 misc/data/migrate.sh
 create mode 100755 misc/data/telemetry-service

diff --git a/misc/data/Dockerfile b/misc/data/Dockerfile
index 6fd4377bc..d9228dafe 100644
--- a/misc/data/Dockerfile
+++ b/misc/data/Dockerfile
@@ -1,10 +1,52 @@
 FROM golang:1.25-alpine AS build
 WORKDIR /src
+COPY go.mod go.sum* ./
+RUN go mod download 2>/dev/null || true
 COPY . .
 RUN go build -trimpath -ldflags "-s -w" -o /out/telemetry-ingest .
+RUN go build -trimpath -ldflags "-s -w" -o /out/migrate migrate.go FROM alpine:3.23 +RUN apk add --no-cache ca-certificates tzdata WORKDIR /app COPY --from=build /out/telemetry-ingest /app/telemetry-ingest +COPY --from=build /out/migrate /app/migrate +COPY entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh /app/migrate + +# Service config +ENV LISTEN_ADDR=":8080" +ENV MAX_BODY_BYTES="1024" +ENV RATE_LIMIT_RPM="60" +ENV RATE_BURST="20" +ENV UPSTREAM_TIMEOUT_MS="4000" +ENV ENABLE_REQUEST_LOGGING="false" + +# Cache config (optional) +ENV ENABLE_CACHE="true" +ENV CACHE_TTL_SECONDS="60" +ENV ENABLE_REDIS="false" +# ENV REDIS_URL="redis://localhost:6379" + +# Alert config (optional) +ENV ALERT_ENABLED="false" +# ENV SMTP_HOST="" +# ENV SMTP_PORT="587" +# ENV SMTP_USER="" +# ENV SMTP_PASSWORD="" +# ENV SMTP_FROM="telemetry@proxmoxved.local" +# ENV SMTP_TO="" +# ENV SMTP_USE_TLS="false" +ENV ALERT_FAILURE_THRESHOLD="20.0" +ENV ALERT_CHECK_INTERVAL_MIN="15" +ENV ALERT_COOLDOWN_MIN="60" + +# Migration config (optional) +ENV RUN_MIGRATION="false" +ENV MIGRATION_REQUIRED="false" +ENV MIGRATION_SOURCE_URL="https://api.htl-braunau.at/dev/data" + EXPOSE 8080 -CMD ["/app/telemetry-ingest"] +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s \ + CMD wget -q --spider http://localhost:8080/healthz || exit 1 +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/misc/data/alerts.go b/misc/data/alerts.go new file mode 100644 index 000000000..dccbbd6b6 --- /dev/null +++ b/misc/data/alerts.go @@ -0,0 +1,267 @@ +package main + +import ( + "bytes" + "context" + "crypto/tls" + "fmt" + "log" + "net/smtp" + "strings" + "sync" + "time" +) + +// AlertConfig holds SMTP alert configuration +type AlertConfig struct { + Enabled bool + SMTPHost string + SMTPPort int + SMTPUser string + SMTPPassword string + SMTPFrom string + SMTPTo []string + UseTLS bool + FailureThreshold float64 // Alert when failure rate exceeds this (e.g., 20.0 = 20%) + CheckInterval time.Duration // How often to check + Cooldown time.Duration // Minimum time between alerts +} + +// Alerter handles alerting functionality +type Alerter struct { + cfg AlertConfig + lastAlertAt time.Time + mu sync.Mutex + pb *PBClient + lastStats alertStats + alertHistory []AlertEvent +} + +type alertStats struct { + successCount int + failedCount int + checkedAt time.Time +} + +// AlertEvent records an alert that was sent +type AlertEvent struct { + Timestamp time.Time `json:"timestamp"` + Type string `json:"type"` + Message string `json:"message"` + FailureRate float64 `json:"failure_rate,omitempty"` +} + +// NewAlerter creates a new alerter instance +func NewAlerter(cfg AlertConfig, pb *PBClient) *Alerter { + return &Alerter{ + cfg: cfg, + pb: pb, + alertHistory: make([]AlertEvent, 0), + } +} + +// Start begins the alert monitoring loop +func (a *Alerter) Start() { + if !a.cfg.Enabled { + log.Println("INFO: alerting disabled") + return + } + + if a.cfg.SMTPHost == "" || len(a.cfg.SMTPTo) == 0 { + log.Println("WARN: alerting enabled but SMTP not configured") + return + } + + go a.monitorLoop() + log.Printf("INFO: alert monitoring started (threshold: %.1f%%, interval: %v)", a.cfg.FailureThreshold, a.cfg.CheckInterval) +} + +func (a *Alerter) monitorLoop() { + ticker := time.NewTicker(a.cfg.CheckInterval) + defer ticker.Stop() + + for range ticker.C { + a.checkAndAlert() + } +} + +func (a *Alerter) checkAndAlert() { + ctx, cancel := newTimeoutContext(10 * time.Second) + defer cancel() + + // Fetch last hour's data + data, err := a.pb.FetchDashboardData(ctx, 
1) + if err != nil { + log.Printf("WARN: alert check failed: %v", err) + return + } + + // Calculate current failure rate + total := data.SuccessCount + data.FailedCount + if total < 10 { + // Not enough data to determine rate + return + } + + failureRate := float64(data.FailedCount) / float64(total) * 100 + + // Check if we should alert + if failureRate >= a.cfg.FailureThreshold { + a.maybeSendAlert(failureRate, data.FailedCount, total) + } +} + +func (a *Alerter) maybeSendAlert(rate float64, failed, total int) { + a.mu.Lock() + defer a.mu.Unlock() + + // Check cooldown + if time.Since(a.lastAlertAt) < a.cfg.Cooldown { + return + } + + // Send alert + subject := fmt.Sprintf("[ProxmoxVED Alert] High Failure Rate: %.1f%%", rate) + body := fmt.Sprintf(`ProxmoxVE Helper Scripts - Telemetry Alert + +⚠️ High installation failure rate detected! + +Current Statistics (last 24h): +- Failure Rate: %.1f%% +- Failed Installations: %d +- Total Installations: %d +- Threshold: %.1f%% + +Time: %s + +Please check the dashboard for more details. + +--- +This is an automated alert from the telemetry service. +`, rate, failed, total, a.cfg.FailureThreshold, time.Now().Format(time.RFC1123)) + + if err := a.sendEmail(subject, body); err != nil { + log.Printf("ERROR: failed to send alert email: %v", err) + return + } + + a.lastAlertAt = time.Now() + a.alertHistory = append(a.alertHistory, AlertEvent{ + Timestamp: time.Now(), + Type: "high_failure_rate", + Message: fmt.Sprintf("Failure rate %.1f%% exceeded threshold %.1f%%", rate, a.cfg.FailureThreshold), + FailureRate: rate, + }) + + // Keep only last 100 alerts + if len(a.alertHistory) > 100 { + a.alertHistory = a.alertHistory[len(a.alertHistory)-100:] + } + + log.Printf("ALERT: sent high failure rate alert (%.1f%%)", rate) +} + +func (a *Alerter) sendEmail(subject, body string) error { + // Build message + var msg bytes.Buffer + msg.WriteString(fmt.Sprintf("From: %s\r\n", a.cfg.SMTPFrom)) + msg.WriteString(fmt.Sprintf("To: %s\r\n", strings.Join(a.cfg.SMTPTo, ", "))) + msg.WriteString(fmt.Sprintf("Subject: %s\r\n", subject)) + msg.WriteString("MIME-Version: 1.0\r\n") + msg.WriteString("Content-Type: text/plain; charset=UTF-8\r\n") + msg.WriteString("\r\n") + msg.WriteString(body) + + addr := fmt.Sprintf("%s:%d", a.cfg.SMTPHost, a.cfg.SMTPPort) + + var auth smtp.Auth + if a.cfg.SMTPUser != "" && a.cfg.SMTPPassword != "" { + auth = smtp.PlainAuth("", a.cfg.SMTPUser, a.cfg.SMTPPassword, a.cfg.SMTPHost) + } + + if a.cfg.UseTLS { + // TLS connection + tlsConfig := &tls.Config{ + ServerName: a.cfg.SMTPHost, + } + + conn, err := tls.Dial("tcp", addr, tlsConfig) + if err != nil { + return fmt.Errorf("TLS dial failed: %w", err) + } + defer conn.Close() + + client, err := smtp.NewClient(conn, a.cfg.SMTPHost) + if err != nil { + return fmt.Errorf("SMTP client failed: %w", err) + } + defer client.Close() + + if auth != nil { + if err := client.Auth(auth); err != nil { + return fmt.Errorf("SMTP auth failed: %w", err) + } + } + + if err := client.Mail(a.cfg.SMTPFrom); err != nil { + return fmt.Errorf("SMTP MAIL failed: %w", err) + } + + for _, to := range a.cfg.SMTPTo { + if err := client.Rcpt(to); err != nil { + return fmt.Errorf("SMTP RCPT failed: %w", err) + } + } + + w, err := client.Data() + if err != nil { + return fmt.Errorf("SMTP DATA failed: %w", err) + } + + _, err = w.Write(msg.Bytes()) + if err != nil { + return fmt.Errorf("SMTP write failed: %w", err) + } + + return w.Close() + } + + // Non-TLS (STARTTLS) + return smtp.SendMail(addr, auth, a.cfg.SMTPFrom, 
a.cfg.SMTPTo, msg.Bytes()) +} + +// GetAlertHistory returns recent alert events +func (a *Alerter) GetAlertHistory() []AlertEvent { + a.mu.Lock() + defer a.mu.Unlock() + result := make([]AlertEvent, len(a.alertHistory)) + copy(result, a.alertHistory) + return result +} + +// TestAlert sends a test alert email +func (a *Alerter) TestAlert() error { + if !a.cfg.Enabled || a.cfg.SMTPHost == "" { + return fmt.Errorf("alerting not configured") + } + + subject := "[ProxmoxVED] Test Alert" + body := fmt.Sprintf(`This is a test alert from ProxmoxVE Helper Scripts telemetry service. + +If you received this email, your alert configuration is working correctly. + +Time: %s +SMTP Host: %s +Recipients: %s + +--- +This is an automated test message. +`, time.Now().Format(time.RFC1123), a.cfg.SMTPHost, strings.Join(a.cfg.SMTPTo, ", ")) + + return a.sendEmail(subject, body) +} + +// Helper for timeout context +func newTimeoutContext(d time.Duration) (context.Context, context.CancelFunc) { + return context.WithTimeout(context.Background(), d) +} diff --git a/misc/data/cache.go b/misc/data/cache.go new file mode 100644 index 000000000..54cc5f5c5 --- /dev/null +++ b/misc/data/cache.go @@ -0,0 +1,158 @@ +package main + +import ( + "context" + "encoding/json" + "log" + "sync" + "time" + + "github.com/redis/go-redis/v9" +) + +// CacheConfig holds cache configuration +type CacheConfig struct { + RedisURL string + EnableRedis bool + DefaultTTL time.Duration +} + +// Cache provides caching functionality with Redis or in-memory fallback +type Cache struct { + redis *redis.Client + useRedis bool + defaultTTL time.Duration + + // In-memory fallback + mu sync.RWMutex + memData map[string]cacheEntry +} + +type cacheEntry struct { + data []byte + expiresAt time.Time +} + +// NewCache creates a new cache instance +func NewCache(cfg CacheConfig) *Cache { + c := &Cache{ + defaultTTL: cfg.DefaultTTL, + memData: make(map[string]cacheEntry), + } + + if cfg.EnableRedis && cfg.RedisURL != "" { + opts, err := redis.ParseURL(cfg.RedisURL) + if err != nil { + log.Printf("WARN: invalid redis URL, using in-memory cache: %v", err) + return c + } + + client := redis.NewClient(opts) + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + if err := client.Ping(ctx).Err(); err != nil { + log.Printf("WARN: redis connection failed, using in-memory cache: %v", err) + return c + } + + c.redis = client + c.useRedis = true + log.Printf("INFO: connected to Redis for caching") + } + + // Start cleanup goroutine for in-memory cache + if !c.useRedis { + go c.cleanupLoop() + } + + return c +} + +func (c *Cache) cleanupLoop() { + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + for range ticker.C { + c.mu.Lock() + now := time.Now() + for k, v := range c.memData { + if now.After(v.expiresAt) { + delete(c.memData, k) + } + } + c.mu.Unlock() + } +} + +// Get retrieves a value from cache +func (c *Cache) Get(ctx context.Context, key string, dest interface{}) bool { + if c.useRedis { + data, err := c.redis.Get(ctx, key).Bytes() + if err != nil { + return false + } + return json.Unmarshal(data, dest) == nil + } + + // In-memory fallback + c.mu.RLock() + entry, ok := c.memData[key] + c.mu.RUnlock() + + if !ok || time.Now().After(entry.expiresAt) { + return false + } + + return json.Unmarshal(entry.data, dest) == nil +} + +// Set stores a value in cache +func (c *Cache) Set(ctx context.Context, key string, value interface{}, ttl time.Duration) error { + if ttl == 0 { + ttl = c.defaultTTL + } + + data, 
err := json.Marshal(value) + if err != nil { + return err + } + + if c.useRedis { + return c.redis.Set(ctx, key, data, ttl).Err() + } + + // In-memory fallback + c.mu.Lock() + c.memData[key] = cacheEntry{ + data: data, + expiresAt: time.Now().Add(ttl), + } + c.mu.Unlock() + + return nil +} + +// Delete removes a key from cache +func (c *Cache) Delete(ctx context.Context, key string) error { + if c.useRedis { + return c.redis.Del(ctx, key).Err() + } + + c.mu.Lock() + delete(c.memData, key) + c.mu.Unlock() + return nil +} + +// InvalidateDashboard clears dashboard cache +func (c *Cache) InvalidateDashboard(ctx context.Context) { + // Delete all dashboard cache keys + for days := 1; days <= 365; days++ { + _ = c.Delete(ctx, dashboardCacheKey(days)) + } +} + +func dashboardCacheKey(days int) string { + return "dashboard:" + string(rune(days)) +} diff --git a/misc/data/dashboard.go b/misc/data/dashboard.go index 34edbbf40..fb6b28955 100644 --- a/misc/data/dashboard.go +++ b/misc/data/dashboard.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" "net/url" + "strings" "time" ) @@ -19,6 +20,10 @@ type DashboardData struct { TopApps []AppCount `json:"top_apps"` OsDistribution []OsCount `json:"os_distribution"` MethodStats []MethodCount `json:"method_stats"` + PveVersions []PveCount `json:"pve_versions"` + TypeStats []TypeCount `json:"type_stats"` + ErrorAnalysis []ErrorGroup `json:"error_analysis"` + FailedApps []AppFailure `json:"failed_apps"` RecentRecords []TelemetryRecord `json:"recent_records"` DailyStats []DailyStat `json:"daily_stats"` } @@ -38,6 +43,29 @@ type MethodCount struct { Count int `json:"count"` } +type PveCount struct { + Version string `json:"version"` + Count int `json:"count"` +} + +type TypeCount struct { + Type string `json:"type"` + Count int `json:"count"` +} + +type ErrorGroup struct { + Pattern string `json:"pattern"` + Count int `json:"count"` + Apps string `json:"apps"` // Comma-separated list of affected apps +} + +type AppFailure struct { + App string `json:"app"` + TotalCount int `json:"total_count"` + FailedCount int `json:"failed_count"` + FailureRate float64 `json:"failure_rate"` +} + type DailyStat struct { Date string `json:"date"` Success int `json:"success"` @@ -64,8 +92,12 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard // Aggregate statistics appCounts := make(map[string]int) + appFailures := make(map[string]int) osCounts := make(map[string]int) methodCounts := make(map[string]int) + pveCounts := make(map[string]int) + typeCounts := make(map[string]int) + errorPatterns := make(map[string]map[string]bool) // pattern -> set of apps dailySuccess := make(map[string]int) dailyFailed := make(map[string]int) @@ -77,6 +109,20 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard data.SuccessCount++ case "failed": data.FailedCount++ + // Track failed apps + if r.NSAPP != "" { + appFailures[r.NSAPP]++ + } + // Group errors by pattern + if r.Error != "" { + pattern := normalizeError(r.Error) + if errorPatterns[pattern] == nil { + errorPatterns[pattern] = make(map[string]bool) + } + if r.NSAPP != "" { + errorPatterns[pattern][r.NSAPP] = true + } + } case "installing": data.InstallingCount++ } @@ -96,6 +142,16 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard methodCounts[r.Method]++ } + // Count PVE versions + if r.PveVer != "" { + pveCounts[r.PveVer]++ + } + + // Count types (LXC vs VM) + if r.Type != "" { + typeCounts[r.Type]++ + } + // Daily stats (use Created field if 
available) if r.Created != "" { date := r.Created[:10] // "2026-02-09" @@ -117,6 +173,14 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int) (*Dashboard data.TopApps = topN(appCounts, 10) data.OsDistribution = topNOs(osCounts, 10) data.MethodStats = topNMethod(methodCounts, 10) + data.PveVersions = topNPve(pveCounts, 10) + data.TypeStats = topNType(typeCounts, 10) + + // Error analysis + data.ErrorAnalysis = buildErrorAnalysis(errorPatterns, 10) + + // Failed apps with failure rates + data.FailedApps = buildFailedApps(appCounts, appFailures, 10) // Daily stats for chart data.DailyStats = buildDailyStats(dailySuccess, dailyFailed, days) @@ -234,6 +298,158 @@ func topNMethod(m map[string]int, n int) []MethodCount { return result } +func topNPve(m map[string]int, n int) []PveCount { + result := make([]PveCount, 0, len(m)) + for k, v := range m { + result = append(result, PveCount{Version: k, Count: v}) + } + for i := 0; i < len(result)-1; i++ { + for j := i + 1; j < len(result); j++ { + if result[j].Count > result[i].Count { + result[i], result[j] = result[j], result[i] + } + } + } + if len(result) > n { + return result[:n] + } + return result +} + +func topNType(m map[string]int, n int) []TypeCount { + result := make([]TypeCount, 0, len(m)) + for k, v := range m { + result = append(result, TypeCount{Type: k, Count: v}) + } + for i := 0; i < len(result)-1; i++ { + for j := i + 1; j < len(result); j++ { + if result[j].Count > result[i].Count { + result[i], result[j] = result[j], result[i] + } + } + } + if len(result) > n { + return result[:n] + } + return result +} + +// normalizeError simplifies error messages into patterns for grouping +func normalizeError(err string) string { + err = strings.TrimSpace(err) + if err == "" { + return "unknown" + } + + // Normalize common patterns + err = strings.ToLower(err) + + // Remove specific numbers, IPs, paths that vary + // Keep it simple for now - just truncate and normalize + if len(err) > 60 { + err = err[:60] + } + + // Common error pattern replacements + patterns := map[string]string{ + "connection refused": "connection refused", + "timeout": "timeout", + "no space left": "disk full", + "permission denied": "permission denied", + "not found": "not found", + "failed to download": "download failed", + "apt": "apt error", + "dpkg": "dpkg error", + "curl": "network error", + "wget": "network error", + "docker": "docker error", + "systemctl": "systemd error", + "service": "service error", + } + + for pattern, label := range patterns { + if strings.Contains(err, pattern) { + return label + } + } + + // If no pattern matches, return first 40 chars + if len(err) > 40 { + return err[:40] + "..." + } + return err +} + +func buildErrorAnalysis(patterns map[string]map[string]bool, n int) []ErrorGroup { + result := make([]ErrorGroup, 0, len(patterns)) + + for pattern, apps := range patterns { + appList := make([]string, 0, len(apps)) + for app := range apps { + appList = append(appList, app) + } + + // Limit app list display + appsStr := strings.Join(appList, ", ") + if len(appsStr) > 50 { + appsStr = appsStr[:47] + "..." 
+ } + + result = append(result, ErrorGroup{ + Pattern: pattern, + Count: len(apps), // Number of unique apps with this error + Apps: appsStr, + }) + } + + // Sort by count descending + for i := 0; i < len(result)-1; i++ { + for j := i + 1; j < len(result); j++ { + if result[j].Count > result[i].Count { + result[i], result[j] = result[j], result[i] + } + } + } + + if len(result) > n { + return result[:n] + } + return result +} + +func buildFailedApps(total, failed map[string]int, n int) []AppFailure { + result := make([]AppFailure, 0) + + for app, failCount := range failed { + totalCount := total[app] + if totalCount == 0 { + continue + } + + rate := float64(failCount) / float64(totalCount) * 100 + result = append(result, AppFailure{ + App: app, + TotalCount: totalCount, + FailedCount: failCount, + FailureRate: rate, + }) + } + + // Sort by failure rate descending + for i := 0; i < len(result)-1; i++ { + for j := i + 1; j < len(result); j++ { + if result[j].FailureRate > result[i].FailureRate { + result[i], result[j] = result[j], result[i] + } + } + } + + if len(result) > n { + return result[:n] + } + return result +} + func buildDailyStats(success, failed map[string]int, days int) []DailyStat { result := make([]DailyStat, 0, days) for i := days - 1; i >= 0; i-- { @@ -254,7 +470,9 @@ func DashboardHTML() string { - Telemetry Dashboard - Community Scripts + Telemetry Dashboard - ProxmoxVE Helper Scripts + + @@ -510,8 +926,13 @@ func DashboardHTML() string { + + + @@ -539,6 +960,17 @@ func DashboardHTML() string {
 [HTML template hunk; markup lost in extraction] Adds "LXC / VM" and "Proxmox VE Versions" stat cards next to the existing "Success Rate" card, each with a "Loading..." placeholder.
@@ -577,6 +1009,34 @@ func DashboardHTML() string {
 [HTML template hunk; markup lost in extraction] Adds "Error Analysis" and "Apps with Highest Failure Rates" panels above the "Recent Installations" section, each with a "Loading..." placeholder.
@@ -599,19 +1059,63 @@ func DashboardHTML() string {
 [HTML template hunk; markup lost in extraction] Adds a "Resources" column to the "Recent Installations" table (OS, Type, Method, Resources, Exit Code, Error) and updates its "Loading..." placeholder row.
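For reference, a minimal sketch of how the Cache added in cache.go can wrap PBClient.FetchDashboardData from dashboard.go. The actual wiring lives in service.go (not shown in this excerpt); the helper name and the decimal cache-key format below are assumptions, not code from the patch.

package main

import (
	"context"
	"fmt"
	"log"
)

// cachedDashboardData is illustrative only: it shows Cache.Get/Set from
// cache.go wrapping PBClient.FetchDashboardData from dashboard.go. The
// patch's real integration is in service.go; this helper and its key
// format are assumptions.
func cachedDashboardData(ctx context.Context, cache *Cache, pb *PBClient, days int) (*DashboardData, error) {
	// Decimal key for readability; note that the patch's dashboardCacheKey
	// uses string(rune(days)), which encodes the day count as a single
	// codepoint rather than as digits.
	key := fmt.Sprintf("dashboard:%d", days)

	var data DashboardData
	if cache.Get(ctx, key, &data) {
		return &data, nil // hit in Redis or the in-memory map
	}

	fresh, err := pb.FetchDashboardData(ctx, days)
	if err != nil {
		return nil, err
	}

	// Best-effort write; a TTL of 0 falls back to the configured default
	// (seeded from CACHE_TTL_SECONDS).
	if err := cache.Set(ctx, key, fresh, 0); err != nil {
		log.Printf("WARN: cache set failed: %v", err)
	}
	return fresh, nil
}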
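Similarly, the Dockerfile documents the ALERT_*/SMTP_* environment variables and alerts.go defines AlertConfig, but the code that maps one onto the other sits in service.go, outside this excerpt. A hedged sketch of that mapping, assuming the defaults suggested by the Dockerfile; the helper name and fallback logic are illustrative, not the patch's own.

package main

import (
	"os"
	"strconv"
	"strings"
	"time"
)

// alertConfigFromEnv is an illustrative sketch of mapping the documented
// environment variables onto AlertConfig from alerts.go. Defaults mirror
// the values set or commented in the Dockerfile; the real parsing in
// service.go may differ.
func alertConfigFromEnv() AlertConfig {
	port, err := strconv.Atoi(os.Getenv("SMTP_PORT"))
	if err != nil || port == 0 {
		port = 587 // commented-out Dockerfile default
	}
	threshold, err := strconv.ParseFloat(os.Getenv("ALERT_FAILURE_THRESHOLD"), 64)
	if err != nil || threshold == 0 {
		threshold = 20.0
	}
	checkMin, _ := strconv.Atoi(os.Getenv("ALERT_CHECK_INTERVAL_MIN"))
	if checkMin == 0 {
		checkMin = 15
	}
	cooldownMin, _ := strconv.Atoi(os.Getenv("ALERT_COOLDOWN_MIN"))
	if cooldownMin == 0 {
		cooldownMin = 60
	}

	// SMTP_TO is assumed to be a comma-separated recipient list.
	var to []string
	for _, addr := range strings.Split(os.Getenv("SMTP_TO"), ",") {
		if addr = strings.TrimSpace(addr); addr != "" {
			to = append(to, addr)
		}
	}

	return AlertConfig{
		Enabled:          os.Getenv("ALERT_ENABLED") == "true",
		SMTPHost:         os.Getenv("SMTP_HOST"),
		SMTPPort:         port,
		SMTPUser:         os.Getenv("SMTP_USER"),
		SMTPPassword:     os.Getenv("SMTP_PASSWORD"),
		SMTPFrom:         os.Getenv("SMTP_FROM"),
		SMTPTo:           to,
		UseTLS:           os.Getenv("SMTP_USE_TLS") == "true",
		FailureThreshold: threshold,
		CheckInterval:    time.Duration(checkMin) * time.Minute,
		Cooldown:         time.Duration(cooldownMin) * time.Minute,
	}
}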
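Finally, because the new error-analysis table is only as useful as the grouping in normalizeError (dashboard.go above), here is a small illustration of how raw installer errors collapse into patterns. The sample messages are invented; each matches exactly one substring pattern, so the result is deterministic even though the pattern map is iterated in random order.

package main

import "fmt"

// demoErrorGrouping demonstrates the grouping performed by normalizeError
// from dashboard.go using invented sample errors.
func demoErrorGrouping() {
	samples := []string{
		"No space left on device",                  // -> "disk full"
		"Permission denied: /etc/nginx/nginx.conf", // -> "permission denied"
		"dpkg was interrupted, you must manually run 'dpkg --configure -a'", // -> "dpkg error"
		"E: Unable to locate package foobar",       // no pattern match, returned as-is (lowercased)
	}
	for _, s := range samples {
		fmt.Printf("%-70s => %s\n", s, normalizeError(s))
	}
}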