- Add Redis/in-memory caching layer (cache.go)
- Add SMTP alerting for high failure rates (alerts.go)
- Add data migration script from old API (migrate.go)
- Add docker-compose.yml for easy deployment
- Move dashboard to / with redirect from /dashboard
- Add dark/light mode toggle
- Add error analysis and failed apps statistics
- Add PVE version and LXC/VM type stats
- Add /metrics Prometheus endpoint
- Add /api/records pagination endpoint
- Add CSV export functionality
- Enhance healthcheck with PB connection status

New ENV vars:
- Cache: ENABLE_CACHE, CACHE_TTL_SECONDS, ENABLE_REDIS, REDIS_URL
- Alerts: ALERT_ENABLED, SMTP_*, ALERT_FAILURE_THRESHOLD, etc. (loading sketched below)
- Migration: RUN_MIGRATION, MIGRATION_REQUIRED, MIGRATION_SOURCE_URL
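
For orientation, a minimal sketch of how the alert-related variables could be read into the AlertConfig defined in alerts.go below. The expansion of SMTP_* into concrete names (SMTP_HOST, SMTP_PORT, SMTP_USER, SMTP_PASSWORD, SMTP_FROM, SMTP_TO, SMTP_TLS) and the interval/cooldown defaults are illustrative assumptions, not confirmed by this change:

// config_env.go (hypothetical file, same package as alerts.go)
package main

import (
    "os"
    "strconv"
    "strings"
    "time"
)

// loadAlertConfigFromEnv maps environment variables onto AlertConfig.
// Names other than ALERT_ENABLED and ALERT_FAILURE_THRESHOLD are
// guesses at the SMTP_* family; parse errors are ignored for brevity.
func loadAlertConfigFromEnv() AlertConfig {
    port, _ := strconv.Atoi(os.Getenv("SMTP_PORT"))
    threshold, _ := strconv.ParseFloat(os.Getenv("ALERT_FAILURE_THRESHOLD"), 64)
    return AlertConfig{
        Enabled:          os.Getenv("ALERT_ENABLED") == "true",
        SMTPHost:         os.Getenv("SMTP_HOST"),
        SMTPPort:         port,
        SMTPUser:         os.Getenv("SMTP_USER"),
        SMTPPassword:     os.Getenv("SMTP_PASSWORD"),
        SMTPFrom:         os.Getenv("SMTP_FROM"),
        SMTPTo:           strings.Split(os.Getenv("SMTP_TO"), ","),
        UseTLS:           os.Getenv("SMTP_TLS") == "true",
        FailureThreshold: threshold,       // e.g., 20.0 = alert above 20% failures
        CheckInterval:    5 * time.Minute, // assumed default
        Cooldown:         time.Hour,       // assumed default
    }
}
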
package main

import (
    "bytes"
    "context"
    "crypto/tls"
    "fmt"
    "log"
    "net/smtp"
    "strings"
    "sync"
    "time"
)

// AlertConfig holds SMTP alert configuration
type AlertConfig struct {
    Enabled          bool
    SMTPHost         string
    SMTPPort         int
    SMTPUser         string
    SMTPPassword     string
    SMTPFrom         string
    SMTPTo           []string
    UseTLS           bool
    FailureThreshold float64       // Alert when failure rate exceeds this (e.g., 20.0 = 20%)
    CheckInterval    time.Duration // How often to check
    Cooldown         time.Duration // Minimum time between alerts
}

// Alerter handles alerting functionality
type Alerter struct {
    cfg          AlertConfig
    lastAlertAt  time.Time
    mu           sync.Mutex
    pb           *PBClient
    lastStats    alertStats
    alertHistory []AlertEvent
}

type alertStats struct {
    successCount int
    failedCount  int
    checkedAt    time.Time
}

// AlertEvent records an alert that was sent
type AlertEvent struct {
    Timestamp   time.Time `json:"timestamp"`
    Type        string    `json:"type"`
    Message     string    `json:"message"`
    FailureRate float64   `json:"failure_rate,omitempty"`
}

// NewAlerter creates a new alerter instance
func NewAlerter(cfg AlertConfig, pb *PBClient) *Alerter {
    return &Alerter{
        cfg:          cfg,
        pb:           pb,
        alertHistory: make([]AlertEvent, 0),
    }
}

// Start begins the alert monitoring loop
func (a *Alerter) Start() {
    if !a.cfg.Enabled {
        log.Println("INFO: alerting disabled")
        return
    }

    if a.cfg.SMTPHost == "" || len(a.cfg.SMTPTo) == 0 {
        log.Println("WARN: alerting enabled but SMTP not configured")
        return
    }

    go a.monitorLoop()
    log.Printf("INFO: alert monitoring started (threshold: %.1f%%, interval: %v)", a.cfg.FailureThreshold, a.cfg.CheckInterval)
}

func (a *Alerter) monitorLoop() {
    ticker := time.NewTicker(a.cfg.CheckInterval)
    defer ticker.Stop()

    for range ticker.C {
        a.checkAndAlert()
    }
}

func (a *Alerter) checkAndAlert() {
    ctx, cancel := newTimeoutContext(10 * time.Second)
    defer cancel()

    // Fetch last hour's data
    data, err := a.pb.FetchDashboardData(ctx, 1)
    if err != nil {
        log.Printf("WARN: alert check failed: %v", err)
        return
    }

    // Calculate current failure rate
    total := data.SuccessCount + data.FailedCount
    if total < 10 {
        // Not enough data to determine rate
        return
    }

    failureRate := float64(data.FailedCount) / float64(total) * 100

    // Check if we should alert
    if failureRate >= a.cfg.FailureThreshold {
        a.maybeSendAlert(failureRate, data.FailedCount, total)
    }
}

func (a *Alerter) maybeSendAlert(rate float64, failed, total int) {
    a.mu.Lock()
    defer a.mu.Unlock()

    // Check cooldown
    if time.Since(a.lastAlertAt) < a.cfg.Cooldown {
        return
    }

    // Send alert (stats window matches the one-hour fetch in checkAndAlert)
    subject := fmt.Sprintf("[ProxmoxVED Alert] High Failure Rate: %.1f%%", rate)
    body := fmt.Sprintf(`ProxmoxVE Helper Scripts - Telemetry Alert

⚠️ High installation failure rate detected!

Current Statistics (last hour):
- Failure Rate: %.1f%%
- Failed Installations: %d
- Total Installations: %d
- Threshold: %.1f%%

Time: %s

Please check the dashboard for more details.

---
This is an automated alert from the telemetry service.
`, rate, failed, total, a.cfg.FailureThreshold, time.Now().Format(time.RFC1123))

    if err := a.sendEmail(subject, body); err != nil {
        log.Printf("ERROR: failed to send alert email: %v", err)
        return
    }

    a.lastAlertAt = time.Now()
    a.alertHistory = append(a.alertHistory, AlertEvent{
        Timestamp:   time.Now(),
        Type:        "high_failure_rate",
        Message:     fmt.Sprintf("Failure rate %.1f%% exceeded threshold %.1f%%", rate, a.cfg.FailureThreshold),
        FailureRate: rate,
    })

    // Keep only last 100 alerts
    if len(a.alertHistory) > 100 {
        a.alertHistory = a.alertHistory[len(a.alertHistory)-100:]
    }

    log.Printf("ALERT: sent high failure rate alert (%.1f%%)", rate)
}

func (a *Alerter) sendEmail(subject, body string) error {
    // Build message
    var msg bytes.Buffer
    msg.WriteString(fmt.Sprintf("From: %s\r\n", a.cfg.SMTPFrom))
    msg.WriteString(fmt.Sprintf("To: %s\r\n", strings.Join(a.cfg.SMTPTo, ", ")))
    msg.WriteString(fmt.Sprintf("Subject: %s\r\n", subject))
    msg.WriteString("MIME-Version: 1.0\r\n")
    msg.WriteString("Content-Type: text/plain; charset=UTF-8\r\n")
    msg.WriteString("\r\n")
    msg.WriteString(body)

    addr := fmt.Sprintf("%s:%d", a.cfg.SMTPHost, a.cfg.SMTPPort)

    var auth smtp.Auth
    if a.cfg.SMTPUser != "" && a.cfg.SMTPPassword != "" {
        auth = smtp.PlainAuth("", a.cfg.SMTPUser, a.cfg.SMTPPassword, a.cfg.SMTPHost)
    }

    if a.cfg.UseTLS {
        // Implicit TLS connection (e.g., SMTPS on port 465)
        tlsConfig := &tls.Config{
            ServerName: a.cfg.SMTPHost,
        }

        conn, err := tls.Dial("tcp", addr, tlsConfig)
        if err != nil {
            return fmt.Errorf("TLS dial failed: %w", err)
        }
        defer conn.Close()

        client, err := smtp.NewClient(conn, a.cfg.SMTPHost)
        if err != nil {
            return fmt.Errorf("SMTP client failed: %w", err)
        }
        defer client.Close()

        if auth != nil {
            if err := client.Auth(auth); err != nil {
                return fmt.Errorf("SMTP auth failed: %w", err)
            }
        }

        if err := client.Mail(a.cfg.SMTPFrom); err != nil {
            return fmt.Errorf("SMTP MAIL failed: %w", err)
        }

        for _, to := range a.cfg.SMTPTo {
            if err := client.Rcpt(to); err != nil {
                return fmt.Errorf("SMTP RCPT failed: %w", err)
            }
        }

        w, err := client.Data()
        if err != nil {
            return fmt.Errorf("SMTP DATA failed: %w", err)
        }

        _, err = w.Write(msg.Bytes())
        if err != nil {
            return fmt.Errorf("SMTP write failed: %w", err)
        }

        return w.Close()
    }

    // Plain connection; smtp.SendMail upgrades to STARTTLS when the server advertises it
    return smtp.SendMail(addr, auth, a.cfg.SMTPFrom, a.cfg.SMTPTo, msg.Bytes())
}

// GetAlertHistory returns recent alert events
func (a *Alerter) GetAlertHistory() []AlertEvent {
    a.mu.Lock()
    defer a.mu.Unlock()
    result := make([]AlertEvent, len(a.alertHistory))
    copy(result, a.alertHistory)
    return result
}

// TestAlert sends a test alert email
func (a *Alerter) TestAlert() error {
    if !a.cfg.Enabled || a.cfg.SMTPHost == "" {
        return fmt.Errorf("alerting not configured")
    }

    subject := "[ProxmoxVED] Test Alert"
    body := fmt.Sprintf(`This is a test alert from ProxmoxVE Helper Scripts telemetry service.

If you received this email, your alert configuration is working correctly.

Time: %s
SMTP Host: %s
Recipients: %s

---
This is an automated test message.
`, time.Now().Format(time.RFC1123), a.cfg.SMTPHost, strings.Join(a.cfg.SMTPTo, ", "))

    return a.sendEmail(subject, body)
}

// Helper for timeout context
func newTimeoutContext(d time.Duration) (context.Context, context.CancelFunc) {
    return context.WithTimeout(context.Background(), d)
}
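
A wiring sketch, assuming a PBClient is constructed elsewhere in the service (its constructor is not part of this file): build an AlertConfig, start the monitor, and optionally fire a test email to validate the SMTP path. Start is a no-op when alerting is disabled or SMTP is unconfigured, so calling it unconditionally is safe.

// Hypothetical startup wiring, e.g. from main(); the values shown are examples.
func startAlerting(pb *PBClient) *Alerter {
    cfg := AlertConfig{
        Enabled:          true,
        SMTPHost:         "smtp.example.com",
        SMTPPort:         587,
        SMTPFrom:         "alerts@example.com",
        SMTPTo:           []string{"admin@example.com"},
        FailureThreshold: 20.0,            // alert above 20% failures
        CheckInterval:    5 * time.Minute, // how often checkAndAlert runs
        Cooldown:         time.Hour,       // at most one alert per hour
    }
    alerter := NewAlerter(cfg, pb)
    alerter.Start()

    // Optionally verify the SMTP path end-to-end before relying on alerts.
    if err := alerter.TestAlert(); err != nil {
        log.Printf("WARN: test alert failed: %v", err)
    }
    return alerter
}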
|