From f04f7601c08d577fca693e69ae7d57bb01a59b63 Mon Sep 17 00:00:00 2001 From: Andrej Mickov Date: Fri, 27 Feb 2026 14:59:23 +0100 Subject: [PATCH] Initial metrics endpoint added in Prometheus style --- README.md | 2 + api/api.go | 14 + auth/middleware.go | 35 ++- logging/logging.go | 44 ++- logging/logging_metrics_test.go | 30 ++ metadata/metadata.go | 71 +++-- metrics/metrics.go | 471 ++++++++++++++++++++++++++++++++ service/service.go | 89 +++++- storage/blob.go | 39 ++- 9 files changed, 754 insertions(+), 41 deletions(-) create mode 100644 logging/logging_metrics_test.go create mode 100644 metrics/metrics.go diff --git a/README.md b/README.md index f9fee41..1e57191 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,8 @@ Reference: `auth/README.md` Health: - `GET /healthz` - `HEAD /healthz` +- `GET /metrics` (Prometheus exposition format) +- `HEAD /metrics` ## Limitations diff --git a/api/api.go b/api/api.go index 9b28d21..90d2589 100644 --- a/api/api.go +++ b/api/api.go @@ -10,6 +10,7 @@ import ( "fs/auth" "fs/logging" "fs/metadata" + "fs/metrics" "fs/models" "fs/service" "io" @@ -70,6 +71,8 @@ func (h *Handler) setupRoutes() { h.router.Get("/healthz", h.handleHealth) h.router.Head("/healthz", h.handleHealth) + h.router.Get("/metrics", h.handleMetrics) + h.router.Head("/metrics", h.handleMetrics) h.router.Get("/", h.handleGetBuckets) h.router.Get("/{bucket}/", h.handleGetBucket) @@ -106,6 +109,17 @@ func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request) { } } +func (h *Handler) handleMetrics(w http.ResponseWriter, r *http.Request) { + payload := metrics.Default.RenderPrometheus() + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + w.Header().Set("Content-Length", strconv.Itoa(len(payload))) + w.WriteHeader(http.StatusOK) + if r.Method == http.MethodHead { + return + } + _, _ = w.Write([]byte(payload)) +} + func validateObjectKey(key string) *s3APIError { if key == "" { err := s3ErrInvalidObjectKey diff --git a/auth/middleware.go b/auth/middleware.go index eb0b902..63cebbd 100644 --- a/auth/middleware.go +++ b/auth/middleware.go @@ -1,6 +1,8 @@ package auth import ( + "errors" + "fs/metrics" "log/slog" "net" "net/http" @@ -18,17 +20,20 @@ func Middleware( return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { authCtx := RequestContext{Authenticated: false, AuthType: "none"} if svc == nil || !svc.Config().Enabled { + metrics.Default.ObserveAuth("bypass", "disabled", "auth_disabled") next.ServeHTTP(w, r.WithContext(WithRequestContext(r.Context(), authCtx))) return } - if r.URL.Path == "/healthz" { + if r.URL.Path == "/healthz" || r.URL.Path == "/metrics" { + metrics.Default.ObserveAuth("bypass", "none", "public_endpoint") next.ServeHTTP(w, r.WithContext(WithRequestContext(r.Context(), authCtx))) return } resolvedCtx, err := svc.AuthenticateRequest(r) if err != nil { + metrics.Default.ObserveAuth("error", "sigv4", authErrorClass(err)) if auditEnabled && logger != nil { requestID := middleware.GetReqID(r.Context()) attrs := []any{ @@ -50,6 +55,7 @@ func Middleware( return } + metrics.Default.ObserveAuth("ok", resolvedCtx.AuthType, "none") if auditEnabled && logger != nil { requestID := middleware.GetReqID(r.Context()) attrs := []any{ @@ -69,6 +75,33 @@ func Middleware( } } +func authErrorClass(err error) string { + switch { + case errors.Is(err, ErrInvalidAccessKeyID): + return "invalid_access_key" + case errors.Is(err, ErrSignatureDoesNotMatch): + return "signature_mismatch" + case errors.Is(err, ErrAuthorizationHeaderMalformed): + return "auth_header_malformed" + case errors.Is(err, ErrRequestTimeTooSkewed): + return "time_skew" + case errors.Is(err, ErrExpiredToken): + return "expired_token" + case errors.Is(err, ErrNoAuthCredentials): + return "missing_credentials" + case errors.Is(err, ErrUnsupportedAuthScheme): + return "unsupported_auth_scheme" + case errors.Is(err, ErrInvalidPresign): + return "invalid_presign" + case errors.Is(err, ErrCredentialDisabled): + return "credential_disabled" + case errors.Is(err, ErrAccessDenied): + return "access_denied" + default: + return "other" + } +} + func clientIP(remoteAddr string) string { host, _, err := net.SplitHostPort(remoteAddr) if err == nil && host != "" { diff --git a/logging/logging.go b/logging/logging.go index 28e1985..186607f 100644 --- a/logging/logging.go +++ b/logging/logging.go @@ -1,6 +1,7 @@ package logging import ( + "fs/metrics" "log/slog" "net/http" "os" @@ -9,6 +10,7 @@ import ( "strings" "time" + "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" ) @@ -86,6 +88,8 @@ func HTTPMiddleware(logger *slog.Logger, cfg Config) func(http.Handler) http.Han return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor) + metrics.Default.IncHTTPInFlight() + defer metrics.Default.DecHTTPInFlight() requestID := middleware.GetReqID(r.Context()) if requestID != "" { ww.Header().Set("x-amz-request-id", requestID) @@ -93,15 +97,18 @@ func HTTPMiddleware(logger *slog.Logger, cfg Config) func(http.Handler) http.Han next.ServeHTTP(ww, r) - if !cfg.Audit && !cfg.DebugMode { - return - } - elapsed := time.Since(start) status := ww.Status() if status == 0 { status = http.StatusOK } + route := metricRouteLabel(r) + metrics.Default.ObserveHTTPRequest(r.Method, route, status, elapsed, ww.BytesWritten()) + + if !cfg.Audit && !cfg.DebugMode { + return + } + attrs := []any{ "method", r.Method, "path", r.URL.Path, @@ -131,6 +138,35 @@ func HTTPMiddleware(logger *slog.Logger, cfg Config) func(http.Handler) http.Han } } +func metricRouteLabel(r *http.Request) string { + if r == nil || r.URL == nil { + return "/unknown" + } + + if routeCtx := chi.RouteContext(r.Context()); routeCtx != nil { + if pattern := strings.TrimSpace(routeCtx.RoutePattern()); pattern != "" { + return pattern + } + } + + path := strings.TrimSpace(r.URL.Path) + if path == "" || path == "/" { + return "/" + } + if path == "/healthz" || path == "/metrics" { + return path + } + + trimmed := strings.Trim(path, "/") + if trimmed == "" { + return "/" + } + if !strings.Contains(trimmed, "/") { + return "/{bucket}" + } + return "/{bucket}/*" +} + func envBool(key string, defaultValue bool) bool { raw := os.Getenv(key) if raw == "" { diff --git a/logging/logging_metrics_test.go b/logging/logging_metrics_test.go new file mode 100644 index 0000000..2a45b4b --- /dev/null +++ b/logging/logging_metrics_test.go @@ -0,0 +1,30 @@ +package logging + +import ( + "net/http/httptest" + "testing" +) + +func TestMetricRouteLabelFallbacks(t *testing.T) { + testCases := []struct { + name string + path string + want string + }{ + {name: "root", path: "/", want: "/"}, + {name: "health", path: "/healthz", want: "/healthz"}, + {name: "metrics", path: "/metrics", want: "/metrics"}, + {name: "bucket", path: "/some-bucket", want: "/{bucket}"}, + {name: "object", path: "/some-bucket/private/path/file.jpg", want: "/{bucket}/*"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + req := httptest.NewRequest("GET", tc.path, nil) + got := metricRouteLabel(req) + if got != tc.want { + t.Fatalf("metricRouteLabel(%q) = %q, want %q", tc.path, got, tc.want) + } + }) + } +} diff --git a/metadata/metadata.go b/metadata/metadata.go index bf3a753..ff0c804 100644 --- a/metadata/metadata.go +++ b/metadata/metadata.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "fs/metrics" "fs/models" "net" "regexp" @@ -47,7 +48,7 @@ func NewMetadataHandler(dbPath string) (*MetadataHandler, error) { } h := &MetadataHandler{db: db} - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(systemIndex) return err }) @@ -55,7 +56,7 @@ func NewMetadataHandler(dbPath string) (*MetadataHandler, error) { _ = db.Close() return nil, err } - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(multipartUploadIndex) return err }) @@ -63,7 +64,7 @@ func NewMetadataHandler(dbPath string) (*MetadataHandler, error) { _ = db.Close() return nil, err } - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(multipartUploadPartsIndex) return err }) @@ -71,7 +72,7 @@ func NewMetadataHandler(dbPath string) (*MetadataHandler, error) { _ = db.Close() return nil, err } - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(authIdentitiesIndex) return err }) @@ -79,7 +80,7 @@ func NewMetadataHandler(dbPath string) (*MetadataHandler, error) { _ = db.Close() return nil, err } - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(authPoliciesIndex) return err }) @@ -119,6 +120,20 @@ func (h *MetadataHandler) Close() error { return h.db.Close() } +func (h *MetadataHandler) view(fn func(tx *bbolt.Tx) error) error { + start := time.Now() + err := h.db.View(fn) + metrics.Default.ObserveMetadataTx("view", time.Since(start), err == nil) + return err +} + +func (h *MetadataHandler) update(fn func(tx *bbolt.Tx) error) error { + start := time.Now() + err := h.db.Update(fn) + metrics.Default.ObserveMetadataTx("update", time.Since(start), err == nil) + return err +} + func (h *MetadataHandler) PutAuthIdentity(identity *models.AuthIdentity) error { if identity == nil { return errors.New("auth identity is required") @@ -126,7 +141,7 @@ func (h *MetadataHandler) PutAuthIdentity(identity *models.AuthIdentity) error { if strings.TrimSpace(identity.AccessKeyID) == "" { return errors.New("access key id is required") } - return h.db.Update(func(tx *bbolt.Tx) error { + return h.update(func(tx *bbolt.Tx) error { bucket := tx.Bucket(authIdentitiesIndex) if bucket == nil { return errors.New("auth identities index not found") @@ -146,7 +161,7 @@ func (h *MetadataHandler) GetAuthIdentity(accessKeyID string) (*models.AuthIdent } var identity *models.AuthIdentity - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { bucket := tx.Bucket(authIdentitiesIndex) if bucket == nil { return errors.New("auth identities index not found") @@ -177,7 +192,7 @@ func (h *MetadataHandler) PutAuthPolicy(policy *models.AuthPolicy) error { return errors.New("auth policy principal is required") } policy.Principal = principal - return h.db.Update(func(tx *bbolt.Tx) error { + return h.update(func(tx *bbolt.Tx) error { bucket := tx.Bucket(authPoliciesIndex) if bucket == nil { return errors.New("auth policies index not found") @@ -197,7 +212,7 @@ func (h *MetadataHandler) GetAuthPolicy(accessKeyID string) (*models.AuthPolicy, } var policy *models.AuthPolicy - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { bucket := tx.Bucket(authPoliciesIndex) if bucket == nil { return errors.New("auth policies index not found") @@ -224,7 +239,7 @@ func (h *MetadataHandler) CreateBucket(bucketName string) error { return fmt.Errorf("%w: %s", ErrInvalidBucketName, bucketName) } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { indexBucket, err := tx.CreateBucketIfNotExists([]byte(systemIndex)) if err != nil { return err @@ -256,7 +271,7 @@ func (h *MetadataHandler) DeleteBucket(bucketName string) error { return fmt.Errorf("%w: %s", ErrInvalidBucketName, bucketName) } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { indexBucket, err := tx.CreateBucketIfNotExists([]byte(systemIndex)) if err != nil { return err @@ -303,7 +318,7 @@ func (h *MetadataHandler) DeleteBucket(bucketName string) error { func (h *MetadataHandler) ListBuckets() ([]string, error) { buckets := []string{} - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") @@ -323,7 +338,7 @@ func (h *MetadataHandler) ListBuckets() ([]string, error) { func (h *MetadataHandler) GetBucketManifest(bucketName string) (*models.BucketManifest, error) { var manifest *models.BucketManifest - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") @@ -353,7 +368,7 @@ func (h *MetadataHandler) PutManifest(manifest *models.ObjectManifest) error { return err } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { data, err := json.Marshal(manifest) if err != nil { return err @@ -373,7 +388,7 @@ func (h *MetadataHandler) PutManifest(manifest *models.ObjectManifest) error { func (h *MetadataHandler) GetManifest(bucket, key string) (*models.ObjectManifest, error) { var manifest *models.ObjectManifest - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { metadataBucket := tx.Bucket([]byte(bucket)) if metadataBucket == nil { return fmt.Errorf("%w: %s", ErrBucketNotFound, bucket) @@ -400,7 +415,7 @@ func (h *MetadataHandler) ListObjects(bucket, prefix string) ([]*models.ObjectMa var objects []*models.ObjectManifest - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") @@ -440,7 +455,7 @@ func (h *MetadataHandler) ForEachObjectFrom(bucket, startKey string, fn func(*mo return errors.New("object callback is required") } - return h.db.View(func(tx *bbolt.Tx) error { + return h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") @@ -480,7 +495,7 @@ func (h *MetadataHandler) DeleteManifest(bucket, key string) error { return err } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { metadataBucket := tx.Bucket([]byte(bucket)) if metadataBucket == nil { return fmt.Errorf("%w: %s", ErrBucketNotFound, bucket) @@ -497,7 +512,7 @@ func (h *MetadataHandler) DeleteManifest(bucket, key string) error { func (h *MetadataHandler) DeleteManifests(bucket string, keys []string) ([]string, error) { deleted := make([]string, 0, len(keys)) - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { metadataBucket := tx.Bucket([]byte(bucket)) if metadataBucket == nil { return fmt.Errorf("%w: %s", ErrBucketNotFound, bucket) @@ -525,7 +540,7 @@ func (h *MetadataHandler) DeleteManifests(bucket string, keys []string) ([]strin func (h *MetadataHandler) CreateMultipartUpload(bucket, key string) (*models.MultipartUpload, error) { var upload *models.MultipartUpload - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") @@ -548,7 +563,7 @@ func (h *MetadataHandler) CreateMultipartUpload(bucket, key string) (*models.Mul State: "pending", } - err = h.db.Update(func(tx *bbolt.Tx) error { + err = h.update(func(tx *bbolt.Tx) error { multipartUploadBucket := tx.Bucket([]byte(multipartUploadIndex)) if multipartUploadBucket == nil { return errors.New("multipart upload index not found") @@ -643,7 +658,7 @@ func deleteMultipartPartsByUploadID(tx *bbolt.Tx, uploadID string) error { func (h *MetadataHandler) GetMultipartUpload(uploadID string) (*models.MultipartUpload, error) { var upload *models.MultipartUpload - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { var err error upload, _, err = getMultipartUploadFromTx(tx, uploadID) if err != nil { @@ -661,7 +676,7 @@ func (h *MetadataHandler) PutMultipartPart(uploadID string, part models.Uploaded return fmt.Errorf("invalid part number: %d", part.PartNumber) } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { upload, _, err := getMultipartUploadFromTx(tx, uploadID) if err != nil { return err @@ -690,7 +705,7 @@ func (h *MetadataHandler) PutMultipartPart(uploadID string, part models.Uploaded func (h *MetadataHandler) ListMultipartParts(uploadID string) ([]models.UploadedPart, error) { parts := make([]models.UploadedPart, 0) - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { if _, _, err := getMultipartUploadFromTx(tx, uploadID); err != nil { return err } @@ -724,7 +739,7 @@ func (h *MetadataHandler) CompleteMultipartUpload(uploadID string, final *models return errors.New("final object manifest is required") } - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { upload, multipartUploadBucket, err := getMultipartUploadFromTx(tx, uploadID) if err != nil { return err @@ -763,7 +778,7 @@ func (h *MetadataHandler) CompleteMultipartUpload(uploadID string, final *models return nil } func (h *MetadataHandler) AbortMultipartUpload(uploadID string) error { - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { upload, multipartUploadBucket, err := getMultipartUploadFromTx(tx, uploadID) if err != nil { return err @@ -793,7 +808,7 @@ func (h *MetadataHandler) CleanupMultipartUploads(retention time.Duration) (int, } cleaned := 0 - err := h.db.Update(func(tx *bbolt.Tx) error { + err := h.update(func(tx *bbolt.Tx) error { uploadsBucket, err := getMultipartUploadBucket(tx) if err != nil { return err @@ -843,7 +858,7 @@ func (h *MetadataHandler) GetReferencedChunkSet() (map[string]struct{}, error) { chunkSet := make(map[string]struct{}) pendingUploadSet := make(map[string]struct{}) - err := h.db.View(func(tx *bbolt.Tx) error { + err := h.view(func(tx *bbolt.Tx) error { systemIndexBucket := tx.Bucket([]byte(systemIndex)) if systemIndexBucket == nil { return errors.New("system index not found") diff --git a/metrics/metrics.go b/metrics/metrics.go new file mode 100644 index 0000000..a696659 --- /dev/null +++ b/metrics/metrics.go @@ -0,0 +1,471 @@ +package metrics + +import ( + "fmt" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +var defaultBuckets = []float64{ + 0.0005, 0.001, 0.0025, 0.005, 0.01, + 0.025, 0.05, 0.1, 0.25, 0.5, + 1, 2.5, 5, 10, +} + +var Default = NewRegistry() + +type histogram struct { + bounds []float64 + counts []uint64 + sum float64 + count uint64 +} + +func newHistogram(bounds []float64) *histogram { + cloned := make([]float64, len(bounds)) + copy(cloned, bounds) + return &histogram{ + bounds: cloned, + counts: make([]uint64, len(bounds)+1), + } +} + +func (h *histogram) observe(v float64) { + h.count++ + h.sum += v + for i, bound := range h.bounds { + if v <= bound { + h.counts[i]++ + return + } + } + h.counts[len(h.counts)-1]++ +} + +func (h *histogram) snapshot() (bounds []float64, counts []uint64, sum float64, count uint64) { + bounds = make([]float64, len(h.bounds)) + copy(bounds, h.bounds) + counts = make([]uint64, len(h.counts)) + copy(counts, h.counts) + return bounds, counts, h.sum, h.count +} + +type Registry struct { + startedAt time.Time + inFlight atomic.Int64 + + mu sync.Mutex + + httpRequests map[string]uint64 + httpResponseByte map[string]uint64 + httpDuration map[string]*histogram + + authRequests map[string]uint64 + + serviceOps map[string]uint64 + serviceDuration map[string]*histogram + + dbTxTotal map[string]uint64 + dbTxDuration map[string]*histogram + + blobOps map[string]uint64 + blobBytes map[string]uint64 + blobDuration map[string]*histogram + + gcRuns map[string]uint64 + gcDuration *histogram + gcDeletedChunks uint64 + gcDeleteErrors uint64 + gcCleanedUpload uint64 +} + +func NewRegistry() *Registry { + return &Registry{ + startedAt: time.Now(), + httpRequests: make(map[string]uint64), + httpResponseByte: make(map[string]uint64), + httpDuration: make(map[string]*histogram), + authRequests: make(map[string]uint64), + serviceOps: make(map[string]uint64), + serviceDuration: make(map[string]*histogram), + dbTxTotal: make(map[string]uint64), + dbTxDuration: make(map[string]*histogram), + blobOps: make(map[string]uint64), + blobBytes: make(map[string]uint64), + blobDuration: make(map[string]*histogram), + gcRuns: make(map[string]uint64), + gcDuration: newHistogram(defaultBuckets), + } +} + +func (r *Registry) IncHTTPInFlight() { + r.inFlight.Add(1) +} + +func (r *Registry) DecHTTPInFlight() { + r.inFlight.Add(-1) +} + +func (r *Registry) ObserveHTTPRequest(method, route string, status int, d time.Duration, responseBytes int) { + route = normalizeRoute(route) + key := method + "|" + route + "|" + strconv.Itoa(status) + durationKey := method + "|" + route + + r.mu.Lock() + r.httpRequests[key]++ + if responseBytes > 0 { + r.httpResponseByte[key] += uint64(responseBytes) + } + h := r.httpDuration[durationKey] + if h == nil { + h = newHistogram(defaultBuckets) + r.httpDuration[durationKey] = h + } + h.observe(d.Seconds()) + r.mu.Unlock() +} + +func (r *Registry) ObserveAuth(result, authType, reason string) { + authType = strings.TrimSpace(authType) + if authType == "" { + authType = "unknown" + } + reason = strings.TrimSpace(reason) + if reason == "" { + reason = "none" + } + key := result + "|" + authType + "|" + reason + r.mu.Lock() + r.authRequests[key]++ + r.mu.Unlock() +} + +func (r *Registry) ObserveService(operation string, d time.Duration, ok bool) { + result := "error" + if ok { + result = "ok" + } + key := operation + "|" + result + r.mu.Lock() + r.serviceOps[key]++ + h := r.serviceDuration[operation] + if h == nil { + h = newHistogram(defaultBuckets) + r.serviceDuration[operation] = h + } + h.observe(d.Seconds()) + r.mu.Unlock() +} + +func (r *Registry) ObserveMetadataTx(txType string, d time.Duration, ok bool) { + result := "error" + if ok { + result = "ok" + } + key := txType + "|" + result + r.mu.Lock() + r.dbTxTotal[key]++ + h := r.dbTxDuration[txType] + if h == nil { + h = newHistogram(defaultBuckets) + r.dbTxDuration[txType] = h + } + h.observe(d.Seconds()) + r.mu.Unlock() +} + +func (r *Registry) ObserveBlob(operation string, d time.Duration, bytes int64, ok bool) { + result := "error" + if ok { + result = "ok" + } + key := operation + "|" + result + r.mu.Lock() + r.blobOps[key]++ + h := r.blobDuration[operation] + if h == nil { + h = newHistogram(defaultBuckets) + r.blobDuration[operation] = h + } + h.observe(d.Seconds()) + if bytes > 0 { + switch operation { + case "read_chunk": + r.blobBytes["read"] += uint64(bytes) + case "write_chunk": + r.blobBytes["write"] += uint64(bytes) + } + } + r.mu.Unlock() +} + +func (r *Registry) ObserveGC(d time.Duration, deletedChunks, deleteErrors, cleanedUploads int, ok bool) { + result := "error" + if ok { + result = "ok" + } + r.mu.Lock() + r.gcRuns[result]++ + r.gcDuration.observe(d.Seconds()) + if deletedChunks > 0 { + r.gcDeletedChunks += uint64(deletedChunks) + } + if deleteErrors > 0 { + r.gcDeleteErrors += uint64(deleteErrors) + } + if cleanedUploads > 0 { + r.gcCleanedUpload += uint64(cleanedUploads) + } + r.mu.Unlock() +} + +func (r *Registry) RenderPrometheus() string { + now := time.Now() + var mem runtime.MemStats + runtime.ReadMemStats(&mem) + + r.mu.Lock() + httpReq := copyCounterMap(r.httpRequests) + httpBytes := copyCounterMap(r.httpResponseByte) + httpDur := copyHistogramMap(r.httpDuration) + authReq := copyCounterMap(r.authRequests) + serviceOps := copyCounterMap(r.serviceOps) + serviceDur := copyHistogramMap(r.serviceDuration) + dbTx := copyCounterMap(r.dbTxTotal) + dbTxDur := copyHistogramMap(r.dbTxDuration) + blobOps := copyCounterMap(r.blobOps) + blobBytes := copyCounterMap(r.blobBytes) + blobDur := copyHistogramMap(r.blobDuration) + gcRuns := copyCounterMap(r.gcRuns) + gcDurBounds, gcDurCounts, gcDurSum, gcDurCount := r.gcDuration.snapshot() + gcDeletedChunks := r.gcDeletedChunks + gcDeleteErrors := r.gcDeleteErrors + gcCleanedUploads := r.gcCleanedUpload + r.mu.Unlock() + + var b strings.Builder + + writeGauge(&b, "fs_http_inflight_requests", "Current in-flight HTTP requests.", float64(r.inFlight.Load())) + writeCounterVecKV(&b, "fs_http_requests_total", "Total HTTP requests handled.", httpReq, []string{"method", "route", "status"}) + writeCounterVecKV(&b, "fs_http_response_bytes_total", "Total HTTP response bytes written.", httpBytes, []string{"method", "route", "status"}) + writeHistogramVecKV(&b, "fs_http_request_duration_seconds", "HTTP request latency.", httpDur, []string{"method", "route"}) + + writeCounterVecKV(&b, "fs_auth_requests_total", "Authentication attempts by result.", authReq, []string{"result", "auth_type", "reason"}) + + writeCounterVecKV(&b, "fs_service_operations_total", "Service-level operation calls.", serviceOps, []string{"operation", "result"}) + writeHistogramVecKV(&b, "fs_service_operation_duration_seconds", "Service-level operation latency.", serviceDur, []string{"operation"}) + + writeCounterVecKV(&b, "fs_metadata_tx_total", "Metadata transaction calls.", dbTx, []string{"type", "result"}) + writeHistogramVecKV(&b, "fs_metadata_tx_duration_seconds", "Metadata transaction latency.", dbTxDur, []string{"type"}) + + writeCounterVecKV(&b, "fs_blob_operations_total", "Blob store operations.", blobOps, []string{"operation", "result"}) + writeCounterVecKV(&b, "fs_blob_bytes_total", "Blob bytes processed.", blobBytes, []string{"direction"}) + writeHistogramVecKV(&b, "fs_blob_operation_duration_seconds", "Blob operation latency.", blobDur, []string{"operation"}) + + writeCounterVecKV(&b, "fs_gc_runs_total", "Garbage collection runs.", gcRuns, []string{"result"}) + writeHistogram(&b, "fs_gc_duration_seconds", "Garbage collection runtime.", nil, gcDurBounds, gcDurCounts, gcDurSum, gcDurCount) + writeCounter(&b, "fs_gc_deleted_chunks_total", "Deleted chunks during GC.", gcDeletedChunks) + writeCounter(&b, "fs_gc_delete_errors_total", "Chunk delete errors during GC.", gcDeleteErrors) + writeCounter(&b, "fs_gc_cleaned_uploads_total", "Cleaned multipart uploads during GC.", gcCleanedUploads) + + writeGauge(&b, "fs_uptime_seconds", "Process uptime in seconds.", now.Sub(r.startedAt).Seconds()) + writeGauge(&b, "fs_runtime_goroutines", "Number of goroutines.", float64(runtime.NumGoroutine())) + writeGaugeVec(&b, "fs_runtime_memory_bytes", "Runtime memory in bytes.", map[string]float64{ + "alloc": float64(mem.Alloc), + "total": float64(mem.TotalAlloc), + "sys": float64(mem.Sys), + "heap_alloc": float64(mem.HeapAlloc), + "heap_sys": float64(mem.HeapSys), + "stack_sys": float64(mem.StackSys), + }, "type") + writeCounter(&b, "fs_runtime_gc_cycles_total", "Completed GC cycles.", uint64(mem.NumGC)) + writeCounterFloat(&b, "fs_runtime_gc_pause_seconds_total", "Total GC pause time in seconds.", float64(mem.PauseTotalNs)/1e9) + + return b.String() +} + +func normalizeRoute(route string) string { + route = strings.TrimSpace(route) + if route == "" { + return "/unknown" + } + return route +} + +type histogramSnapshot struct { + bounds []float64 + counts []uint64 + sum float64 + count uint64 +} + +func copyCounterMap(src map[string]uint64) map[string]uint64 { + out := make(map[string]uint64, len(src)) + for k, v := range src { + out[k] = v + } + return out +} + +func copyHistogramMap(src map[string]*histogram) map[string]histogramSnapshot { + out := make(map[string]histogramSnapshot, len(src)) + for k, h := range src { + bounds, counts, sum, count := h.snapshot() + out[k] = histogramSnapshot{ + bounds: bounds, + counts: counts, + sum: sum, + count: count, + } + } + return out +} + +func writeCounter(b *strings.Builder, name, help string, value uint64) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s counter\n", name) + fmt.Fprintf(b, "%s %d\n", name, value) +} + +func writeCounterFloat(b *strings.Builder, name, help string, value float64) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s counter\n", name) + fmt.Fprintf(b, "%s %.9f\n", name, value) +} + +func writeGauge(b *strings.Builder, name, help string, value float64) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s gauge\n", name) + fmt.Fprintf(b, "%s %.9f\n", name, value) +} + +func writeGaugeVec(b *strings.Builder, name, help string, values map[string]float64, labelName string) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s gauge\n", name) + keys := make([]string, 0, len(values)) + for k := range values { + keys = append(keys, k) + } + sort.Strings(keys) + for _, key := range keys { + fmt.Fprintf(b, "%s{%s=\"%s\"} %.9f\n", name, labelName, escapeLabelValue(key), values[key]) + } +} + +func writeCounterVecKV(b *strings.Builder, name, help string, values map[string]uint64, labels []string) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s counter\n", name) + keys := make([]string, 0, len(values)) + for k := range values { + keys = append(keys, k) + } + sort.Strings(keys) + for _, key := range keys { + parts := strings.Split(key, "|") + fmt.Fprintf(b, "%s{%s} %d\n", name, formatLabels(labels, parts), values[key]) + } +} + +func writeHistogramVecKV(b *strings.Builder, name, help string, values map[string]histogramSnapshot, labels []string) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s histogram\n", name) + keys := make([]string, 0, len(values)) + for k := range values { + keys = append(keys, k) + } + sort.Strings(keys) + for _, key := range keys { + parts := strings.Split(key, "|") + labelsMap := make(map[string]string, len(labels)) + for i, label := range labels { + if i < len(parts) { + labelsMap[label] = parts[i] + } else { + labelsMap[label] = "" + } + } + writeHistogramWithLabelsMap(b, name, labelsMap, values[key]) + } +} + +func writeHistogram(b *strings.Builder, name, help string, labels map[string]string, bounds []float64, counts []uint64, sum float64, count uint64) { + fmt.Fprintf(b, "# HELP %s %s\n", name, help) + fmt.Fprintf(b, "# TYPE %s histogram\n", name) + writeHistogramWithLabelsMap(b, name, labels, histogramSnapshot{ + bounds: bounds, + counts: counts, + sum: sum, + count: count, + }) +} + +func writeHistogramWithLabelsMap(b *strings.Builder, name string, labels map[string]string, s histogramSnapshot) { + var cumulative uint64 + for i, bucketCount := range s.counts { + cumulative += bucketCount + bucketLabels := cloneLabels(labels) + if i < len(s.bounds) { + bucketLabels["le"] = trimFloat(s.bounds[i]) + } else { + bucketLabels["le"] = "+Inf" + } + fmt.Fprintf(b, "%s_bucket{%s} %d\n", name, labelsToString(bucketLabels), cumulative) + } + fmt.Fprintf(b, "%s_sum{%s} %.9f\n", name, labelsToString(labels), s.sum) + fmt.Fprintf(b, "%s_count{%s} %d\n", name, labelsToString(labels), s.count) +} + +func formatLabels(keys, values []string) string { + parts := make([]string, 0, len(keys)) + for i, key := range keys { + value := "" + if i < len(values) { + value = values[i] + } + parts = append(parts, fmt.Sprintf("%s=\"%s\"", key, escapeLabelValue(value))) + } + return strings.Join(parts, ",") +} + +func labelsToString(labels map[string]string) string { + if len(labels) == 0 { + return "" + } + keys := make([]string, 0, len(labels)) + for k := range labels { + keys = append(keys, k) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%s=\"%s\"", key, escapeLabelValue(labels[key]))) + } + return strings.Join(parts, ",") +} + +func cloneLabels(in map[string]string) map[string]string { + if len(in) == 0 { + return map[string]string{} + } + out := make(map[string]string, len(in)+1) + for k, v := range in { + out[k] = v + } + return out +} + +func trimFloat(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) +} + +func escapeLabelValue(value string) string { + value = strings.ReplaceAll(value, `\`, `\\`) + value = strings.ReplaceAll(value, "\n", `\n`) + value = strings.ReplaceAll(value, `"`, `\"`) + return value +} diff --git a/service/service.go b/service/service.go index 09b04d6..8442a43 100644 --- a/service/service.go +++ b/service/service.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "fs/metadata" + "fs/metrics" "fs/models" "fs/storage" "io" @@ -42,6 +43,12 @@ func NewObjectService(metadataHandler *metadata.MetadataHandler, blobHandler *st } func (s *ObjectService) PutObject(bucket, key, contentType string, input io.Reader) (*models.ObjectManifest, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("put_object", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() @@ -71,10 +78,17 @@ func (s *ObjectService) PutObject(bucket, key, contentType string, input io.Read return nil, err } + success = true return manifest, nil } func (s *ObjectService) GetObject(bucket, key string) (io.ReadCloser, *models.ObjectManifest, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("get_object", time.Since(start), success) + }() + s.gcMu.RLock() manifest, err := s.metadata.GetManifest(bucket, key) @@ -92,10 +106,17 @@ func (s *ObjectService) GetObject(bucket, key string) (io.ReadCloser, *models.Ob } _ = pw.Close() }() + success = true return pr, manifest, nil } func (s *ObjectService) HeadObject(bucket, key string) (models.ObjectManifest, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("head_object", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() @@ -103,13 +124,22 @@ func (s *ObjectService) HeadObject(bucket, key string) (models.ObjectManifest, e if err != nil { return models.ObjectManifest{}, err } + success = true return *manifest, nil } func (s *ObjectService) DeleteObject(bucket, key string) error { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("delete_object", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() - return s.metadata.DeleteManifest(bucket, key) + err := s.metadata.DeleteManifest(bucket, key) + success = err == nil + return err } func (s *ObjectService) ListObjects(bucket, prefix string) ([]*models.ObjectManifest, error) { @@ -120,16 +150,32 @@ func (s *ObjectService) ListObjects(bucket, prefix string) ([]*models.ObjectMani } func (s *ObjectService) ForEachObjectFrom(bucket, startKey string, fn func(*models.ObjectManifest) error) error { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("for_each_object_from", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() - return s.metadata.ForEachObjectFrom(bucket, startKey, fn) + err := s.metadata.ForEachObjectFrom(bucket, startKey, fn) + success = err == nil + return err } func (s *ObjectService) CreateBucket(bucket string) error { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("create_bucket", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() - return s.metadata.CreateBucket(bucket) + err := s.metadata.CreateBucket(bucket) + success = err == nil + return err } func (s *ObjectService) HeadBucket(bucket string) error { @@ -154,10 +200,18 @@ func (s *ObjectService) DeleteBucket(bucket string) error { } func (s *ObjectService) ListBuckets() ([]string, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("list_buckets", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() - return s.metadata.ListBuckets() + buckets, err := s.metadata.ListBuckets() + success = err == nil + return buckets, err } func (s *ObjectService) DeleteObjects(bucket string, keys []string) ([]string, error) { @@ -173,6 +227,12 @@ func (s *ObjectService) CreateMultipartUpload(bucket, key string) (*models.Multi } func (s *ObjectService) UploadPart(bucket, key, uploadId string, partNumber int, input io.Reader) (string, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("upload_part", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() @@ -204,6 +264,7 @@ func (s *ObjectService) UploadPart(bucket, key, uploadId string, partNumber int, if err != nil { return "", err } + success = true return etag, nil } @@ -222,6 +283,12 @@ func (s *ObjectService) ListMultipartParts(bucket, key, uploadID string) ([]mode } func (s *ObjectService) CompleteMultipartUpload(bucket, key, uploadID string, completed []models.CompletedPart) (*models.ObjectManifest, error) { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveService("complete_multipart_upload", time.Since(start), success) + }() + s.gcMu.RLock() defer s.gcMu.RUnlock() @@ -288,6 +355,7 @@ func (s *ObjectService) CompleteMultipartUpload(bucket, key, uploadID string, co return nil, err } + success = true return manifest, nil } @@ -327,6 +395,15 @@ func (s *ObjectService) Close() error { } func (s *ObjectService) GarbageCollect() error { + start := time.Now() + success := false + deletedChunks := 0 + deleteErrors := 0 + cleanedUploads := 0 + defer func() { + metrics.Default.ObserveGC(time.Since(start), deletedChunks, deleteErrors, cleanedUploads, success) + }() + s.gcMu.Lock() defer s.gcMu.Unlock() @@ -336,9 +413,6 @@ func (s *ObjectService) GarbageCollect() error { } totalChunks := 0 - deletedChunks := 0 - deleteErrors := 0 - cleanedUploads := 0 if err := s.blob.ForEachChunk(func(chunkID string) error { totalChunks++ @@ -368,6 +442,7 @@ func (s *ObjectService) GarbageCollect() error { "delete_errors", deleteErrors, "cleaned_uploads", cleanedUploads, ) + success = true return nil } diff --git a/storage/blob.go b/storage/blob.go index 6215a6f..41297df 100644 --- a/storage/blob.go +++ b/storage/blob.go @@ -6,10 +6,12 @@ import ( "encoding/hex" "errors" "fmt" + "fs/metrics" "io" "os" "path/filepath" "strings" + "time" ) const blobRoot = "blobs" @@ -37,11 +39,16 @@ func NewBlobStore(root string, chunkSize int) (*BlobStore, error) { } func (bs *BlobStore) IngestStream(stream io.Reader) ([]string, int64, string, error) { + start := time.Now() fullFileHasher := md5.New() buffer := make([]byte, bs.chunkSize) var totalSize int64 var chunkIDs []string + success := false + defer func() { + metrics.Default.ObserveBlob("ingest_stream", time.Since(start), 0, success) + }() for { bytesRead, err := io.ReadFull(stream, buffer) @@ -74,10 +81,17 @@ func (bs *BlobStore) IngestStream(stream io.Reader) ([]string, int64, string, er } etag := hex.EncodeToString(fullFileHasher.Sum(nil)) + success = true return chunkIDs, totalSize, etag, nil } func (bs *BlobStore) saveBlob(chunkID string, data []byte) error { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveBlob("write_chunk", time.Since(start), int64(len(data)), success) + }() + if !isValidChunkID(chunkID) { return fmt.Errorf("invalid chunk id: %q", chunkID) } @@ -88,6 +102,7 @@ func (bs *BlobStore) saveBlob(chunkID string, data []byte) error { fullPath := filepath.Join(dir, chunkID) if _, err := os.Stat(fullPath); err == nil { + success = true return nil } else if !os.IsNotExist(err) { return err @@ -119,6 +134,7 @@ func (bs *BlobStore) saveBlob(chunkID string, data []byte) error { if err := os.Rename(tmpPath, fullPath); err != nil { if _, statErr := os.Stat(fullPath); statErr == nil { + success = true return nil } return err @@ -128,10 +144,17 @@ func (bs *BlobStore) saveBlob(chunkID string, data []byte) error { if err := syncDir(dir); err != nil { return err } + success = true return nil } func (bs *BlobStore) AssembleStream(chunkIDs []string, w *io.PipeWriter) error { + start := time.Now() + success := false + defer func() { + metrics.Default.ObserveBlob("assemble_stream", time.Since(start), 0, success) + }() + for _, chunkID := range chunkIDs { chunkData, err := bs.GetBlob(chunkID) if err != nil { @@ -141,14 +164,28 @@ func (bs *BlobStore) AssembleStream(chunkIDs []string, w *io.PipeWriter) error { return err } } + success = true return nil } func (bs *BlobStore) GetBlob(chunkID string) ([]byte, error) { + start := time.Now() + success := false + var size int64 + defer func() { + metrics.Default.ObserveBlob("read_chunk", time.Since(start), size, success) + }() + if !isValidChunkID(chunkID) { return nil, fmt.Errorf("invalid chunk id: %q", chunkID) } - return os.ReadFile(filepath.Join(bs.dataRoot, blobRoot, chunkID[:2], chunkID[2:4], chunkID)) + data, err := os.ReadFile(filepath.Join(bs.dataRoot, blobRoot, chunkID[:2], chunkID[2:4], chunkID)) + if err != nil { + return nil, err + } + size = int64(len(data)) + success = true + return data, nil } func (bs *BlobStore) DeleteBlob(chunkID string) error {