Commit dd45fd7

feat(bigtable): connectivity_error_count client side metric (#12438)
* feat(bigtable): connectivity_error_count client side metric
* test(bigtable): validate exported value of connectivity_error_count
* test: poll for expected metrics
* logs
* fix tests
* revert changes
1 parent d2cdb3c commit dd45fd7

3 files changed

Lines changed: 191 additions & 2 deletions

bigtable/bigtable.go

Lines changed: 16 additions & 0 deletions
@@ -2322,4 +2322,20 @@ func recordAttemptCompletion(mt *builtinMetricsTracer) {
 	if mt.currOp.currAttempt.serverLatencyErr == nil {
 		mt.instrumentServerLatencies.Record(mt.ctx, mt.currOp.currAttempt.serverLatency, metric.WithAttributeSet(serverLatAttrs))
 	}
+
+	// Record connectivity_error_count
+	connErrCountAttrs, _ := mt.toOtelMetricAttrs(metricNameConnErrCount)
+	// Determine if connection error should be incremented.
+	// A true connectivity error occurs only when we receive NO server-side signals.
+	// 1. Server latency (from server-timing header) is a signal, but absent in DirectPath.
+	// 2. Location (from x-goog-ext header) is a signal present in both paths.
+	// Therefore, we only count an error if BOTH signals are missing.
+	isServerLatencyEffectivelyEmpty := mt.currOp.currAttempt.serverLatencyErr != nil || mt.currOp.currAttempt.serverLatency == 0
+	isLocationEmpty := mt.currOp.currAttempt.clusterID == defaultCluster
+	if isServerLatencyEffectivelyEmpty && isLocationEmpty {
+		// This is a connectivity error: the request likely never reached Google's network.
+		mt.instrumentConnErrCount.Add(mt.ctx, 1, metric.WithAttributeSet(connErrCountAttrs))
+	} else {
+		mt.instrumentConnErrCount.Add(mt.ctx, 0, metric.WithAttributeSet(connErrCountAttrs))
+	}
 }
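
The new logic reduces to a two-signal predicate: an attempt counts as a connectivity error only when both the server-timing latency and the location header are missing. A minimal standalone sketch of that predicate follows; the function, parameters, and sentinel value are illustrative stand-ins for the tracer's actual fields, not identifiers from the client.

package main

import "fmt"

// isConnectivityError mirrors the check added to recordAttemptCompletion:
// count an error only when no server-side signal arrived for the attempt.
// All names here are hypothetical stand-ins for the tracer's fields.
func isConnectivityError(serverLatencyMS float64, haveServerLatency bool, clusterID, defaultCluster string) bool {
	serverLatencyEmpty := !haveServerLatency || serverLatencyMS == 0
	locationEmpty := clusterID == defaultCluster
	return serverLatencyEmpty && locationEmpty
}

func main() {
	const defaultCluster = "unspecified" // assumed sentinel for "no location header"
	fmt.Println(isConnectivityError(0, false, defaultCluster, defaultCluster))       // true: both signals missing
	fmt.Println(isConnectivityError(12.5, true, defaultCluster, defaultCluster))     // false: server-timing present
	fmt.Println(isConnectivityError(0, false, "us-east1-cluster", defaultCluster))   // false: location present (DirectPath case)
	fmt.Println(isConnectivityError(12.5, true, "us-east1-cluster", defaultCluster)) // false: both present
}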

bigtable/metrics.go

Lines changed: 17 additions & 0 deletions
@@ -62,6 +62,7 @@ const (
 	metricNameServerLatencies = "server_latencies"
 	metricNameRetryCount      = "retry_count"
 	metricNameDebugTags       = "debug_tags"
+	metricNameConnErrCount    = "connectivity_error_count"
 
 	// Metric units
 	metricUnitMS = "ms"
@@ -113,6 +114,12 @@ var (
 		},
 		recordedPerAttempt: true,
 	},
+	metricNameConnErrCount: {
+		additionalAttrs: []string{
+			metricLabelKeyStatus,
+		},
+		recordedPerAttempt: true,
+	},
 }
 
 // Generates unique client ID in the format go-<random UUID>@<hostname>
@@ -161,6 +168,7 @@ type builtinMetricsTracerFactory struct {
 	serverLatencies  metric.Float64Histogram
 	attemptLatencies metric.Float64Histogram
 	retryCount       metric.Int64Counter
+	connErrCount     metric.Int64Counter
 	debugTags        metric.Int64Counter
 }
 
@@ -271,6 +279,13 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
 		return err
 	}
 
+	// Create connectivity_error_count
+	tf.connErrCount, err = meter.Int64Counter(
+		metricNameConnErrCount,
+		metric.WithDescription("Number of requests that failed to reach the Google datacenter. (Requests without google response headers)"),
+		metric.WithUnit(metricUnitCount),
+	)
+
 	// Create debug_tags
 	tf.debugTags, err = meter.Int64Counter(
 		metricNameDebugTags,
@@ -295,6 +310,7 @@ type builtinMetricsTracer struct {
 	instrumentServerLatencies  metric.Float64Histogram
 	instrumentAttemptLatencies metric.Float64Histogram
 	instrumentRetryCount       metric.Int64Counter
+	instrumentConnErrCount     metric.Int64Counter
 	instrumentDebugTags        metric.Int64Counter
 
 	tableName string
@@ -392,6 +408,7 @@ func (tf *builtinMetricsTracerFactory) createBuiltinMetricsTracer(ctx context.Co
 		instrumentServerLatencies:  tf.serverLatencies,
 		instrumentAttemptLatencies: tf.attemptLatencies,
 		instrumentRetryCount:       tf.retryCount,
+		instrumentConnErrCount:     tf.connErrCount,
 		instrumentDebugTags:        tf.debugTags,
 
 		tableName: tableName,
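
The factory code above follows the standard OpenTelemetry pattern for counter instruments. Here is a self-contained sketch of that pattern outside the client, assuming a bare SDK MeterProvider in place of the factory's wiring; the metric name and description are taken from the diff, while the unit string "1" and the status attribute value are assumptions standing in for metricUnitCount and the client's real labels.

package main

import (
	"context"
	"fmt"
	"log"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

func main() {
	ctx := context.Background()
	// In the client this meter comes from the factory's MeterProvider;
	// a plain SDK provider stands in here.
	meter := sdkmetric.NewMeterProvider().Meter("example")

	connErrCount, err := meter.Int64Counter(
		"connectivity_error_count",
		metric.WithDescription("Number of requests that failed to reach the Google datacenter."),
		metric.WithUnit("1"), // assumed value of metricUnitCount
	)
	if err != nil {
		log.Fatal(err)
	}

	// Recording 0 as well as 1, as recordAttemptCompletion does, keeps the
	// time series present on every attempt so dashboards can tell
	// "no errors" apart from "no data".
	attrs := attribute.NewSet(attribute.String("status", "UNAVAILABLE"))
	connErrCount.Add(ctx, 1, metric.WithAttributeSet(attrs))
	connErrCount.Add(ctx, 0, metric.WithAttributeSet(attrs))
	fmt.Println("recorded connectivity_error_count samples")
}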

bigtable/metrics_test.go

Lines changed: 158 additions & 2 deletions
@@ -27,6 +27,7 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -93,7 +94,7 @@ func TestNewBuiltinMetricsTracerFactory(t *testing.T) {
 		attribute.String(metricLabelKeyClientUID, clientUID),
 		attribute.String(metricLabelKeyClientName, clientName),
 	}
-	wantMetricNamesStdout := []string{metricNameAttemptLatencies, metricNameAttemptLatencies, metricNameOperationLatencies, metricNameRetryCount, metricNameServerLatencies}
+	wantMetricNamesStdout := []string{metricNameAttemptLatencies, metricNameAttemptLatencies, metricNameConnErrCount, metricNameConnErrCount, metricNameOperationLatencies, metricNameRetryCount, metricNameServerLatencies}
 	wantMetricTypesGCM := []string{}
 	for _, wantMetricName := range wantMetricNamesStdout {
 		wantMetricTypesGCM = append(wantMetricTypesGCM, builtInMetricsMeterName+wantMetricName)
@@ -211,7 +212,8 @@ func TestNewBuiltinMetricsTracerFactory(t *testing.T) {
 			gotNonNilInstruments := gotClient.metricsTracerFactory.operationLatencies != nil &&
 				gotClient.metricsTracerFactory.serverLatencies != nil &&
 				gotClient.metricsTracerFactory.attemptLatencies != nil &&
-				gotClient.metricsTracerFactory.retryCount != nil
+				gotClient.metricsTracerFactory.retryCount != nil &&
+				gotClient.metricsTracerFactory.connErrCount != nil
 			if test.wantBuiltinEnabled != gotNonNilInstruments {
 				t.Errorf("NonNilInstruments: got: %v, want: %v", gotNonNilInstruments, test.wantBuiltinEnabled)
 			}
@@ -301,6 +303,160 @@ func TestNewBuiltinMetricsTracerFactory(t *testing.T) {
 	}
 }
 
+func TestConnectivityErrorCount(t *testing.T) {
+	ctx := context.Background()
+	project := "test-project"
+	instance := "test-instance"
+	appProfile := "test-app-profile"
+
+	// Increase sampling period to simulate potential delays
+	origSamplePeriod := defaultSamplePeriod
+	defaultSamplePeriod = 500 * time.Millisecond
+	defer func() {
+		defaultSamplePeriod = origSamplePeriod
+	}()
+
+	// Setup mock monitoring server
+	monitoringServer, err := NewMetricTestServer()
+	if err != nil {
+		t.Fatalf("Error setting up metrics test server: %v", err)
+	}
+	go monitoringServer.Serve()
+	defer monitoringServer.Shutdown()
+
+	// Override exporter options to connect to the mock server
+	origCreateExporterOptions := createExporterOptions
+	createExporterOptions = func(opts ...option.ClientOption) []option.ClientOption {
+		return []option.ClientOption{
+			option.WithEndpoint(monitoringServer.Endpoint),
+			option.WithoutAuthentication(),
+			option.WithGRPCDialOption(grpc.WithTransportCredentials(insecure.NewCredentials())),
+		}
+	}
+	defer func() {
+		createExporterOptions = origCreateExporterOptions
+	}()
+
+	// Control structure for mock server behavior during the specific ReadRows call.
+	// We use a channel to signal the interceptor that the ReadRows call under test is active.
+	readRowsCallActive := make(chan bool, 1)
+	var testSpecificAttemptCount int32
+
+	serverStreamInterceptor := func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
+		if strings.HasSuffix(info.FullMethod, "ReadRows") {
+			select {
+			case <-readRowsCallActive:
+				currentTestAttempt := atomic.AddInt32(&testSpecificAttemptCount, 1)
+				if currentTestAttempt == 1 {
+					// Put the token back for subsequent retries of this specific call.
+					readRowsCallActive <- true
+					return status.Error(codes.Unavailable, "Mock Unavailable error for connectivity test")
+				}
+				if currentTestAttempt == 2 {
+					header := metadata.New(map[string]string{
+						locationMDKey: string(testHeaders),
+					})
+					if errH := ss.SendHeader(header); errH != nil {
+						t.Errorf("[ServerInterceptor Attempt 2] Error sending header: %v", errH)
+					}
+
+					// Send a minimal successful message to ensure headers are processed by the client.
+					emptyResp := &btpb.ReadRowsResponse{}
+					if errS := ss.SendMsg(emptyResp); errS != nil {
+						t.Errorf("[ServerInterceptor Attempt 2] Error sending empty message: %v", errS)
+						return status.Errorf(codes.Internal, "mock server failed to send empty message: %v", errS)
+					}
+
+					readRowsCallActive <- true
+					return status.Error(codes.Unavailable, "Mock Unavailable error with location headers")
+				}
+
+				// On the third and final attempt, cause a non-retriable error.
+				atomic.StoreInt32(&testSpecificAttemptCount, 0)
+				// Do not put the token back, as this is the final attempt for this ReadRows sequence.
+				return status.Error(codes.Internal, "non-retriable error")
+			default:
+				return handler(srv, ss)
+			}
+		}
+		return handler(srv, ss)
+	}
+
+	config := ClientConfig{AppProfile: appProfile}
+	tbl, cleanup, gotErr := setupFakeServer(project, instance, config, grpc.StreamInterceptor(serverStreamInterceptor))
+	defer cleanup()
+	if gotErr != nil {
+		t.Fatalf("setupFakeServer error: got: %v, want: nil", gotErr)
+	}
+
+	// Pop out any old requests from the monitoring server to ensure a clean state.
+	monitoringServer.CreateServiceTimeSeriesRequests()
+	atomic.StoreInt32(&testSpecificAttemptCount, 0)
+
+	readRowsCallActive <- true
+
+	// Perform a read rows operation that will undergo a specific retry sequence:
+	// Attempt 1: Fails with Unavailable (no server headers) -> conn error count = 1
+	// Attempt 2: Fails with Unavailable (with location header) -> conn error count = 0
+	// Attempt 3: Fails with Internal (no server headers) -> conn error count = 1
+	// The overall operation fails with the final Internal error.
+	err = tbl.ReadRows(ctx, NewRange("a", "z"), func(r Row) bool { return true })
+	if err == nil {
+		t.Fatal("ReadRows: got nil error, want an error")
+	}
+	if status.Code(err) != codes.Internal {
+		t.Fatalf("ReadRows: got error code %v, want %v", status.Code(err), codes.Internal)
+	}
+
+	// Wait a bit for metrics to be exported. The defaultSamplePeriod is 500ms,
+	// so waiting slightly longer should be sufficient.
+	// If tests are flaky, this might need adjustment or a more sophisticated wait.
+	time.Sleep(defaultSamplePeriod + 200*time.Millisecond)
+
+	var totalConnectivityErrorsFromMetrics int64
+	statusesReported := make(map[string]int64)
+	foundConnErrMetricForTest := false
+
+	exportedMetricBatches := monitoringServer.CreateServiceTimeSeriesRequests()
+	for _, batch := range exportedMetricBatches {
+		for _, ts := range batch.TimeSeries {
+			if strings.HasSuffix(ts.Metric.Type, metricNameConnErrCount) {
+				methodLabel, ok := ts.Metric.Labels[metricLabelKeyMethod]
+				if !ok || methodLabel != "Bigtable.ReadRows" {
+					continue
+				}
+				foundConnErrMetricForTest = true
+				statusKey := ts.Metric.Labels[metricLabelKeyStatus]
+				for _, point := range ts.Points {
+					// Summing up values from points. For a counter, this is the delta.
+					// We expect each reported error to be a single point with a value of 1.
+					statusesReported[statusKey] += point.GetValue().GetInt64Value()
+					totalConnectivityErrorsFromMetrics += point.GetValue().GetInt64Value()
+				}
+			}
+		}
+	}
+
+	if !foundConnErrMetricForTest {
+		t.Fatalf("Metric %s for method Bigtable.ReadRows was not found in exported metrics. Batches received: %+v", metricNameConnErrCount, exportedMetricBatches)
+	}
+
+	if statusesReported[codes.Unavailable.String()] != 1 {
+		t.Errorf("Metric %s for status %s: got cumulative value %d, want 1. All statuses: %v",
+			metricNameConnErrCount, codes.Unavailable.String(), statusesReported[codes.Unavailable.String()], statusesReported)
+	}
+	if statusesReported[codes.Internal.String()] != 1 {
+		t.Errorf("Metric %s for status %s: got cumulative value %d, want 1. All statuses: %v",
+			metricNameConnErrCount, codes.Internal.String(), statusesReported[codes.Internal.String()], statusesReported)
+	}
+
+	// The total connectivity errors should be 2.
+	// Attempt 2 (Unavailable, with location) should not increment the error count.
+	if totalConnectivityErrorsFromMetrics != 2 {
+		t.Errorf("Metric %s: got cumulative value %d, want 2. Statuses reported: %v",
+			metricNameConnErrCount, totalConnectivityErrorsFromMetrics, statusesReported)
+	}
+}
 func setMockErrorHandler(t *testing.T, mockErrorHandler *MockErrorHandler) {
 	origErrHandler := otel.GetErrorHandler()
 	otel.SetErrorHandler(mockErrorHandler)
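
The test above waits with a fixed time.Sleep before reading the exporter, and its own comment flags that as a possible source of flakiness. The commit message also mentions polling for expected metrics; a generic helper of that shape might look like the sketch below. It is hypothetical, not a helper from this PR, and assumes only the test file's existing testing and time imports.

// pollUntil retries check every interval until it returns true or timeout
// elapses, failing the test otherwise. For the test above, a check function
// could drain monitoringServer.CreateServiceTimeSeriesRequests() and return
// true once a connectivity_error_count series for Bigtable.ReadRows appears.
func pollUntil(t *testing.T, timeout, interval time.Duration, check func() bool) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if check() {
			return
		}
		time.Sleep(interval)
	}
	t.Fatalf("condition not met within %v", timeout)
}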
