Search code examples
azure azure-cosmosdb azure-cosmosdb-sqlapi

CosmosException with status code 0 while trying to read from CosmosDB


I'm using the synchronous API of the Azure Cosmos DB Java SDK (version 4.52.0) to read items from a container. I frequently get a CosmosException, but the exception contains no detail that indicates what is actually going wrong.

I've obscured some of the specific details of cosmos db account and business logic.

My Cosmos DB account has its main region in US East (read and write enabled) and a second region in Japan East (read only). Depending on where the application is deployed, it checks the active Spring Boot profile (activeProfile) and, when deployed in Japan East, adds a list of preferred regions. The code is given below.

Configuration used to access cosmos db:

cosmosdb.endpoint = https://redacted.documents.azure.com:443/
cosmosdb.database = database1
cosmosdb.container = container1
cosmosdb.max-retry-wait-time = PT1S
cosmosdb.max-retry-attempts-on-throttled-requests = 0
cosmosdb.content-response-on-write-enabled = false
cosmosdb.cosmos-item-request-options.cosmos-end-to-end-operation-latency-policy-config-duration = PT0.020S

cosmosdb.connection-timeout = PT5S
cosmosdb.idle-connection-timeout = PT5M
cosmosdb.max-connections-per-endpoint = 100
cosmosdb.max-requests-per-connection = 30

Code used to configure:

import com.azure.cosmos.ConsistencyLevel;
import com.azure.cosmos.CosmosClient;
import com.azure.cosmos.CosmosClientBuilder;
import com.azure.cosmos.CosmosContainer;
import com.azure.cosmos.CosmosEndToEndOperationLatencyPolicyConfig;
import com.azure.cosmos.CosmosEndToEndOperationLatencyPolicyConfigBuilder;
import com.azure.cosmos.GatewayConnectionConfig;
import com.azure.cosmos.ThrottlingRetryOptions;
import com.azure.cosmos.models.CosmosItemRequestOptions;
import com.azure.cosmos.models.CosmosPatchItemRequestOptions;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.time.Duration;
import java.util.List;

@Slf4j
@Configuration
public class CosmosClientConfigurer {
    @Bean
    GatewayConnectionConfig
    getGatewayConnectionConfig(@Value("${cosmosdb.idle-connection-timeout}") final Duration idleConnectionTimeout,
                               @Value("${cosmosdb.max-connections-per-endpoint}") final int maxConnectionsPerEndpoint) {

        final var gatewayConnectionConfig = new GatewayConnectionConfig()
                .setIdleConnectionTimeout(idleConnectionTimeout)
                .setMaxConnectionPoolSize(maxConnectionsPerEndpoint);

        log.info("GatewayConnectionConfig: {}", gatewayConnectionConfig);
        return gatewayConnectionConfig;
    }

    @Bean
    ThrottlingRetryOptions getThrottlingRetryOptions(
            @Value("${cosmosdb.max-retry-wait-time}") final Duration maxRetryWaitTime,
            @Value("${cosmosdb.max-retry-attempts-on-throttled-requests}") final int maxRetryAttemptsOnThrottledRequests) {

        final var throttlingRetryOptions = new ThrottlingRetryOptions()
                .setMaxRetryWaitTime(maxRetryWaitTime)
                .setMaxRetryAttemptsOnThrottledRequests(maxRetryAttemptsOnThrottledRequests);

        log.info("ThrottlingRetryOptions: {}", throttlingRetryOptions);
        return throttlingRetryOptions;
    }

    @Bean
    CosmosItemRequestOptions getCosmosItemRequestOptions(
            @Value("${cosmosdb.cosmos-item-request-options.cosmos-end-to-end-operation-latency-policy-config-duration}") final Duration duration) {

        final CosmosEndToEndOperationLatencyPolicyConfig latencyPolicyConfig =
                new CosmosEndToEndOperationLatencyPolicyConfigBuilder(duration).build();
        final CosmosItemRequestOptions cosmosItemRequestOptions = new CosmosPatchItemRequestOptions();
        cosmosItemRequestOptions.setCosmosEndToEndOperationLatencyPolicyConfig(latencyPolicyConfig);
        return cosmosItemRequestOptions;
    }

    CosmosContainer getUserProfileContainer(final CosmosClient cosmosClient,
                                            @Value("${cosmosdb.database}") final String database,
                                            @Value("${cosmosdb.container}") final String container) {

        return getCosmosContainer(cosmosClient, database, container);
    }


    private CosmosContainer getCosmosContainer(final CosmosClient cosmosClient,
                                               final String database,
                                               final String container) {
        return cosmosClient
                .getDatabase(database)
                .getContainer(container);
    }
    
    @Bean
    CosmosClient getCosmosClient(@Value("${cosmosdb.endpoint}") final String endpoint,
                                 @Value("${cosmosdb.content-response-on-write-enabled}") final boolean contentResponseOnWriteEnabled,
                                 @Value("${cosmosdb.key}") final String key,
                                 @Value("${spring.profiles.active}") final String activeProfile,
                                 final GatewayConnectionConfig gatewayConnectionConfig,
                                 final ThrottlingRetryOptions throttlingRetryOptions) {
    
            final List<String> japanEastRegion = List.of("Japan East");
            final CosmosClientBuilder cosmosClientBuilder = new CosmosClientBuilder()
                .endpoint(endpoint)
                .key(key)
                .throttlingRetryOptions(throttlingRetryOptions)
                .gatewayMode(gatewayConnectionConfig)
                .contentResponseOnWriteEnabled(contentResponseOnWriteEnabled)
                .consistencyLevel(ConsistencyLevel.EVENTUAL)

            if (activeProfile.equals("jpe"))
                cosmosClientBuilder.preferredRegions(japanEastRegion)

            return cosmosClientBuilder.buildClient()
    }
}

Code used to call:

/**
 * Reads the item with the given id and records read metrics.
 *
 * <p>On success the client-side latency, the latency reported by the response, the request
 * charge and the status code are recorded. A {@link CosmosException} is counted by status
 * code; any other exception is counted as an error. In both failure cases the result is empty.
 *
 * @param id id of the item to read
 * @return the item, or an empty {@link Optional} when the read failed or returned no item
 */
public Optional<CosmosData> read(final String id) {

    try {
        // System.nanoTime() is monotonic, so the measured latency cannot be skewed
        // by wall-clock adjustments the way Instant.now() differences can.
        final long startNanos = System.nanoTime();

        final CosmosItemResponse<CosmosData> cosmosItemResponse = readFromCosmos(id);

        final Duration clientSideLatency = Duration.ofNanos(System.nanoTime() - startNanos);

        cosmosMetrics.recordClientSideReadLatency(clientSideLatency);
        cosmosMetrics.recordServerSideReadLatency(cosmosItemResponse.getDuration());
        cosmosMetrics.recordReadRus(cosmosItemResponse.getRequestCharge());
        cosmosMetrics.countContainerStatusCodes(cosmosItemResponse.getStatusCode());

        // ofNullable: a response without an item maps to empty instead of raising an NPE
        // that would be miscounted as an error by the generic catch below.
        return Optional.ofNullable(cosmosItemResponse.getItem());
    } catch (CosmosException e) {
        cosmosMetrics.countContainerStatusCodes(e.getStatusCode());
    } catch (Exception e) {
        cosmosMetrics.incrementErrorsForUserProfile(e);
    }
    return Optional.empty();
}
    
    
/**
 * Reads a single item, using {@code id} both as the item id and as the partition key value.
 *
 * <p>A {@link CosmosException} is logged and then rethrown unchanged: at INFO level for the
 * status codes listed in {@link #isExpectedStatusCode(int)}, at ERROR level otherwise.
 *
 * @param id item id, also used as the partition key value
 * @return the read response
 * @throws CosmosException when the read fails
 */
public CosmosItemResponse<CosmosData> readFromCosmos(final String id) throws CosmosException {

    final PartitionKey partitionKey = new PartitionKey(id);

    try {
        return cosmosContainer.readItem(
                id,
                partitionKey,
                cosmosItemRequestOptions,
                CosmosData.class
        );
    } catch (CosmosException e) {

        if (isExpectedStatusCode(e.getStatusCode())) {
            log.info("Exception while reading for id: {}: {}", id, e.getMessage());
        } else {
            // Log status, sub-status and diagnostics and pass the exception itself as the
            // trailing argument so the stack trace and cause are preserved; the message
            // alone gives no hint for failures such as status code 0.
            log.error("CosmosException for id {}: statusCode={}, subStatusCode={}, diagnostics={}",
                    id, e.getStatusCode(), e.getSubStatusCode(), e.getDiagnostics(), e);
        }
        throw e;
    }
}

/** Tells whether a status code is one of those logged at INFO rather than ERROR level. */
private static boolean isExpectedStatusCode(final int statusCode) {
    switch (statusCode) {
        case 404:
        case 408:
        case 409:
        case 410:
        case 429:
        case 503:
            return true;
        default:
            return false;
    }
}

A sample CosmosException is given below:

{
  "innerErrorMessage": null,
  "cosmosDiagnostics": {
    "userAgent": "azsdk-java-cosmos/4.52.0 Linux/5.15.0-1051-azure JRE/21.0.1",
    "activityId": "4f02cde3-c9c9-4c09-af6b-258f4d82db5d",
    "requestLatencyInMs": 0,
    "requestStartTimeUTC": "2023-12-15T18:23:24.381516148Z",
    "requestEndTimeUTC": "2023-12-15T18:23:24.381762054Z",
    "responseStatisticsList": [],
    "supplementalResponseStatisticsList": [],
    "addressResolutionStatistics": {},
    "regionsContacted": [
      "japan east"
    ],
    "retryContext": {
      "statusAndSubStatusCodes": null,
      "retryLatency": 0,
      "retryCount": 0
    },
    "metadataDiagnosticsContext": {
      "metadataDiagnosticList": null
    },
    "serializationDiagnosticsContext": {
      "serializationDiagnosticsList": null
    },
    "gatewayStatisticsList": [
      {
        "sessionToken": null,
        "operationType": "Read",
        "resourceType": "Document",
        "statusCode": 0,
        "subStatusCode": 0,
        "requestCharge": 0,
        "requestTimeline": [
          {
            "eventName": "connectionAcquired",
            "startTimeUTC": "2023-12-15T18:23:24.381635051Z",
            "durationInMilliSecs": 0.122203
          },
          {
            "eventName": "connectionConfigured",
            "startTimeUTC": null,
            "durationInMilliSecs": 0
          },
          {
            "eventName": "requestSent",
            "startTimeUTC": null,
            "durationInMilliSecs": 0
          },
          {
            "eventName": "transitTime",
            "startTimeUTC": null,
            "durationInMilliSecs": 0
          },
          {
            "eventName": "received",
            "startTimeUTC": null,
            "durationInMilliSecs": 0
          }
        ],
        "partitionKeyRangeId": null,
        "responsePayloadSizeInBytes": 0,
        "exceptionResponseHeaders": "{}"
      }
    ],
    "samplingRateSnapshot": 1,
    "systemInformation": {
      "usedMemory": "20807154 KB",
      "availableMemory": "10650126 KB",
      "systemCpuLoad": "(2023-12-15T18:22:55.851238339Z 40.2%), (2023-12-15T18:23:00.851247379Z 37.0%), (2023-12-15T18:23:05.851238334Z 35.5%), (2023-12-15T18:23:10.851252759Z 38.7%), (2023-12-15T18:23:15.851245468Z 42.3%), (2023-12-15T18:23:20.851240443Z 45.6%)",
      "availableProcessors": 31
    },
    "clientCfgs": {
      "id": 2,
      "machineId": "vmId_17b8a761-9046-429e-8c32-42505a66b26c",
      "connectionMode": "GATEWAY",
      "numberOfClients": 2,
      "excrgns": "[]",
      "clientEndpoints": {
        "https://redacted.documents.azure.com:443/": 2
      },
      "connCfg": {
        "rntbd": null,
        "gw": "(cps:10, nrto:PT1M, icto:PT5M, p:false)",
        "other": "(ed: true, cs: false, rv: true)"
      },
      "consistencyCfg": "(consistency: Eventual, mm: true, prgns: [japaneast])",
      "proactiveInit": "",
      "e2ePolicyCfg": ""
    }
  }
}
  • I think this request was never actually sent: in the request timeline, every event except the first (connectionAcquired) has a null start time. Even the event named requestSent is null.
  • Also, requestStartTimeUTC and requestEndTimeUTC fall within the same second (in fact, less than a millisecond apart). Did the request start and end almost instantly without ever reaching Cosmos DB?

Solution

  • The exceptions occur only when using Gateway mode to connect to Cosmos DB. When I connect in Direct mode, these errors do not occur.

    Configuration:

    cosmosdb.endpoint = https://redacted.documents.azure.com:443/
    cosmosdb.database = database1
    cosmosdb.container = container1
    cosmosdb.max-retry-wait-time = PT1S
    cosmosdb.max-retry-attempts-on-throttled-requests = 0
    cosmosdb.content-response-on-write-enabled = false
    cosmosdb.cosmos-item-request-options.cosmos-end-to-end-operation-latency-policy-config-duration = PT0.020S
    
    cosmosdb.connect-timeout = PT5S
    cosmosdb.idle-endpoint-timeout = PT1H
    cosmosdb.idle-connection-timeout = PT5M
    cosmosdb.network-request-timeout = PT5S
    cosmosdb.max-connections-per-endpoint = 512
    cosmosdb.max-requests-per-connection = 1
    cosmosdb.connection-endpoint-rediscovery-enabled = true
    

    Code to configure:

        /**
         * Creates the {@link DirectConnectionConfig} bean, applying each {@code cosmosdb.*}
         * property to the matching setter, and logs the resulting configuration.
         *
         * @param connectTimeout                      value of {@code cosmosdb.connect-timeout}
         * @param idleEndpointTimeout                 value of {@code cosmosdb.idle-endpoint-timeout}
         * @param idleConnectionTimeout               value of {@code cosmosdb.idle-connection-timeout}
         * @param networkRequestTimeout               value of {@code cosmosdb.network-request-timeout}
         * @param maxConnectionsPerEndpoint           value of {@code cosmosdb.max-connections-per-endpoint}
         * @param maxRequestsPerConnection            value of {@code cosmosdb.max-requests-per-connection}
         * @param connectionEndpointRediscoveryEnabled value of
         *                                            {@code cosmosdb.connection-endpoint-rediscovery-enabled}
         * @return the populated configuration
         */
        @Bean
        DirectConnectionConfig
        getDirectConnectionConfig(@Value("${cosmosdb.connect-timeout}") final Duration connectTimeout,
                                  @Value("${cosmosdb.idle-endpoint-timeout}") final Duration idleEndpointTimeout,
                                  @Value("${cosmosdb.idle-connection-timeout}") final Duration idleConnectionTimeout,
                                  @Value("${cosmosdb.network-request-timeout}") final Duration networkRequestTimeout,
                                  @Value("${cosmosdb.max-connections-per-endpoint}") final int maxConnectionsPerEndpoint,
                                  @Value("${cosmosdb.max-requests-per-connection}") final int maxRequestsPerConnection,
                                  @Value("${cosmosdb.connection-endpoint-rediscovery-enabled}") final boolean connectionEndpointRediscoveryEnabled) {

            final DirectConnectionConfig config = new DirectConnectionConfig();

            // Timeouts.
            config.setConnectTimeout(connectTimeout);
            config.setIdleEndpointTimeout(idleEndpointTimeout);
            config.setIdleConnectionTimeout(idleConnectionTimeout);
            config.setNetworkRequestTimeout(networkRequestTimeout);

            // Connection limits.
            config.setMaxConnectionsPerEndpoint(maxConnectionsPerEndpoint);
            config.setMaxRequestsPerConnection(maxRequestsPerConnection);

            config.setConnectionEndpointRediscoveryEnabled(connectionEndpointRediscoveryEnabled);

            log.info("DirectConnectionConfig: {}", config);
            return config;
        }