Search code examples
Tags: netflix-zuul, spring-cloud-netflix, spring-retry

Ribbon Retry Config not working


I have Spring Cloud packaged Zuul API Gateway running with the below configuration -

# Eureka Client Config to register ZUUL with Eureka
# NOTE: `client` must be nested under `eureka`; at column 0 it becomes an
# unrelated top-level key and none of the settings below bind to eureka.client.*
eureka:
  client:
    healthcheck:
      enabled: true
    lease:
      duration: 5
    service-url:
      defaultZone: http://localhost:8761/eureka/

# Ribbon Global Config
ribbon:
  OkToRetryOnAllOperations: false
  ReadTimeout: 30000
  ConnectTimeout: 1000
  MaxTotalHttpConnections: 1600
  MaxConnectionsPerHost: 800
  MaxAutoRetries: 11
  MaxAutoRetriesNextServer: 111

# Ribbon Named Client Config for Ingest API
ingestService:
  ribbon:
    eureka:
      enabled: false
    NIWSServerListClassName: com.netflix.loadbalancer.ConfigurationBasedServerList
    listOfServers: http://test-nlb-zuul-us-west-2c-6af11a3ede8a872a.elb.us-west-2.amazonaws.com
    OkToRetryOnAllOperations: true
    MaxAutoRetries: 1
    MaxAutoRetriesNextServer: 1
    MaxTotalHttpConnections: 500
    MaxConnectionsPerHost: 200
    retryableStatusCodes: 500, 501, 502, 503
    ReadTimeout: 10000
    ConnectTimeout: 1000

# Zuul Routes
zuul:
  debug:
    request: true
    parameter: true
  host: # timeout config for direct URL based requests from Zuul to external URLs
    connect-timeout-millis: 10000
    socket-timeout-millis: 20000
  ignored-services: '*'
  routes:
    ingest:
      path: /ingest/**
      retryable: true
      stripPrefix: false
      serviceId: ingestService

management.security.enabled : false

spring:
  application:
    name: zuul-gateway
  cloud:
    loadbalancer:
      retry:
        enabled: true

logging:
  level:
    org:
      springframework:
        retry: DEBUG
      apache:
        http: DEBUG
    com:
      netflix:
        ribbon: DEBUG
        eureka: DEBUG
        discovery: DEBUG

hystrix:
  command:
    default:
      execution:
        isolation:
          strategy: THREAD
          thread:
            timeoutInMilliseconds: 30000

When I hit the zuul /ingest endpoint, the request gets redirected to the server listed under the ribbon serviceId configuration for ingestService. However, I see that the ribbon retry configuration is completely ignored.

When the server temporarily returns HTTP 500 errors, both the global ribbon config (ribbon.MaxAutoRetries=11) and the named client config (ingestService.ribbon.MaxAutoRetries=1) are ignored. I see exactly 10 retries happening, and I have no clue where that retry count is coming from; it seems to contradict the available documentation. I am not sure where to start debugging, as I am new to the whole Netflix ecosystem of tools. I wanted to check whether I am making a configuration error. Please advise.

Here is my pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.mycomp</groupId>
    <artifactId>zuul-gateway</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>zuul-gateway</name>
    <description>Spring Boot Zuul</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.9.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <spring-cloud.version>Edgware.SR1</spring-cloud.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-zuul</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-eureka</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-netflix-ribbon</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.retry</groupId>
            <artifactId>spring-retry</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
        </dependency>
    </dependencies>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.cloud</groupId>
                <artifactId>spring-cloud-dependencies</artifactId>
                <version>${spring-cloud.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

and the spring application startup

package com.mycomp.zuulgateway;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cloud.client.circuitbreaker.EnableCircuitBreaker;
import org.springframework.cloud.client.discovery.EnableDiscoveryClient;
import org.springframework.cloud.netflix.zuul.EnableZuulProxy;
import org.springframework.context.annotation.Bean;

/**
 * Spring Boot entry point for the Zuul gateway application.
 *
 * <p>The class is annotated to turn on the Zuul proxy, the discovery client
 * and the circuit breaker; it declares no beans or configuration of its own.
 */
@SpringBootApplication
@EnableZuulProxy
@EnableDiscoveryClient
@EnableCircuitBreaker
public class ZuulGatewayApplication {

    /**
     * Starts the application by handing this class and the command-line
     * arguments to {@link SpringApplication#run(Class, String...)}.
     *
     * @param args command-line arguments, passed through unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(ZuulGatewayApplication.class, args);
    }
}

Solution

  • When I started this project, I started with

    • spring-cloud-dependencies with version Edgware.SR1

    Because of this, as per the pom/bom declarations - my ribbon application client seems to be using the spring-cloud-netflix-core-1.4.2.RELEASE.jar.

    In spring-cloud-netflix-core-1.4.2.RELEASE.jar, the class org.springframework.cloud.netflix.ribbon.apache.RetryableRibbonLoadBalancingHttpClient has a bug.

    The defect is in the following method (decompiled; the offending line is marked with a comment):

    /**
     * Executes the given request through the retry machinery.
     *
     * <p>Builds a {@code RequestConfig} (connect timeout, socket timeout,
     * redirect handling) from {@code configOverride} when it is non-null,
     * otherwise from this client's own config, creates a retry policy for this
     * client, and delegates to {@code executeWithRetry} with a callback that
     * performs one HTTP attempt.
     *
     * <p>NOTE: this is the version quoted from spring-cloud-netflix-core
     * 1.4.2.RELEASE and is reproduced unchanged; it contains the defect marked
     * inside the callback below.
     *
     * @param request        the request to send
     * @param configOverride per-call client config; when {@code null} the
     *                       client's own config is used
     * @return the response of the first attempt whose status code is not
     *         considered retryable by the retry policy
     * @throws Exception propagated from the retry execution
     */
    public RibbonApacheHttpResponse execute(final RibbonApacheHttpRequest request, final IClientConfig configOverride) throws Exception {
        Builder builder = RequestConfig.custom();
        // Per-call override wins; the instance fields are the fallback defaults.
        IClientConfig config = configOverride != null ? configOverride : this.config;
        builder.setConnectTimeout((Integer)config.get(CommonClientConfigKey.ConnectTimeout, this.connectTimeout));
        builder.setSocketTimeout((Integer)config.get(CommonClientConfigKey.ReadTimeout, this.readTimeout));
        builder.setRedirectsEnabled((Boolean)config.get(CommonClientConfigKey.FollowRedirects, this.followRedirects));
        final RequestConfig requestConfig = builder.build();
        final LoadBalancedRetryPolicy retryPolicy = this.loadBalancedRetryPolicyFactory.create(this.getClientName(), this);
        // One invocation of doWithRetry == one HTTP attempt.
        RetryCallback retryCallback = new RetryCallback() {
            public RibbonApacheHttpResponse doWithRetry(RetryContext context) throws Exception {
                RibbonApacheHttpRequest newRequest = request;
                // If the retry context carries a service instance, re-target the
                // request at that instance's scheme/host/port, keeping the original
                // user-info, path, query and fragment.
                if (context instanceof LoadBalancedRetryContext) {
                    ServiceInstance service = ((LoadBalancedRetryContext)context).getServiceInstance();
                    if (service != null) {
                        newRequest = newRequest.withNewUri(new URI(service.getUri().getScheme(), newRequest.getURI().getUserInfo(), service.getHost(), service.getPort(), newRequest.getURI().getPath(), newRequest.getURI().getQuery(), newRequest.getURI().getFragment()));
                    }
                }
    
                // BUG (1.4.2.RELEASE): the original `request` is passed here instead of
                // `newRequest`, so the re-targeted URI computed in the if-block above is
                // overwritten and discarded.
                newRequest = RetryableRibbonLoadBalancingHttpClient.this.getSecureRequest(request, configOverride);
                // The line above should have been:
                // newRequest = RetryableRibbonLoadBalancingHttpClient.this.getSecureRequest(newRequest, configOverride);
    
                HttpUriRequest httpUriRequest = newRequest.toRequest(requestConfig);
                HttpResponse httpResponse = ((CloseableHttpClient)RetryableRibbonLoadBalancingHttpClient.this.delegate).execute(httpUriRequest);
                // A status code the policy marks as retryable is turned into an
                // exception (after closing the response) rather than returned.
                if (retryPolicy.retryableStatusCode(httpResponse.getStatusLine().getStatusCode())) {
                    if (CloseableHttpResponse.class.isInstance(httpResponse)) {
                        ((CloseableHttpResponse)httpResponse).close();
                    }
    
                    throw new RetryableStatusCodeException(RetryableRibbonLoadBalancingHttpClient.this.clientName, httpResponse.getStatusLine().getStatusCode());
                } else {
                    return new RibbonApacheHttpResponse(httpResponse, httpUriRequest.getURI());
                }
            }
        };
        return this.executeWithRetry(request, retryPolicy, retryCallback);
    }
    

    Because of the bug in the code above, although the ribbon configuration (maxAutoRetries and maxAutoRetriesNextServer) was set correctly from the yaml file, the update to the request object with the next server's URI was discarded. As a result, every retry went to the same server, which caused the unexpected side effects.

    This seems to be fixed in the spring-cloud-netflix-core-1.4.3.RELEASE.jar.

    So, updating the pom/bom

    • spring-cloud-dependencies with version Edgware.SR2

    updated the ribbon client dependency to spring-cloud-netflix-core-1.4.3.RELEASE.jar and this issue is now resolved.