I have a Zuul API Gateway (as packaged by Spring Cloud) running with the configuration below:
# Eureka Client Config to register ZUUL with Eureka
eureka:
client:
healthcheck:
enabled: true
lease:
duration: 5
service-url:
defaultZone: http://localhost:8761/eureka/
# Ribbon Global Config
# Defaults for every ribbon client; the ingestService section below sets its own values for most of these keys.
ribbon:
OkToRetryOnAllOperations: false
ReadTimeout: 30000
ConnectTimeout: 1000
MaxTotalHttpConnections: 1600
MaxConnectionsPerHost: 800
MaxAutoRetries: 11
MaxAutoRetriesNextServer: 111
# Ribbon Named Client Config for Ingest API
ingestService:
ribbon:
# Eureka lookup is disabled for this client; servers come from the static listOfServers entry below.
eureka:
enabled: false
NIWSServerListClassName: com.netflix.loadbalancer.ConfigurationBasedServerList
listOfServers: http://test-nlb-zuul-us-west-2c-6af11a3ede8a872a.elb.us-west-2.amazonaws.com
OkToRetryOnAllOperations: true
MaxAutoRetries: 1
MaxAutoRetriesNextServer: 1
MaxTotalHttpConnections: 500
MaxConnectionsPerHost: 200
# NOTE(review): presumably the response codes the load-balanced retry policy treats as retryable - confirm.
retryableStatusCodes: 500, 501, 502, 503
ReadTimeout: 10000
ConnectTimeout: 1000
# Zuul Routes
zuul:
debug:
request: true
parameter: true
host: # timeout config for direct URL based requests from Zuul to external URLs
connect-timeout-millis: 10000
socket-timeout-millis: 20000
ignored-services: '*'
routes:
# /ingest/** is routed to the ingestService ribbon client configured above, with the prefix kept and retry enabled.
ingest:
path: /ingest/**
retryable: true
stripPrefix: false
serviceId: ingestService
management.security.enabled : false
spring:
application:
name: zuul-gateway
cloud:
loadbalancer:
retry:
enabled: true
# DEBUG logging for the retry, HTTP client, ribbon, eureka and discovery categories.
logging:
level:
org:
springframework:
retry: DEBUG
apache:
http: DEBUG
com:
netflix:
ribbon: DEBUG
eureka: DEBUG
discovery: DEBUG
# Default Hystrix command settings: THREAD isolation with a 30000 ms timeout (same value as the global ribbon ReadTimeout).
hystrix:
command:
default:
execution:
isolation:
strategy: THREAD
thread:
timeoutInMilliseconds: 30000
When I hit the zuul /ingest endpoint, the request gets redirected to the server listed under the ribbon serviceId configuration for ingestService. However, I see that the ribbon retry configuration is completely ignored.
When the server temporarily returns HTTP 500 errors, both the global ribbon config - ribbon.MaxAutoRetries=11
and the named client config - ingestService.ribbon.MaxAutoRetries=1
are ignored. Retries happen exactly 10 times, and I have no idea where that retry count comes from; it seems to contradict the available documentation. I am not sure where to start debugging, as I am new to the whole Netflix ecosystem of tools. I wanted to check whether I am making a configuration error. Please advise.
Here is my pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mycomp</groupId>
<artifactId>zuul-gateway</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>zuul-gateway</name>
<description>Spring Boot Zuul</description>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.9.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<!-- Spring Cloud release train; used as the version of the spring-cloud-dependencies BOM imported in dependencyManagement -->
<spring-cloud.version>Edgware.SR1</spring-cloud.version>
</properties>
<!-- No dependency below declares a version; versions are left to the parent POM and the imported BOM -->
<dependencies>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-zuul</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-eureka</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-netflix-ribbon</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- NOTE(review): presumably needed on the classpath for the load-balancer retry that the YAML enables - confirm -->
<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<!-- Imports the Spring Cloud BOM at ${spring-cloud.version} -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-dependencies</artifactId>
<version>${spring-cloud.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
and the spring application startup
package com.mycomp.zuulgateway;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cloud.client.circuitbreaker.EnableCircuitBreaker;
import org.springframework.cloud.client.discovery.EnableDiscoveryClient;
import org.springframework.cloud.netflix.zuul.EnableZuulProxy;
import org.springframework.context.annotation.Bean;
/**
 * Bootstrap class for the Zuul gateway application.
 *
 * <p>The annotations mark this class as the Spring Boot application and switch on the
 * Zuul proxy, the discovery client and the circuit breaker.
 */
@SpringBootApplication
@EnableZuulProxy
@EnableDiscoveryClient
@EnableCircuitBreaker
public class ZuulGatewayApplication {

    /**
     * Launches the Spring application using this class as the configuration source.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        final Class<?> bootstrapClass = ZuulGatewayApplication.class;
        SpringApplication.run(bootstrapClass, args);
    }
}
When I started this project, I used
spring-cloud-dependencies
at version Edgware.SR1.
Because of this, per the pom/BOM declarations, my Ribbon client was using spring-cloud-netflix-core-1.4.2.RELEASE.jar
.
In spring-cloud-netflix-core-1.4.2.RELEASE.jar
, the class org.springframework.cloud.netflix.ribbon.apache.RetryableRibbonLoadBalancingHttpClient
has a bug
in the following method:
/**
 * Executes the given request through the delegate HTTP client, wrapped in a retry callback.
 *
 * <p>Connect timeout, socket (read) timeout and redirect handling are read from
 * {@code configOverride} when it is non-null, otherwise from this client's own config,
 * falling back to this client's field values as defaults.
 *
 * <p>NOTE: this is the version quoted from spring-cloud-netflix-core 1.4.2.RELEASE. It contains
 * the defect marked below: the request rebuilt for the retry context's service instance is
 * discarded before the call is made.
 *
 * @param request the request to send
 * @param configOverride per-call client config; may be null, in which case this client's config is used
 * @return the delegate's response wrapped together with the URI that was called
 * @throws Exception whatever the retry execution or the delegate client throws; inside the callback a
 *         {@code RetryableStatusCodeException} is thrown when the retry policy reports the response
 *         status code as retryable
 */
public RibbonApacheHttpResponse execute(final RibbonApacheHttpRequest request, final IClientConfig configOverride) throws Exception {
Builder builder = RequestConfig.custom();
// Prefer the per-call override; otherwise use this client's own config.
IClientConfig config = configOverride != null ? configOverride : this.config;
builder.setConnectTimeout((Integer)config.get(CommonClientConfigKey.ConnectTimeout, this.connectTimeout));
builder.setSocketTimeout((Integer)config.get(CommonClientConfigKey.ReadTimeout, this.readTimeout));
builder.setRedirectsEnabled((Boolean)config.get(CommonClientConfigKey.FollowRedirects, this.followRedirects));
final RequestConfig requestConfig = builder.build();
// The retry policy is created per call for this client's name.
final LoadBalancedRetryPolicy retryPolicy = this.loadBalancedRetryPolicyFactory.create(this.getClientName(), this);
RetryCallback retryCallback = new RetryCallback() {
public RibbonApacheHttpResponse doWithRetry(RetryContext context) throws Exception {
RibbonApacheHttpRequest newRequest = request;
// When the retry context carries a service instance, rebuild the URI with that instance's
// scheme, host and port while keeping user info, path, query and fragment of the request.
if (context instanceof LoadBalancedRetryContext) {
ServiceInstance service = ((LoadBalancedRetryContext)context).getServiceInstance();
if (service != null) {
newRequest = newRequest.withNewUri(new URI(service.getUri().getScheme(), newRequest.getURI().getUserInfo(), service.getHost(), service.getPort(), newRequest.getURI().getPath(), newRequest.getURI().getQuery(), newRequest.getURI().getFragment()));
}
}
// BUG: the newRequest built in the if block above is not passed to getSecureRequest() below;
// the original request is passed instead, so the rebuilt URI is overwritten and lost.
newRequest = RetryableRibbonLoadBalancingHttpClient.this.getSecureRequest(request, configOverride);
// the above should have been -
// newRequest = RetryableRibbonLoadBalancingHttpClient.this.getSecureRequest(newRequest, configOverride);
HttpUriRequest httpUriRequest = newRequest.toRequest(requestConfig);
HttpResponse httpResponse = ((CloseableHttpClient)RetryableRibbonLoadBalancingHttpClient.this.delegate).execute(httpUriRequest);
// A status code the policy reports as retryable is turned into an exception (after closing
// the response when it is closeable); any other response is returned to the caller.
if (retryPolicy.retryableStatusCode(httpResponse.getStatusLine().getStatusCode())) {
if (CloseableHttpResponse.class.isInstance(httpResponse)) {
((CloseableHttpResponse)httpResponse).close();
}
throw new RetryableStatusCodeException(RetryableRibbonLoadBalancingHttpClient.this.clientName, httpResponse.getStatusLine().getStatusCode());
} else {
return new RibbonApacheHttpResponse(httpResponse, httpUriRequest.getURI());
}
}
};
return this.executeWithRetry(request, retryPolicy, retryCallback);
}
Because of the bug in the code above, although the ribbon configuration (for maxAutoRetries and maxAutoRetriesNextServer) was set from the YAML file, the request that had been rebuilt with the next server's URI was discarded. As a result, every retry went to the same server, which caused side effects.
This seems to be fixed in the spring-cloud-netflix-core-1.4.3.RELEASE.jar
.
So, updating the pom/BOM
spring-cloud-dependencies
to version Edgware.SR2
updated the ribbon client dependency to spring-cloud-netflix-core-1.4.3.RELEASE.jar
and the issue is now resolved.