AWS Network Load Balancer health checks not behaving as expected


I have an issue where the Network Load Balancer health check reports my targets as unhealthy. The targets are AWS Fargate tasks.

The health check port and the application port are both 8080. Everything works perfectly if I place the ECS Fargate service in a security group with port 8080 open to 0.0.0.0/0: the health checks succeed and the application works.

The problem is that I do not want the application to be reachable from the public internet, and when I try to restrict access by IP address I run into issues. Because both the health check and application access happen on port 8080, I have to restrict the rule so that the health check is allowed only from the private IP addresses of the Network Load Balancer.

According to the AWS documentation (https://docs.aws.amazon.com/elasticloadbalancing/latest/network/target-group-register-targets.html#target-security-groups) this should be possible: it says you can add the VPC CIDR to the security group rules. I did that and the health checks fail. I also tried adding the private IP addresses of the Network Load Balancer to the security group, and that fails as well.

I am completely out of ideas.

Could someone verify whether it is even possible to limit health check traffic to the private IP address range of the Network Load Balancer?

Here is most of the Terraform code:

## security groups. This is the security group that the Fargate cluster service is placed in

resource "aws_security_group" "nlb_sg_health_test" {
  vpc_id = aws_vpc.test_vpc.id

  ingress {
    from_port   = 8080
    to_port     = 8080
    protocol    = "tcp"
    cidr_blocks = [
      "10.167.0.0/16"
    ]
    description = "Allow connection from NLB to app healthcheck"
  }

  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

}
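
If the VPC CIDR is broader than you want, the NLB's private IP addresses can also be looked up dynamically instead of hard-coded. A sketch of that approach, assuming the NLB's network interfaces can be found by their description (untested against this exact setup):

# NLB network interfaces carry a description of the form "ELB net/<name>/<id>",
# which matches the load balancer's arn_suffix attribute.
data "aws_network_interfaces" "nlb" {
  filter {
    name   = "description"
    values = ["ELB ${aws_lb.test_nlb.arn_suffix}"]
  }
}

data "aws_network_interface" "nlb" {
  for_each = toset(data.aws_network_interfaces.nlb.ids)
  id       = each.value
}

# These /32 blocks could then replace the VPC CIDR in the ingress rule above:
# cidr_blocks = [for eni in data.aws_network_interface.nlb : "${eni.private_ip}/32"]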

resource "aws_vpc" "test_vpc" {
    cidr_block                       = "10.167.0.0/16"

    tags = {
     "Name" = "${var.prefix}"
   }
}

## private subnets

resource "aws_subnet" "test_eu_west_private_1a" {
  cidr_block              = "10.167.10.0/24"
  map_public_ip_on_launch = false
  vpc_id                  = aws_vpc.test_vpc.id
  availability_zone       = "${data.aws_region.current.name}a"

  tags = {
    "Name" = "${var.prefix} private eu-west-1a"
  }
}


resource "aws_subnet" "test_eu_west_private_1b" {
  cidr_block              = "10.167.20.0/24"
  map_public_ip_on_launch = false
  vpc_id                  = aws_vpc.test_vpc.id
  availability_zone       = "${data.aws_region.current.name}b"

  tags = {
    "Name" = "${var.prefix} private eu-west-1b"
  }
}

resource "aws_route_table" "test_private" {
  vpc_id = aws_vpc.test_vpc.id

  tags = {
    "Name" = "${var.prefix} private a"
  }
}

resource "aws_route_table_association" "test_main_a" {
  subnet_id      = aws_subnet.test_eu_west_private_1a.id
  route_table_id = aws_route_table.test_private.id
}

resource "aws_route_table_association" "test_main_b" {
  subnet_id      = aws_subnet.test_eu_west_private_1b.id
  route_table_id = aws_route_table.test_private.id
}

## public subnets

resource "aws_subnet" "test_eu_west_public_1a" {
  cidr_block              = "10.167.1.0/24"
  map_public_ip_on_launch = true
  vpc_id                  = aws_vpc.test_vpc.id
  availability_zone       = "${data.aws_region.current.name}a"

  tags = {
    "Name" = "${var.prefix} public eu-west-1a"
  }
}


resource "aws_subnet" "test_eu_west_public_1b" {
  cidr_block              = "10.167.2.0/24"
  map_public_ip_on_launch = true
  vpc_id                  = aws_vpc.test_vpc.id
  availability_zone       = "${data.aws_region.current.name}b"

  tags = {
    "Name" = "${var.prefix} public eu-west-1b"
  }
}

resource "aws_route_table" "test_public" {
  vpc_id = aws_vpc.test_vpc.id

  tags = {
    "Name" = "${var.prefix} public a"
  }
}

resource "aws_route_table_association" "test_public_a" {
  subnet_id      = aws_subnet.test_eu_west_public_1a.id
  route_table_id = aws_route_table.test_public.id
}

resource "aws_route_table_association" "test_public_b" {
  subnet_id      = aws_subnet.test_eu_west_public_1b.id
  route_table_id = aws_route_table.test_public.id
}


resource "aws_internet_gateway" "test_igw" {
  vpc_id = aws_vpc.test_vpc.id

  tags = {
    Name = "${var.prefix} IGW"
  }
}

resource "aws_route" "public_internet_access_a" {
  route_table_id            = aws_route_table.test_public.id
  destination_cidr_block    = "0.0.0.0/0"
  gateway_id                = aws_internet_gateway.test_igw.id
}

resource "aws_route" "public_internet_access_b" {
  route_table_id            = aws_route_table.test_public.id
  destination_cidr_block    = "0.0.0.0/0"
  gateway_id                = aws_internet_gateway.test_igw.id
}

## NLB

resource "aws_lb_target_group" "test_http_nlb_target_group" {
  name     = "${var.prefix}-target-group-http"
  port     = 8080
  protocol = "TCP"
  vpc_id   = aws_vpc.test_vpc.id
  target_type = "ip"
  preserve_client_ip = true
}
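
Since the question is about health check behaviour, it can also help to spell out the health check settings instead of relying on the target group defaults. A sketch of an explicit block that could be added inside the resource above; the values are illustrative and roughly mirror the NLB defaults:

  # Hypothetical explicit health check inside the target group above.
  health_check {
    protocol            = "TCP"
    port                = "traffic-port" # check the same port the traffic uses (8080)
    healthy_threshold   = 3
    unhealthy_threshold = 3
    interval            = 30
  }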

resource "aws_lb_listener" "test_https_nlb_listener" {
  load_balancer_arn = aws_lb.test_nlb.arn
  port              = 443
  protocol          = "TLS"
  ssl_policy        = "ELBSecurityPolicy-2016-08"
  certificate_arn   = aws_acm_certificate.app.arn

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.test_http_nlb_target_group.arn
  }
}

resource "aws_lb" "test_nlb" {
  name               = "${var.prefix}-nlb"
  load_balancer_type = "network"

  subnet_mapping {
    subnet_id     = aws_subnet.test_eu_west_public_1a.id
    allocation_id = aws_eip.nlb_public_ip_a.id
  }

  subnet_mapping {
    subnet_id     = aws_subnet.test_eu_west_public_1b.id
    allocation_id = aws_eip.nlb_public_ip_b.id
  }
}

## ECS

resource "aws_ecs_service" "main" {
  name            = var.prefix
  cluster         = aws_ecs_cluster.test_cluster.id
  task_definition = aws_ecs_task_definition.app.arn
  desired_count   = 1
  launch_type     = "FARGATE"
  enable_execute_command = true
  health_check_grace_period_seconds = 300

  network_configuration {
    security_groups  = [
      aws_security_group.nlb_sg_health_test.id
    ]
    subnets          = [
      aws_subnet.test_eu_west_private_1a.id,
      aws_subnet.test_eu_west_private_1b.id
    ]
    assign_public_ip = false
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.test_http_nlb_target_group.arn
    container_name   = var.test_frontend_container_name
    container_port   = 8080
  }
}

Solution

  • I don't know if this can be accepted as an answer, but there was something wrong with my AWS setup. I recreated the entire infrastructure with Terraform in a different VPC, and now everything works as expected.

    So to answer one part of my question: yes, it is possible to limit the health check traffic from the Network Load Balancer to the Fargate tasks as described in the AWS documentation. This works even with an "internet-facing" Network Load Balancer. Just add the VPC CIDR or the private IP addresses of the load balancer to the security group.

    I am not sure what the problem was in my original setup, but after recreating everything it now works.
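
    For concreteness, both variants look like this in the Fargate service's security group (the /32 addresses below are placeholders, not my actual NLB IPs):

      ingress {
        from_port = 8080
        to_port   = 8080
        protocol  = "tcp"
        # Option 1: the whole VPC CIDR
        cidr_blocks = ["10.167.0.0/16"]
        # Option 2 (tighter): the NLB's private IPs as /32s, e.g.
        # cidr_blocks = ["10.167.1.25/32", "10.167.2.40/32"]
      }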