
Unable to SSH into EC2 instance and unable to place ECS task


Given the following terraform.tf file:

provider "aws" {
  profile = "default"
  region = "us-east-1"
}


locals {
  vpc_name = "some-vpc-name"
  dev_vpn_source = "*.*.*.*/32"  # Instead of * I have the CIDR block of our VPN here
}

resource "aws_vpc" "vpc" {
  cidr_block = "10.0.0.0/16"
  enable_dns_hostnames = true
  tags = {
    Name: local.vpc_name
  }
}


resource "aws_subnet" "a" {
  cidr_block = "10.0.0.0/17"
  vpc_id = aws_vpc.vpc.id
  tags = {
    Name: "${local.vpc_name}-a"
  }
}

resource "aws_subnet" "b" {
  cidr_block = "10.0.128.0/17"
  vpc_id = aws_vpc.vpc.id
  tags = {
    Name: "${local.vpc_name}-b"
  }
}

resource "aws_security_group" "ssh" {
  name = "${local.vpc_name}-ssh"
  vpc_id = aws_vpc.vpc.id
  tags = {
    Name: "${local.vpc_name}-ssh"
  }
}


resource "aws_security_group_rule" "ingress-ssh" {
  from_port = 22
  protocol = "ssh"
  security_group_id = aws_security_group.ssh.id
  to_port = 22
  type = "ingress"
  cidr_blocks = [local.dev_vpn_source]
  description = "SSH access for developer"
}


resource "aws_security_group" "outbound" {
  name = "${local.vpc_name}-outbound"
  vpc_id = aws_vpc.vpc.id
  tags = {
    Name: "${local.vpc_name}-outbound"
  }
}


resource "aws_security_group_rule" "egress" {
  from_port = 0
  protocol = "all"
  security_group_id = aws_security_group.outbound.id
  to_port = 0
  type = "egress"
  cidr_blocks = ["0.0.0.0/0"]
  description = "All outbound allowed"
}

module "ecs-clusters" {
  source = "./ecs-clusters/"
  subnets = [aws_subnet.a, aws_subnet.b]
  vpc_name = local.vpc_name
  security_groups = [aws_security_group.ssh, aws_security_group.outbound]
}

And the following ecs-clusters/ecs-cluster.tf file:

variable "vpc_name" {
  type = string
}

variable "subnets" {
  type = list(object({
    id: string
  }))
}

variable "security_groups" {
  type = list(object({
    id: string
  }))
}


data "aws_ami" "amazon_linux_ecs" {
  most_recent = true
  owners = ["amazon"]
  filter {
    name   = "name"
    values = ["amzn2-ami-ecs*"]
  }
}

resource "aws_iam_instance_profile" "ecs-launch-profile" {
  name = "${var.vpc_name}-ecs"
  role = "ecsInstanceRole"
}

resource "aws_launch_template" "ecs" {
  name          = "${var.vpc_name}-ecs"
  image_id      = data.aws_ami.amazon_linux_ecs.id
  instance_type = "r5.4xlarge"
  key_name = "some-ssh-key-name"
  iam_instance_profile {
    name = "${var.vpc_name}-ecs"
  }
  block_device_mappings {
    device_name = "/dev/xvda"
    ebs {
      volume_type = "gp3"
      volume_size = 1024
      delete_on_termination = false
    }
  }
  network_interfaces {
    associate_public_ip_address = true
    subnet_id = var.subnets[0].id
    security_groups = var.security_groups[*].id
  }
  update_default_version = true
}

resource "aws_autoscaling_group" "ecs-autoscaling_group" {
  name = "${var.vpc_name}-ecs"
  vpc_zone_identifier = [for subnet in var.subnets: subnet.id]
  desired_capacity   = 1
  max_size           = 1
  min_size           = 1
  protect_from_scale_in = true
  launch_template {
    id = aws_launch_template.ecs.id
    version = aws_launch_template.ecs.latest_version
  }
  tag {
    key = "Name"
    propagate_at_launch = true
    value = "${var.vpc_name}-ecs"
  }
  depends_on = [aws_launch_template.ecs]
}

resource "aws_ecs_capacity_provider" "ecs-capacity-provider" {
  name = var.vpc_name

  auto_scaling_group_provider {
    auto_scaling_group_arn         = aws_autoscaling_group.ecs-autoscaling_group.arn
    managed_termination_protection = "ENABLED"

    managed_scaling {
      maximum_scaling_step_size = 1
      minimum_scaling_step_size = 1
      status                    = "ENABLED"
      target_capacity           = 1
    }
  }
  depends_on = [aws_autoscaling_group.ecs-autoscaling_group]
}


resource "aws_ecs_cluster" "ecs-cluster" {
  name = var.vpc_name
  capacity_providers = [aws_ecs_capacity_provider.ecs-capacity-provider.name]
  depends_on = [aws_ecs_capacity_provider.ecs-capacity-provider]
}

resource "aws_iam_role" "ecs-execution" {
  name = "${var.vpc_name}-ecs-execution"
  assume_role_policy = <<EOF
{
 "Version": "2012-10-17",
 "Statement": [
   {
     "Action": "sts:AssumeRole",
     "Principal": {
       "Service": "ecs-tasks.amazonaws.com"
     },
     "Effect": "Allow",
     "Sid": ""
   }
 ]
}
EOF
}

resource "aws_iam_role" "ecs" {
  name = "${var.vpc_name}-ecs"

  assume_role_policy = <<EOF
{
 "Version": "2012-10-17",
 "Statement": [
   {
     "Action": "sts:AssumeRole",
     "Principal": {
       "Service": "ecs-tasks.amazonaws.com"
     },
     "Effect": "Allow",
     "Sid": ""
   }
 ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "execution-role" {
  role       = aws_iam_role.ecs-execution.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

resource "aws_iam_role_policy_attachment" "role" {
  role       = aws_iam_role.ecs.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
}

I'm facing two problems:

  • I can't SSH into the EC2 instance created by the autoscaling group, even though I'm using the same SSH key and VPN that work for my other EC2 instances. My VPN client configuration includes a route to the target machine via the VPN gateway.
  • I can't run a task on the ECS cluster. The task gets stuck in the PROVISIONING status and then fails with "Unable to run task". The task is configured to use 1 GB of RAM and 1 vCPU.

What am I doing wrong?


Solution

Based on the comments, there were two issues with the original setup:

  1. Lack of connectivity to the ECS and ECR services. This was solved by enabling internet access in the VPC (see the first sketch after this list). If internet access is not desired, VPC interface endpoints for ECS, ECR and S3 can be used instead.
  2. The container instances did not register with ECS. This was fixed by using user_data to bootstrap the instances so that they register with the ECS cluster (see the second sketch below).
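For the first fix, here is a minimal sketch of what enabling internet access can look like in terraform.tf: an internet gateway plus a public route table associated with both subnets. The resource names igw and public are illustrative, not taken from the original setup.

resource "aws_internet_gateway" "igw" {
  vpc_id = aws_vpc.vpc.id
  tags = {
    Name: local.vpc_name
  }
}

resource "aws_route_table" "public" {
  vpc_id = aws_vpc.vpc.id

  # Send all non-local traffic through the internet gateway
  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.igw.id
  }
}

resource "aws_route_table_association" "a" {
  subnet_id      = aws_subnet.a.id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table_association" "b" {
  subnet_id      = aws_subnet.b.id
  route_table_id = aws_route_table.public.id
}

Since the launch template already sets associate_public_ip_address = true, the instances become reachable once this route exists, and the ECS agent can reach the ECS and ECR endpoints.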
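For the second fix, a sketch of the user_data bootstrap added to the existing aws_launch_template in ecs-clusters/ecs-cluster.tf. Only the new argument is shown; everything else stays as in the question.

resource "aws_launch_template" "ecs" {
  # ... existing arguments from the question ...

  # Tell the ECS agent which cluster to join; without this it registers
  # with the "default" cluster, so the capacity provider never sees the
  # instance and tasks stay in PROVISIONING. The cluster is named
  # var.vpc_name, and that value is reused here because referencing
  # aws_ecs_cluster.ecs-cluster.name would create a dependency cycle
  # through the capacity provider and the autoscaling group.
  user_data = base64encode(<<-EOF
    #!/bin/bash
    echo "ECS_CLUSTER=${var.vpc_name}" >> /etc/ecs/ecs.config
  EOF
  )
}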