Search code examples
amazon-iam amazon-eks aws-cdk lets-encrypt cert-manager

Cannot sts:AssumeRole with a service account for CDK-generated EKS cluster


I deployed an EKS 1.21 cluster using CDK and then, using https://cert-manager.io/docs/installation/ as a guide, attempted to install cert-manager, with the end goal of using Let's Encrypt certificates for TLS-enabled services.

Creating IAM policies in my Stack's code:

...
        // Policy document for external DNS updates: one statement allows changing
        // record sets in any hosted zone, the other allows listing hosted zones and
        // record sets on all resources.
        var externalDnsPolicy = new PolicyDocument(new PolicyDocumentProps
        {
            Statements = new[]
            {
                // Write access to records, limited to hosted-zone ARNs.
                new PolicyStatement(new PolicyStatementProps
                {
                    Effect = Effect.ALLOW,
                    Actions = new[] { "route53:ChangeResourceRecordSets" },
                    Resources = new[] { "arn:aws:route53:::hostedzone/*" },
                }),
                // List operations, granted on "*".
                new PolicyStatement(new PolicyStatementProps
                {
                    Effect = Effect.ALLOW,
                    Actions = new[] { "route53:ListHostedZones", "route53:ListResourceRecordSets" },
                    Resources = new[] { "*" },
                }),
            },
        });
        // IAM role that carries externalDnsPolicy as its inline policy.
        // Its trust policy names the eks.amazonaws.com service principal.
        var AllowExternalDNSUpdatesRole = new Role(this, "AllowExternalDNSUpdatesRole", new RoleProps
        {
            RoleName = "AllowExternalDNSUpdatesRole",
            Description = "Route53 External DNS Role",
            AssumedBy = new ServicePrincipal("eks.amazonaws.com"),
            InlinePolicies = new Dictionary<string, PolicyDocument>
            {
                { "AllowExternalDNSUpdates", externalDnsPolicy },
            },
        });

        // Policy document for cert-manager: reading the status of Route 53 changes,
        // plus changing and listing record sets in any hosted zone.
        var certManagerPolicy = new PolicyDocument(
            new PolicyDocumentProps
            {
                Statements = new[]
                {
                    // Read the status of submitted change batches.
                    new PolicyStatement(
                        new PolicyStatementProps
                        {
                            Actions = new[] { "route53:GetChange" },
                            Resources = new[] { "arn:aws:route53:::change/*" },
                            Effect = Effect.ALLOW,
                        }
                    ),
                    // Change and list records, limited to hosted-zone ARNs.
                    new PolicyStatement(
                        new PolicyStatementProps
                        {
                            Actions = new[]
                            {
                                "route53:ChangeResourceRecordSets",
                                "route53:ListResourceRecordSets",
                            },
                            Resources = new[] { "arn:aws:route53:::hostedzone/*" },
                            Effect = Effect.ALLOW,
                        }
                    ),
                },
            }
        );
        // IAM role that carries certManagerPolicy as its inline policy.
        // Its trust policy names the eks.amazonaws.com service principal.
        var AllowCertManagerRole = new Role(this, "AllowCertManagerRole", new RoleProps
        {
            RoleName = "AllowCertManagerRole",
            Description = "Route53 Cert Manager Role",
            AssumedBy = new ServicePrincipal("eks.amazonaws.com"),
            InlinePolicies = new Dictionary<string, PolicyDocument>
            {
                { "AllowCertManager", certManagerPolicy },
            },
        });
...

And my cert issuer manifest:

---
# ServiceAccount named cert-issuer, annotated with the ARN of the IAM role it
# is meant to use (eks.amazonaws.com/role-arn).
# NOTE(review): no namespace is set here, while the ClusterRoleBinding in this
# manifest refers to the account in namespace "default" - confirm they match.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cert-issuer
  annotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::XREMOVEDX:role/AllowCertManagerRole
---
# Binds the ClusterRole named cert-issuer to the cert-issuer ServiceAccount in
# the default namespace.
# NOTE(review): no ClusterRole named cert-issuer is defined in this manifest;
# presumably it is created elsewhere - confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cert-issuer-viewer
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cert-issuer
subjects:
- kind: ServiceAccount
  name: cert-issuer
  namespace: default
---
# ACME ClusterIssuer using the Let's Encrypt production directory, with a
# Route 53 DNS-01 solver restricted to the sometls.net zone.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: sometls-net-letsencrypt
spec:
  acme:
    # Placeholder contact address: the original value was stripped by email
    # obfuscation, and the leftover "[email protected]" text parses as a YAML
    # list rather than a string. Replace with the real ACME account email.
    email: "user@example.com"
    preferredChain: ""
    server: https://acme-v02.api.letsencrypt.org/directory
    # Secret that holds the ACME account private key.
    privateKeySecretRef:
      name: sometls-net-letsencrypt-account-key
    solvers:
    - dns01:
        route53:
          hostedZoneID: Z999999999999
          region: us-east-2
          # NOTE(review): this names AllowExternalDNSUpdatesRole, whereas the
          # ServiceAccount annotation and the reported error both name
          # AllowCertManagerRole - confirm which role is intended here.
          role: arn:aws:iam::XREMOVEDX:role/AllowExternalDNSUpdatesRole
      selector:
        dnsZones:
        - sometls.net
---
# Certificate for the wildcard name *.sometls.net, issued by the
# sometls-net-letsencrypt ClusterIssuer.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: sometls-cluster-lets-encrypt
spec:
  # NOTE(review): spelled "somtls" here but "sometls" everywhere else in this
  # manifest - confirm the secret name is intentional.
  secretName: somtls-cluster-lets-encrypt
  issuerRef:
    name: sometls-net-letsencrypt
    kind: ClusterIssuer
    group: cert-manager.io
  subject:
    organizations:
      - sometls
  dnsNames:
    - "*.sometls.net"

But I'm getting spammed with these errors, and cert-manager doesn't work:

(combined from similar events): Error presenting challenge: error instantiating route53 challenge solver: unable to assume role: AccessDenied: User: arn:aws:sts::XREMOVEDX:assumed-role/EksStackEast-EksClusterNodegroupDefaultC-U7IJ1PNZ2123/i-007c425b7a5e39123 is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::XREMOVEDX:role/AllowCertManagerRole status code: 403, request id: 2bd885a2-97a0-4a21-b017-40e099cb4123

I'm very unsure how IAM roles allow a Kubernetes ServiceAccount to assume them. I must be missing some connecting piece that lets the magic of EKS IAM Roles for Service Accounts (IRSA) happen.

Please help!

UPDATE: Using CfnJson I am able to create the role so it looks like this:

{
    "Role": {
        "Path": "/",
        "RoleName": "AllowCertManagerRole",
        "RoleId": "REDACTED",
        "Arn": "arn:aws:iam::REDACTED:role/AllowCertManagerRole",
        "CreateDate": "2022-03-24T21:42:32+00:00",
        "AssumeRolePolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Federated": "arn:aws:iam::REDACTED:oidc-provider/oidc.eks.us-east-2.amazonaws.com/id/REDACTED"
                    },
                    "Action": "sts:AssumeRoleWithWebIdentity",
                    "Condition": {
                        "StringLike": {
                            "oidc.eks.us-east-2.amazonaws.com/id/REDACTED:sub": "system:serviceaccount:*:cert-issuer"
                        }
                    }
                }
            ]
        },
        "Description": "Route53 Cert Manager Role",
        "MaxSessionDuration": 3600,
        "Tags": [
            {
                "Key": "dynasty",
                "Value": "sometls-1.0"
            }
        ],
        "RoleLastUsed": {}
    }
}

I'm still getting the same errors. The condition in the new Role uses the "StringLike" operator. Not sure if that is correct or not, and I'm not sure how to avoid needing to use a non-derived lvalue when setting up the IDictionary<string, object> for the conditions. Also-- the error message is the same in that it expects to be able to sts:AssumeRole not sts:AssumeRoleWithWebIdentity ... I tried changing the action in the Role to sts:AssumeRole with the same effect.

UPDATE #2:

The actual problem was that I had missed a modification to the cert-manager install manifests that is required for AWS IRSA to work: https://cert-manager.io/docs/configuration/acme/dns01/route53/#service-annotation ... It turns out that is really important.

For anyone who wants to see how to add an OIDC provider as an AssumedBy principal with Conditions in C#, see the snippet below. I would have thought there would be a convenience method in AWS CDK that would take care of these machinations automatically. I couldn't find it...

...
        var Cluster = new Cluster(this,"EksCluster", new ClusterProps
        { ... });
...
        // Condition body for the role's trust policy, wrapped in CfnJson because the
        // key is built from the cluster's OIDC issuer value. It matches a service
        // account named cert-manager in any namespace.
        var CertIssuerCondition = new CfnJson(
            this,
            "CertIssuerCondition",
            new CfnJsonProps
            {
                Value = new Dictionary<string, object>
                {
                    [$"{Cluster.ClusterOpenIdConnectIssuer}:sub"] = "system:serviceaccount:*:cert-manager",
                },
            }
        );

        // Policy document for cert-manager: reading the status of Route 53 changes,
        // changing and listing record sets in any hosted zone, and looking up hosted
        // zones by name.
        var certManagerPolicy = new PolicyDocument(
            new PolicyDocumentProps
            {
                Statements = new[]
                {
                    // Read the status of submitted change batches.
                    new PolicyStatement(
                        new PolicyStatementProps
                        {
                            Actions = new[] { "route53:GetChange" },
                            Resources = new[] { "arn:aws:route53:::change/*" },
                            Effect = Effect.ALLOW,
                        }
                    ),
                    // Change and list records, limited to hosted-zone ARNs.
                    new PolicyStatement(
                        new PolicyStatementProps
                        {
                            Actions = new[]
                            {
                                "route53:ChangeResourceRecordSets",
                                "route53:ListResourceRecordSets",
                            },
                            Resources = new[] { "arn:aws:route53:::hostedzone/*" },
                            Effect = Effect.ALLOW,
                        }
                    ),
                    // Hosted-zone lookup by name, granted on "*".
                    new PolicyStatement(
                        new PolicyStatementProps
                        {
                            Actions = new[] { "route53:ListHostedZonesByName" },
                            Resources = new[] { "*" },
                            Effect = Effect.ALLOW,
                        }
                    ),
                },
            }
        );
        // IAM role that carries certManagerPolicy as its inline policy. The trust
        // policy is federated to the cluster's OIDC provider and allows
        // sts:AssumeRoleWithWebIdentity under the StringLike condition held in
        // CertIssuerCondition.
        var AllowCertManagerRole = new Role(this, "AllowCertManagerRole", new RoleProps
        {
            RoleName = "AllowCertManagerRole",
            Description = "Route53 Cert Manager Role",
            AssumedBy = new FederatedPrincipal(
                Cluster.OpenIdConnectProvider.OpenIdConnectProviderArn,
                new Dictionary<string, object>
                {
                    { "StringLike", CertIssuerCondition },
                },
                "sts:AssumeRoleWithWebIdentity"),
            InlinePolicies = new Dictionary<string, PolicyDocument>
            {
                { "AllowCertManager", certManagerPolicy },
            },
        });

Solution

  • The trust relationship of your IAM role looks wrong to me.

    You need to use a federated principal pointing to the OIDC provider of your EKS cluster, ideally with a condition that correctly reflects your service account and namespace names.

    The principal has to look something like this:

    // Namespace and name of the Kubernetes service account allowed to assume the role.
    const namespaceName = 'cert-manager';
    const serviceAccountName = 'cert-issuer';

    // Issuer URL of the cluster's OIDC provider, without the https:// scheme.
    // If you're deploying EKS with CloudFormation/CDK you could for example export the OIDC issuer URL and get it with Fn.importValue(...) in your stack.
    const oidcProviderUrl = 'oidc.eks.YOUR-REGION.amazonaws.com/id/REDACTED';

    // You can use wildcards for the namespace name and/or service account name if you want to have a less restrictive condition
    // (in that case use the StringLike operator below instead of StringEquals).
    const conditionValue = `system:serviceaccount:${namespaceName}:${serviceAccountName}`;

    // CfnJson lets the condition key be built from a value that is only known at deploy time (e.g. an imported issuer URL).
    const roleCondition = new CfnJson(this.stack, 'CertIssuerRoleCondition', {
        value: { [`${oidcProviderUrl}:sub`]: conditionValue },
    });

    // If you're deploying EKS with CloudFormation/CDK you could for example export the OIDC provider ARN and get it with Fn.importValue(...) in your stack.
    const oidcProviderArn = 'arn:aws:iam::REDACTED:oidc-provider/oidc.eks.YOUR-REGION.amazonaws.com/id/REDACTED';

    // The conditions argument is an IAM Condition block: it maps a condition operator to the
    // { key: value } pairs to test, so the CfnJson must be nested under an operator.
    const principal = new FederatedPrincipal(
        oidcProviderArn,
        { StringEquals: roleCondition },
        'sts:AssumeRoleWithWebIdentity',
    );

    // Now use that principal for your IAM role.