Tags: docker, kubernetes, google-cloud-platform, google-kubernetes-engine, kubernetes-helm

ImagePullBackOff error when deploying app to GKE


I'm trying to deploy a Dockerised Python app to GKE using Helm, with the image hosted in GCP Artifact Registry. I'm hitting this error:

rpc error: code = Unknown desc = failed to pull and unpack image "us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app:latest": failed to resolve reference "us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app:latest": failed to authorize: failed to fetch oauth token: unexpected status from GET request to https://us-central1-docker.pkg.dev/v2/token?scope=repository%3Amyapp-dev-xxxxxx%2Fmyapp-repo%2Fmyapp-app%3Apull&service=us-central1-docker.pkg.dev: 403 Forbidden

My Terraform config is based on https://antonputra.com/google/create-gke-cluster-using-terraform/ and is as follows:

resource "google_container_cluster" "primary" {
    name = "primary"
    location = "us-central1-b" # region or availablility zone. Should be region for prod
    remove_default_node_pool = true
    initial_node_count = 1
    network = google_compute_network.main.self_link
    subnetwork = google_compute_subnetwork.private.self_link
    logging_service = "logging.googleapis.com/kubernetes"
    monitoring_service = "monitoring.googleapis.com/kubernetes"
    networking_mode = "VPC_NATIVE"

    addons_config {
        http_load_balancing {
            disabled = true
        }
        horizontal_pod_autoscaling {
            disabled = false
        }
    }

    release_channel {
        channel = "REGULAR"
    }

    workload_identity_config {
        workload_pool = "myapp-dev-xxxxxx.svc.id.goog"
    }

    ip_allocation_policy {
        cluster_secondary_range_name = "k8s-pod-range"
        services_secondary_range_name = "k8s-service-range"
    }

    private_cluster_config {
        enable_private_nodes = true
        enable_private_endpoint = false
        master_ipv4_cidr_block = "172.16.0.0/28"
    }
}

resource "google_service_account" "kubernetes" {
    account_id = "kubernetes"
}

resource "google_container_node_pool" "general" {
    name = "general"
    cluster = google_container_cluster.primary.id
    node_count = 1

    management {
        auto_repair = true
        auto_upgrade = true

    }

    node_config {
        preemptible = false
        machine_type = "e2-small"

        labels = {
            role = "general"
        }

        service_account = google_service_account.kubernetes.email
        oauth_scopes = [
            "https://www.googleapis.com/auth/cloud-platform"
        ]
    }
}

resource "google_container_node_pool" "spot" {
    name = "spot"
    cluster = google_container_cluster.primary.id
    node_count = 1

    management {
        auto_repair = true
        auto_upgrade = true

    }

    autoscaling {
        min_node_count = 0
        max_node_count = 10
    }

    node_config {
        preemptible = true # Do not use for prod pool - can be taken offline by GCP at any time
        machine_type = "e2-small"
        
        labels = {
            team = "devops"
        }

        taint {
            key = "instance_type"
            value = "spot"
            effect = "NO_SCHEDULE"
        }

        service_account = google_service_account.kubernetes.email
        oauth_scopes = [
            "https://www.googleapis.com/auth/cloud-platform"
        ]
    }
}

resource "google_service_account" "artifact_registry_service_account" {
  account_id   = "artifact-registry-svc-acct"
  display_name = "Artifact Registry Service Account"
}

resource "google_project_iam_member" "artifact_registry_service_account_reader" {
  project = "myapp-dev-xxxxxx"
  role    = "roles/artifactregistry.reader"
  member  = "serviceAccount:${google_service_account.artifact_registry_service_account.email}"
}

resource "google_service_account_key" "artifact_registry_service_account_key" {
  service_account_id = google_service_account.artifact_registry_service_account.name
  public_key_type    = "TYPE_X509_PEM_FILE"
}

resource "local_file" "artifact_registry_service_account_key_file" {
  content  = base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)
  filename = "${path.module}/artifact-registry-service-account-key.json"
}

resource "kubernetes_secret" "artifact_registry_json_key" {
  metadata {
    name = "artifact-registry-json-key"
  }

  data = {
    ".dockerconfigjson" = jsonencode({
      "auths" = {
        "us-central1-docker.pkg.dev" = {
          "username" = "_json_key"
          "password" = google_service_account_key.artifact_registry_service_account_key.private_key
          "email" = google_service_account.artifact_registry_service_account.email
          "auth" = base64encode("_json_key:${google_service_account_key.artifact_registry_service_account_key.private_key}")
        }
      }
    })
  }
}
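
Two things I noticed about this secret while debugging, though neither may be the root issue: the provider's private_key attribute comes back base64-encoded, and as far as I can tell the kubelet only honours imagePullSecrets whose type is kubernetes.io/dockerconfigjson (the resource above defaults to Opaque). A corrected sketch of the same secret would be:

# Sketch only: the same secret with an explicit docker-config type and the
# key material decoded. "_json_key" is the fixed username Artifact Registry
# expects for key-based authentication.
resource "kubernetes_secret" "artifact_registry_json_key" {
  metadata {
    name = "artifact-registry-json-key"
  }

  type = "kubernetes.io/dockerconfigjson"

  data = {
    ".dockerconfigjson" = jsonencode({
      "auths" = {
        "us-central1-docker.pkg.dev" = {
          "username" = "_json_key"
          "password" = base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)
          "email"    = google_service_account.artifact_registry_service_account.email
          "auth"     = base64encode("_json_key:${base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)}")
        }
      }
    })
  }
}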

resource "google_artifact_registry_repository" "docker_repository" {
  provider = google
  location = "us-central1"
  repository_id = "myapp-repo"
  description = "Docker Repository"
  format = "DOCKER"

  labels = {
    env = "dev"
  }
}

In my helm values.yaml I put:

image:
  repository: us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: "latest"

imagePullSecrets:
  - name: artifact-registry-json-key
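
I actually install the chart with the helm CLI, but for reference the equivalent wiring from Terraform would look roughly like this. It's just a sketch: the hashicorp/helm provider setup, the ./charts/myapp chart path and the namespace are assumptions on my part.

# Sketch only: installing the chart from Terraform rather than the helm CLI.
# Pull secrets are namespaced, so the kubernetes_secret above and the release
# must land in the same namespace for imagePullSecrets to resolve.
resource "helm_release" "myapp" {
  name      = "myapp"
  chart     = "./charts/myapp" # assumed chart location
  namespace = "default"

  set {
    name  = "image.repository"
    value = "us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app"
  }

  set {
    name  = "image.tag"
    value = "latest"
  }

  set {
    name  = "imagePullSecrets[0].name"
    value = kubernetes_secret.artifact_registry_json_key.metadata[0].name
  }
}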

I can build and push to Artifact Registry successfully using Docker, and my image is in the newly created repo. I can pull the image from my CLI using both my main account and the service account declared in the Terraform file. I get the same error if I try to deploy to GKE through the UI, so I'm not sure why it doesn't even work with my own credentials, which have the default Owner and Organization Administrator roles.

My hunch is that the secret isn't being used somehow (I've tried deploying with kubectl instead of Helm and get the same error), or that there's some difference between how the Docker CLI and Kubernetes pull the image, which would explain why one works and the other doesn't.

Could it be that the service account needs additional permissions? But if that's the case, why don't my Owner credentials work when deploying through the UI? I'm struggling to think of other issues that could cause this.

Worth noting that I had the same issue with GCR, but switched to Artifact Registry anyway because of the deprecation. Cloud Run is able to pull the image, so the problem seems specific to GKE.


Solution

  • This was solved following DazWilkin's comment above: the node pools' service account (the identity the kubelet uses to pull images) was missing the Artifact Registry reader role. I added the following to my Terraform configuration:

    resource "google_project_iam_member" "k8s_account_artifact_registry_reader" {
      project = "myapp-dev-xxxxxx"
      role    = "roles/artifactregistry.reader"
      member  = "serviceAccount:[email protected]"
    }
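
  • A narrower variant of the same fix (just a sketch, not what I actually applied) scopes the grant to the single repository instead of the whole project:

    # Sketch: repository-scoped read access for the node pool's service account.
    resource "google_artifact_registry_repository_iam_member" "k8s_account_repo_reader" {
      project    = "myapp-dev-xxxxxx"
      location   = google_artifact_registry_repository.docker_repository.location
      repository = google_artifact_registry_repository.docker_repository.name
      role       = "roles/artifactregistry.reader"
      member     = "serviceAccount:${google_service_account.kubernetes.email}"
    }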