I am trying to deploy a Glue crawler
for an S3 bucket. Unfortunately I can't manage to find an appropriate IAM role that allows the crawler to run. The permissions I need are just to read/write to S3, plus logs:PutLogEvents, but somehow I am not getting it right.
Here is my code. It can be deployed, but the crawler
does not have permissions to run:
from aws_cdk import (
aws_events as events,
aws_lambda as lambda_,
aws_events_targets as targets,
aws_iam as iam,
aws_glue as glue,
core
)
class MyStack(core.Stack):
    """Stack that provisions a Glue crawler over an S3 prefix, the IAM role it
    runs as, and a scheduled Glue trigger.
    """

    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        """Create the crawler role, the scheduled trigger and the crawler.

        Args:
            scope: Parent construct this stack is attached to.
            id: Construct id of the stack.
            **kwargs: Forwarded unchanged to ``core.Stack``.
        """
        super().__init__(scope, id, **kwargs)

        # The second positional argument of iam.Role is only a construct id;
        # putting a policy ARN there attaches nothing. Permissions have to be
        # passed explicitly through managed_policies / inline_policies.
        # NOTE: the construct id was renamed from the ARN-like string, which
        # changes the CloudFormation logical id (an already-deployed role
        # would be replaced on the next deploy).
        glue_role = iam.Role(
            self, 'GlueCrawlerRole',
            assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                # AWS-managed Glue service policy.
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole'
                ),
            ],
            inline_policies={
                # Same S3 statement as the working console-created role:
                # object read/write limited to the crawled prefix.
                'CrawlerS3Access': iam.PolicyDocument(
                    statements=[
                        iam.PolicyStatement(
                            effect=iam.Effect.ALLOW,
                            actions=['s3:GetObject', 's3:PutObject'],
                            resources=['arn:aws:s3:::random_s3/units*'],
                        ),
                    ],
                ),
            },
        )

        # NOTE(review): this action starts a *job* named "glue_crawler-daily",
        # which is not defined in this stack, while the trigger is made to
        # depend on the crawler below. If the intent is to start the crawler,
        # the action presumably needs "crawlerName" instead -- TODO confirm.
        glue_trigger = glue.CfnTrigger(
            self, "glue-daily-trigger",
            name="etl-trigger",
            schedule="cron(5 * * * ? *)",  # every hour at X.05, every day
            type="SCHEDULED",
            actions=[
                {
                    "jobName": "glue_crawler-daily"
                }
            ],
            start_on_creation=True,
        )

        crawler_name = 'crawler_units_data'
        glue_crawler = glue.CfnCrawler(
            self, crawler_name,
            name=crawler_name,
            database_name='data_science',
            # The crawler expects the ARN of the *role* it assumes, not the
            # ARN of a policy.
            role=glue_role.role_arn,
            targets={"s3Targets": [{"path": "s3://random_s3/units/"}]},
        )

        # Make CloudFormation create the crawler before the trigger.
        glue_trigger.add_depends_on(glue_crawler)
I tried several things, including translating code from JavaScript examples like this one, but the methods called in the JavaScript examples do not map 100% to Python.
This role (created from the GUI) works correctly and has 2 policies.
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::random_s3/units*"
]
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"glue:*",
"s3:GetBucketLocation",
"s3:ListBucket",
"s3:ListAllMyBuckets",
"s3:GetBucketAcl",
"ec2:DescribeVpcEndpoints",
"ec2:DescribeRouteTables",
"ec2:CreateNetworkInterface",
"ec2:DeleteNetworkInterface",
"ec2:DescribeNetworkInterfaces",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSubnets",
"ec2:DescribeVpcAttribute",
"iam:ListRolePolicies",
"iam:GetRole",
"iam:GetRolePolicy",
"cloudwatch:PutMetricData"
],
"Resource": [
"*"
]
},
{
"Effect": "Allow",
"Action": [
"s3:CreateBucket"
],
"Resource": [
"arn:aws:s3:::aws-glue-*"
]
},
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject"
],
"Resource": [
"arn:aws:s3:::aws-glue-*/*",
"arn:aws:s3:::*/*aws-glue-*/*"
]
},
{
"Effect": "Allow",
"Action": [
"s3:GetObject"
],
"Resource": [
"arn:aws:s3:::crawler-public*",
"arn:aws:s3:::aws-glue-*"
]
},
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:*:*:/aws-glue/*"
]
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateTags",
"ec2:DeleteTags"
],
"Condition": {
"ForAllValues:StringEquals": {
"aws:TagKeys": [
"aws-glue-service-resource"
]
}
},
"Resource": [
"arn:aws:ec2:*:*:network-interface/*",
"arn:aws:ec2:*:*:security-group/*",
"arn:aws:ec2:*:*:instance/*"
]
}
]
}
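As it turns out, I needed to pass the name and the policy in a different way: the second argument to iam.Role is only the construct id, so the role name goes in role_name and the policy has to be attached through managed_policies: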
# Look up the AWS-managed Glue service policy by name so it can be attached
# to the role below.
glue_service_policy = iam.ManagedPolicy.from_aws_managed_policy_name(
    'service-role/AWSGlueServiceRole'
)

# Role assumed by the Glue service; the second argument is just the construct
# id, while the actual IAM role name is given by role_name.
glue_role = iam.Role(
    self,
    'glue_role_id2323',
    role_name='Rolename',
    assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
    managed_policies=[glue_service_policy],
)