Search code examples
python · amazon-web-services · pyspark · boto3 · aws-glue

Get tables from AWS Glue using boto3


I need to harvest table and column names from the AWS Glue crawler metadata catalogue. I am using boto3, but I always get exactly 100 tables back even though the database contains more. Setting NextToken doesn't help. Please help if possible.

The desired result is a list as follows:

lst = [table_one.col_one, table_one.col_two, table_two.col_one....table_n.col_n]

# First attempt: collect the names of the tables returned by Glue get_tables
# and print how many were collected.
# NOTE(review): the body below is not indented under the def as posted;
# the indentation was presumably lost when the snippet was pasted.
def harvest_aws_crawler():
glue = boto3.client('glue', region_name='')
# get_tables is called exactly once here, so only a single page of results
# is ever examined, no matter what NextToken the response carries.
response = glue.get_tables(DatabaseName='', NextToken = '')

# Response syntax:
# https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
crawler_list_tables = []

for tables in response['TableList']:
    # The while/break pair behaves like an `if`: the table name is appended
    # only when the response contains a NextToken, and the loop never fetches
    # the next page. When the response has no NextToken, nothing is appended.
    while (response.get('NextToken') is not None):
        crawler_list_tables.append(tables['Name'])
        break
print(len(crawler_list_tables))

harvest_aws_crawler()

Updated code; I still need each entry in the form table_name.column_name:

# Second attempt: build a list of "table.column" strings from the tables
# returned by Glue get_tables and print it.
# NOTE(review): the body below is not indented under the def as posted;
# the indentation was presumably lost when the snippet was pasted.
def harvest_aws_crawler():
glue = boto3.client('glue', region_name='')
next_token = ""

# Response syntax:
# https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
# NOTE(review): this call sits outside the while loop, so `response` is
# fetched once and never refreshed with the updated next_token.
response = glue.get_tables(DatabaseName='', NextToken = next_token)

tables_from_crawler = []
while True:
    table_list = response['TableList']
    for table_dict in table_list:
        table_name = table_dict['Name']
        
        # Append table_name + '.' + column_name for every column.
        # NOTE(review): table_name is the value of table_dict['Name']
        # (presumably a str), so subscripting it with 'StorageDescriptor'
        # looks wrong; the lookup was likely meant for table_dict.
        for columns in table_name['StorageDescriptor']['Columns']:
            tables_from_crawler.append(table_name + '.' + columns['Name'])
                
        #tables_from_crawler.append(table_name)
    # Because `response` never changes, this reads the same NextToken on every
    # pass: the loop exits only if the first response had no NextToken.
    next_token = response.get('NextToken')
    if next_token is None:
        break
print(tables_from_crawler)

harvest_aws_crawler()

Solution

  • Adding a nested loop over each table's columns did the trick to get the table.column result.

    # Harvest "table.column" names from the AWS Glue Data Catalog.
    client = boto3.client('glue', region_name='us-east-1')
    crawler_tables = []

    # get_tables returns its results one page at a time. The built-in
    # paginator follows NextToken until the last page, so no token
    # bookkeeping (or empty-string seed token) is needed here.
    paginator = client.get_paginator('get_tables')

    # DatabaseName is left blank as a placeholder; set it to the Glue
    # database whose tables should be listed.
    for page in paginator.paginate(DatabaseName=''):
        for table in page['TableList']:
            # StorageDescriptor and Columns are optional in the Glue table
            # structure; fall back to "no columns" instead of raising
            # KeyError on tables that lack them.
            columns = table.get('StorageDescriptor', {}).get('Columns', [])
            for column in columns:
                crawler_tables.append(table['Name'] + '.' + column['Name'])
    print(crawler_tables)