Search code examples
pythonhiveapache-spark-sqlaws-glue

AWS Glue: How to expand nested Hive struct to Dict?


I'm trying to expand field mappings in a Table mapped by my AWS Glue crawler to a nested dictionary in Python. But, I can't find any Spark/Hive parsers to deserialize the

var_type = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'

string located in table_schema['Table']['StorageDescriptor']['Columns'] to a Python dict.

How to dump the table definition in Glue:

import boto3
client = boto3.client('glue')
client.get_table(DatabaseName=selected_db, Name=selected_table)

Response:

table_schema = {'Table': {'Name': 'asdfasdf',
  'DatabaseName': 'asdfasdf',
  'Owner': 'owner',
  'CreateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
  'UpdateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
  'LastAccessTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
  'Retention': 0,
  'StorageDescriptor': {'Columns': [{'Name': 'version', 'Type': 'int'},
    {'Name': 'payload',
     'Type': 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'},
    {'Name': 'origin', 'Type': 'string'}],
   'Location': 's3://asdfasdf/',
   'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
   'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
   'Compressed': False,
   'NumberOfBuckets': -1,
   'SerdeInfo': {'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe',
    'Parameters': {'paths': 'origin,payload,version'}},
   'BucketColumns': [],
   'SortColumns': [],
   'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
    'CrawlerSchemaSerializerVersion': '1.0',
    'UPDATED_BY_CRAWLER': 'asdfasdf',
    'averageRecordSize': '799',
    'classification': 'json',
    'compressionType': 'none',
    'objectCount': '94',
    'recordCount': '92171',
    'sizeKey': '74221058',
    'typeOfData': 'file'},
   'StoredAsSubDirectories': False},
  'PartitionKeys': [{'Name': 'partition_0', 'Type': 'string'},
   {'Name': 'partition_1', 'Type': 'string'},
   {'Name': 'partition_2', 'Type': 'string'}],
  'TableType': 'EXTERNAL_TABLE',
  'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
   'CrawlerSchemaSerializerVersion': '1.0',
   'UPDATED_BY_CRAWLER': 'asdfasdf',
   'averageRecordSize': '799',
   'classification': 'json',
   'compressionType': 'none',
   'objectCount': '94',
   'recordCount': '92171',
   'sizeKey': '74221058',
   'typeOfData': 'file'},
  'CreatedBy': 'arn:aws:sts::asdfasdf'},
 'ResponseMetadata': {'RequestId': 'asdfasdf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 01 Aug 2019 16:23:06 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '3471',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'asdfasdf'},
  'RetryAttempts': 0}}

Goal would be a python dictionary and values for each field type, vs. the embedded string. E.g.

expand_function('struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'})

returns

{
 'loc_lat':'double', 
 'service_handler':'string',
 'ip_address':'string',
 'device':'bigint',
 'source':{'id':'string',
           'contacts': {
               'admin': {
                   'email':'string',
                   'name':'string'
               },
           'name':'string'
           },
 'loc_name':'string'
}

Thanks!


Solution

  • The accepted answer doesn't handle arrays. This solution does:

    import json
    import re
    
    
    def _hive_struct_to_json(hive_str):
        """
        Expands embedded Hive struct strings to Python dictionaries
        Args:
            Hive struct format as string
        Returns
            JSON object
        """
        r = re.compile(r'(.*?)(struct<|array<|[:,>])(.*)')
        root = dict()
    
        to_parse = hive_str
        parents = []
        curr_elem = root
    
        key = None
        while to_parse:
            left, operator, to_parse = r.match(to_parse).groups()
    
            if operator == 'struct<' or operator == 'array<':
                parents.append(curr_elem)
                new_elem = dict() if operator == 'struct<' else list()
                if key:
                    curr_elem[key] = new_elem
                    curr_elem = new_elem
                elif isinstance(curr_elem, list):
                    curr_elem.append(new_elem)
                    curr_elem = new_elem
                key = None
            elif operator == ':':
                key = left
            elif operator == ',' or operator == '>':
                if left:
                    if isinstance(curr_elem, dict):
                        curr_elem[key] = left
                    elif isinstance(curr_elem, list):
                        curr_elem.append(left)
    
                if operator == '>':
                    curr_elem = parents.pop()
    
        return root
    
    
    hive_str = '''
        struct<
            loc_lat:double,
            service_handler:string,
            ip_address:string,
            device:bigint,
            source:struct<
                id:string,
                contacts:struct<
                    admin:struct<
                        email:string,
                        name:array<string>
                    >
                >,
                name:string
            >,
            loc_name:string,
            tags:array<
                struct<
                    key:string,
                    value:string
                >
            >
        >
    '''
    
    hive_str = re.sub(r'[\s]+', '', hive_str).strip()
    
    print(hive_str)
    print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
    

    Prints:

    struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:array<string>>>,name:string>,loc_name:string,tags:array<struct<key:string,value:string>>>
    
    {
      "loc_lat": "double",
      "service_handler": "string",
      "ip_address": "string",
      "device": "bigint",
      "source": {
        "id": "string",
        "contacts": {
          "admin": {
            "email": "string",
            "name": [
              "string"
            ]
          }
        },
        "name": "string"
      },
      "loc_name": "string",
      "tags": [
        {
          "key": "string",
          "value": "string"
        }
      ]
    }