Search code examples
pythonjsonpandasamazon-cloudtrailjson-normalize

How to read AWS CloudTrail JSON Logs into a pandas dataframe


I have a problem as I am loading my data to pandas using Jupyterlab running with Anaconda3 as my VM suddenly went down. After it was up, I found that my code doesn't work anymore for some reason. Here is my code:

awsc = pd.DataFrame()
json_pattern = os.path.join('logs_old/AWSCloudtrailLog/','*')
file_list = glob.glob(json_pattern)
for file in file_list:
    data = pd.read_json(file, lines=True)
    awsc = awsc.append(data, ignore_index = True)
awsc = pd.concat([awsc, pd.json_normalize(awsc['userIdentity'])], axis=1).drop('userIdentity', 1)
awsc.rename(columns={'type':'userIdentity_type',
                     'principalId':'userIdentity_principalId',
                     'arn':'userIdentity_arn',
                     'accountId':'userIdentity_accountId',
                     'accessKeyId':'userIdentity_accessKeyId',
                     'userName':'userIdentity_userName',}, inplace=True)

When I run the code it gave me the KeyError message like this:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2888             try:
-> 2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'userIdentity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-9-efd1d2e600a5> in <module>
      1 # unpack nested json
      2 
----> 3 awsc = pd.concat([awsc, pd.json_normalize(awsc['userIdentity'])], axis=1).drop('userIdentity', 1)
      4 awsc.rename(columns={'type':'userIdentity_type',
      5                      'principalId':'userIdentity_principalId',

~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]

~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:
-> 2891                 raise KeyError(key) from err
   2892 
   2893         if tolerance is not None:

KeyError: 'userIdentity'

The output of the dataframe awsc as i run print(awss.info()) or print(awsc.info()):

 <class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrameNone

Any solution to solve this issue? Does the problem come from the Pandas or Anaconda?


Solution

  • Using Code from OP

    • The method for creating the dataframe is not correct, which is way awsc is empty.
    • Without seeing a file, it's not possible to know if pd.read_json(file, lines=True) is the correct method to use.
    • pd.json_normalize(awsc['userIdentity']) will work on a column of dicts. It's more than likely the column is strings though.
      • If the dicts are str type, use ast.literal_eval to convert them to dict type.
    import pandas as pd
    from ast import literal_eval
    
    # crate a list to add dataframes to
    awsc_list = list()
    
    # iterate through the list of and append them to awsc_list
    for file in file_list:
        awsc_list.append(pd.read_json(file, lines=True))
        
    # concat the files into a single dataframe
    awsc = pd.concat(awsc_list).reset_index(drop=True)
    
    # convert the userIdentity column to dict type, if it contains str type
    awsc.userIdentity = awsc.userIdentity.apply(literal_eval)
    
    # normalize userIdentity
    normalized = pd.json_normalize(awsc['userIdentity'], sep='_')
    
    # join awsc with normalized and drop the userIdentity column
    awsc = awsc.join(normalized).drop('userIdentity', 1)
    
    # rename the columns
    awsc.rename(columns={'type':'userIdentity_type',
                         'principalId':'userIdentity_principalId',
                         'arn':'userIdentity_arn',
                         'accountId':'userIdentity_accountId',
                         'accessKeyId':'userIdentity_accessKeyId',
                         'userName':'userIdentity_userName',}, inplace=True)
    

    New Code with Sample Data

    import json
    import pandas as pd
    
    # crate a list to add dataframes to
    awsc_list = list()
    
    # list of files
    files_list = ['test.json', 'test2.json']
    
    # read the filess
    for file in files_list:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        
        # normalize the file and append it to the list of dataframe
        awsc_list.append(pd.json_normalize(data, 'Records', sep='_'))
        
    # concat the files into a single dataframe
    awsc = pd.concat(awsc_list).reset_index(drop=True)
    
    # display(awsc)
      eventVersion             eventTime        eventSource       eventName  awsRegion  sourceIPAddress                                                                                 userAgent userIdentity_type userIdentity_principalId                      userIdentity_arn userIdentity_accessKeyId userIdentity_accountId userIdentity_userName requestParameters_instancesSet_items                                                                                                 responseElements_instancesSet_items requestParameters_force userIdentity_sessionContext_attributes_mfaAuthenticated userIdentity_sessionContext_attributes_creationDate requestParameters_keyName responseElements_keyName                              responseElements_keyFingerprint responseElements_keyMaterial
    0          1.0  2014-03-06T21:22:54Z  ec2.amazonaws.com  StartInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]    [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 0, 'name': 'pending'}, 'previousState': {'code': 80, 'name': 'stopped'}}]                     NaN                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN
    1          1.0  2014-03-06T21:01:59Z  ec2.amazonaws.com   StopInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]  [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 64, 'name': 'stopping'}, 'previousState': {'code': 16, 'name': 'running'}}]                   False                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN
    2          1.0  2014-03-06T17:10:34Z  ec2.amazonaws.com   CreateKeyPair  us-east-2     72.21.198.64  EC2ConsoleBackend, aws-sdk-java/Linux/x.xx.fleetxen Java_HotSpot(TM)_64-Bit_Server_VM/xx           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice                                  NaN                                                                                                                                 NaN                     NaN                                                   false                                2014-03-06T15:15:06Z                 mykeypair                mykeypair  30:1d:46:d0:5b:ad:7e:1b:b6:70:62:8b:ff:38:b5:e9:ab:5d:b8:21       <sensitiveDataRemoved>
    3          1.0  2014-03-06T21:22:54Z  ec2.amazonaws.com  StartInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]    [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 0, 'name': 'pending'}, 'previousState': {'code': 80, 'name': 'stopped'}}]                     NaN                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN
    

    Sample Data

    test.json

    • List of JSONs
    [{
            "Records": [{
                    "eventVersion": "1.0",
                    "userIdentity": {
                        "type": "IAMUser",
                        "principalId": "EX_PRINCIPAL_ID",
                        "arn": "arn:aws:iam::123456789012:user/Alice",
                        "accessKeyId": "EXAMPLE_KEY_ID",
                        "accountId": "123456789012",
                        "userName": "Alice"
                    },
                    "eventTime": "2014-03-06T21:22:54Z",
                    "eventSource": "ec2.amazonaws.com",
                    "eventName": "StartInstances",
                    "awsRegion": "us-east-2",
                    "sourceIPAddress": "205.251.233.176",
                    "userAgent": "ec2-api-tools 1.6.12.2",
                    "requestParameters": {
                        "instancesSet": {
                            "items": [{
                                    "instanceId": "i-ebeaf9e2"
                                }
                            ]
                        }
                    },
                    "responseElements": {
                        "instancesSet": {
                            "items": [{
                                    "instanceId": "i-ebeaf9e2",
                                    "currentState": {
                                        "code": 0,
                                        "name": "pending"
                                    },
                                    "previousState": {
                                        "code": 80,
                                        "name": "stopped"
                                    }
                                }
                            ]
                        }
                    }
                }
            ]
        }, {
            "Records": [{
                    "eventVersion": "1.0",
                    "userIdentity": {
                        "type": "IAMUser",
                        "principalId": "EX_PRINCIPAL_ID",
                        "arn": "arn:aws:iam::123456789012:user/Alice",
                        "accountId": "123456789012",
                        "accessKeyId": "EXAMPLE_KEY_ID",
                        "userName": "Alice"
                    },
                    "eventTime": "2014-03-06T21:01:59Z",
                    "eventSource": "ec2.amazonaws.com",
                    "eventName": "StopInstances",
                    "awsRegion": "us-east-2",
                    "sourceIPAddress": "205.251.233.176",
                    "userAgent": "ec2-api-tools 1.6.12.2",
                    "requestParameters": {
                        "instancesSet": {
                            "items": [{
                                    "instanceId": "i-ebeaf9e2"
                                }
                            ]
                        },
                        "force": false
                    },
                    "responseElements": {
                        "instancesSet": {
                            "items": [{
                                    "instanceId": "i-ebeaf9e2",
                                    "currentState": {
                                        "code": 64,
                                        "name": "stopping"
                                    },
                                    "previousState": {
                                        "code": 16,
                                        "name": "running"
                                    }
                                }
                            ]
                        }
                    }
                }
            ]
        }, {
            "Records": [{
                    "eventVersion": "1.0",
                    "userIdentity": {
                        "type": "IAMUser",
                        "principalId": "EX_PRINCIPAL_ID",
                        "arn": "arn:aws:iam::123456789012:user/Alice",
                        "accountId": "123456789012",
                        "accessKeyId": "EXAMPLE_KEY_ID",
                        "userName": "Alice",
                        "sessionContext": {
                            "attributes": {
                                "mfaAuthenticated": "false",
                                "creationDate": "2014-03-06T15:15:06Z"
                            }
                        }
                    },
                    "eventTime": "2014-03-06T17:10:34Z",
                    "eventSource": "ec2.amazonaws.com",
                    "eventName": "CreateKeyPair",
                    "awsRegion": "us-east-2",
                    "sourceIPAddress": "72.21.198.64",
                    "userAgent": "EC2ConsoleBackend, aws-sdk-java/Linux/x.xx.fleetxen Java_HotSpot(TM)_64-Bit_Server_VM/xx",
                    "requestParameters": {
                        "keyName": "mykeypair"
                    },
                    "responseElements": {
                        "keyName": "mykeypair",
                        "keyFingerprint": "30:1d:46:d0:5b:ad:7e:1b:b6:70:62:8b:ff:38:b5:e9:ab:5d:b8:21",
                        "keyMaterial": "\u003csensitiveDataRemoved\u003e"
                    }
                }
            ]
        }
    ]
    
    

    test2.json

    • One JSON
    {
        "Records": [{
                "eventVersion": "1.0",
                "userIdentity": {
                    "type": "IAMUser",
                    "principalId": "EX_PRINCIPAL_ID",
                    "arn": "arn:aws:iam::123456789012:user/Alice",
                    "accessKeyId": "EXAMPLE_KEY_ID",
                    "accountId": "123456789012",
                    "userName": "Alice"
                },
                "eventTime": "2014-03-06T21:22:54Z",
                "eventSource": "ec2.amazonaws.com",
                "eventName": "StartInstances",
                "awsRegion": "us-east-2",
                "sourceIPAddress": "205.251.233.176",
                "userAgent": "ec2-api-tools 1.6.12.2",
                "requestParameters": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2"
                            }
                        ]
                    }
                },
                "responseElements": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2",
                                "currentState": {
                                    "code": 0,
                                    "name": "pending"
                                },
                                "previousState": {
                                    "code": 80,
                                    "name": "stopped"
                                }
                            }
                        ]
                    }
                }
            }
        ]
    }