Search code examples
arraysjsonpowershellkey-value

Extracting the key-value pair in loop from text file using Powershell Script


I am trying to capture the specific key value pairs from a text file having other data as well than key:value pattern using powershell. Can anyone help me out? I have tried the code so far with the help of internet as I am newbie to Powershell. Any help will be appreciated.

Source Text sample:

ResourceGroupName    : DataLake-Gen2
DataFactoryName      : dna-production-gen2
TriggerName          : TRG_RP_Optimizely_Import
TriggerRunId         : 08586050680855766354964895535CU57
TriggerType          : ScheduleTrigger
TriggerRunTimestamp  : 8/4/2020 10:59:59 AM
Status               : Succeeded
TriggeredPipelines   : {[PL_DATA_OPTIMIZELY_MART, 1f89fc3a-27b5-442e-9685-a444f751f607]}
Message              :
Properties           : {[TriggerTime, 8/4/2020 10:59:59 AM], [ScheduleTime, 8/4/2020 11:00:00 AM], [triggerObject, {
                         "name": "Trigger_421B8CAF-BE66-42CF-83DA-E3028693F304",
                         "startTime": "2020-08-04T10:59:59.8982174Z",
                         "endTime": "2020-08-04T10:59:59.8982174Z",
                         "scheduledTime": "2020-08-04T11:00:00Z",
                         "trackingId": "fdf58bb2-ecd5-4fe9-b2ef-d94fd71729c3",
                         "clientTrackingId": "08586050680855766354964895535CU57",
                         "originHistoryName": "08586050680855766354964895535CU57",
                         "code": "OK",
                         "status": "Succeeded"
                       }]}
AdditionalProperties : {[groupId, 08586050680855766354964895535CU57]}

ResourceGroupName    : DataLake-Gen2
DataFactoryName      : dna-production-gen2
TriggerName          : TRG_RP_Optimizely_Import
TriggerRunId         : 08586049816852049265494275953CU24
TriggerType          : ScheduleTrigger
TriggerRunTimestamp  : 8/5/2020 11:00:00 AM
Status               : Succeeded
TriggeredPipelines   : {[PL_DATA_OPTIMIZELY_MART, dd6b5beb-b7f6-44ef-8903-34c845003dfc]}
Message              :
Properties           : {[TriggerTime, 8/5/2020 11:00:00 AM], [ScheduleTime, 8/5/2020 11:00:00 AM], [triggerObject, {
                         "name": "Trigger_421B8CAF-BE66-42CF-83DA-E3028693F304",
                         "startTime": "2020-08-05T11:00:00.2662252Z",
                         "endTime": "2020-08-05T11:00:00.2662252Z",
                         "scheduledTime": "2020-08-05T11:00:00Z",
                         "trackingId": "ba223bbd-8cb2-40e8-951f-87130dbbbfe8",
                         "clientTrackingId": "08586049816852049265494275953CU24",
                         "originHistoryName": "08586049816852049265494275953CU24",
                         "code": "OK",
                         "status": "Succeeded"
                       }]}
AdditionalProperties : {[groupId, 08586049816852049265494275953CU24]}

Code used so far:

[CmdletBinding()]
Param(
    [Parameter(Mandatory=$true)]
    $path
)

function Format-LogFile {
    [CmdletBinding()]
    param (
        $log
    )

    $targets = 'TriggerRunTimestamp','ResourceGroupName', 'DataFactoryName', 'TriggerName', 'TriggerRunId', 'TriggerType', 'Status'
    [System.Collections.ArrayList]$lines = @()
    $log | ForEach-Object {
        $line = $_
        $targets | ForEach-Object {
            if ($line.Contains($_) -and $line -notin $lines) {
                $lines.Add($line) | Out-Null
            }
        }
    }
#    $lines[0] = $lines[0].TrimStart("JournalSMS  ")
#    return $lines
    
}


function Get-LogFields {
    [CmdletBinding()]
    param (

        $lines
    )
    $targets = 'TriggerRunTimestamp','ResourceGroupName', 'DataFactoryName', 'TriggerName', 'TriggerRunId', 'TriggerType', 'Status'
    $matchs = $lines | Select-String -Pattern "(?<=(\s||\b))[A-Z][\s\[A-Z]/]+?\s*?\:\s+[^\s\b]+" -AllMatches 
    
    $dict = @{}
    $matchs.Matches | ForEach-Object {
        $val = $_.Value
        $arr = $val.Split("")
        if ($arr[0].Trim() -in $targets)  {
            $dict.Add($arr[0].Trim(), $arr[1].Trim())
        } 
    }
    
    return $dict
}


$log = get-content 'D:\\output.txt'
$path = "D:\\output.txt"
$info = Get-ChildItem -File -Recurse -Path $path | ForEach-Object {
    $log = Get-Content $_.FullName -Encoding Default
    $lines = Format-LogFile $log
    $dict = Get-LogFields $lines
    $values = New-Object -TypeName psobject -Property $dict
    return $values
} 



# $info |
# Select-Object   @{name='TriggerRunTimestamp';expression={$_.'TriggerRunTimestamp'}},
#                 @{name='ResourceGroupName';expression={$_."ResourceGroupName"}},
#                 @{name='DataFactoryName';expression={$_.'DataFactoryName'}},
#                 @{name='TriggerName';expression={$_.'TriggerName'}},
#                 @{name='TriggerRunId';expression={$_.'TriggerRunId'}} 
#                  @{name='TriggerType';expression={$_.'TriggerType'}}
#                 @{name='Status';expression={$_.'Status'}}|
# Export-Csv -Encoding UTF8 -Path .\result.csv -Force


$info |
Select-Object   'TriggerRunTimestamp', "ResourceGroupName", 'DataFactoryName',
                'TriggerName', 'TriggerRunId', 'TriggerType', 'Status' |
ConvertTo-CSV -Delimiter ";" -NoTypeInformation |
% {$_.Replace('"','')} |
Set-Content -Path 'D:\\result.csv' -Force
# Export-Csv -Encoding UTF8 -Path .\result.csv -Force

Expected Output:

TriggerRunTimestamp ResourceGroupName DataFactoryName TriggerName TriggerRunId TriggerType Status TriggeredPipeline Properties_TriggerTime Properties_ScheduleTime triggerObject_name triggerObject_startTime triggerObject_endTime triggerObject_scheduledTime 8/4/2020 10:59 DataLake-Gen2 dna-production-gen2 TRG_RP_Optimizely_Import 08586050680855766354964895535CU57 ScheduleTrigger Succeeded PL_DATA_OPTIMIZELY_MART 8/4/2020 10:59 8/4/2020 11:00 Trigger_421B8CAF-BE66-42CF-83DA-E3028693F304 2020-08-04T10:59:59.8982174Z 2020-08-04T10:59:59.8982174Z 2020-08-04T11:00:00Z

NOTE: Bold values are the column headers and values are in plain text.

Help Much Needed !!

Thanks


Solution

  • The problematic part in this log file is on property Properties, which is a JSON string. Luckily, you don't want any of this in your output CSV file, so the below should work:

    # read the file as a single, multiline string using the -Raw switch
    $log = Get-Content -Path 'D:\Test\the_input_log.txt' -Raw
    # split the content into several blocks on the empty line, skip blocks that do not contain text
    $result = ($log -split '(\r?\n){2,}' | Where-Object {$_ -match '\S'}) | ForEach-Object {
        # split the block to get only the part with the properties you are interested in
        # replace ' : ' into an equals sign (mind the extra spaces around the colon, otherwise
        # you will also replace the colons in the 'TriggerRunTimestamp' property.
    
        # use ConvertFrom-StringData cmdlet to create a Hashtable from this and convert that to a PsCustomObject
        # finally, use Select-Object to output a new PSObject with only the properties you need in the wanted order.
        [PsCustomObject](($_ -split 'TriggeredPipelines')[0] -replace ' : ', '=' | ConvertFrom-StringData)  |
        Select-Object 'TriggerRunTimestamp', 'ResourceGroupName', 'DataFactoryName', 'TriggerName', 'TriggerRunId', 'TriggerType', 'Status'
    }
    
    # output on screen
    $result | Format-Table -AutoSize
    
    # write to CSV file
    $result | Export-Csv -Path 'D:\Test\result.csv' -Encoding UTF8 -NoTypeInformation -Force
    

    I have added quite a few comments in the code for you to hopefully make it understandable what is going on in there.

    The resulting CSV file will contain quotes:

    "TriggerRunTimestamp","ResourceGroupName","DataFactoryName","TriggerName","TriggerRunId","TriggerType","Status"
    "8/4/2020 10:59:59 AM","DataLake-Gen2","dna-production-gen2","TRG_RP_Optimizely_Import","08586050680855766354964895535CU57","ScheduleTrigger","Succeeded"
    "8/5/2020 11:00:00 AM","DataLake-Gen2","dna-production-gen2","TRG_RP_Optimizely_Import","08586049816852049265494275953CU24","ScheduleTrigger","Succeeded"
    
    

    If you absolutely do not want quotes and you are using PowerShell version 7, you can add -UseQuotes AsNeeded to the Export-Csv cmdlet.

    For older PowerShell versions, you can use my function ConvertTo-CsvNoQuotes


    Edit

    As per your comment, you also need properties from the (what seemed to be JSON) elements, you would need a completely different approach.

    For the example you have given you can use:

    # read the file as a single, multiline string using the -Raw switch
    $log = Get-Content -Path 'D:\Test\the_input_log.txt' -Raw
    # split the content into several blocks on the empty line, skip blocks that do not contain text
    $result = ($log -split '(\r?\n){2,}' | Where-Object {$_ -match '\S'}) | ForEach-Object {
        # create a Hashtable to store the key/value properties we find looping over each line in the block
        $hash = @{}
        switch -Regex ($_.Trim() -split '\r?\n') {
            '^(\w+)\s+:\s*(.*)' { $key = $matches[1]; $hash[$key] = $matches[2] }     # found a key/value property
            '^\s+(\S.+)'        { if ($key) {$hash[$key] += ("`r`n"+ $matches[1])} }  # add to a multiline property
        }
        # test if the above actually was able to parse 'TriggeredPipelines'
        if (![string]::IsNullOrWhiteSpace($hash['TriggeredPipelines'])) {
        # remove the brackets from TriggeredPipelines
            $hash['TriggeredPipeline'] = ($hash['TriggeredPipelines'].Trim("{[]}") -split ',')[0]
        }
    
        # test if the above actually was able to parse 'Properties'
        if (![string]::IsNullOrWhiteSpace($hash['Properties'])) {
            # the 'Properties' property needs a bit more work:
            # 1) remove the surrounding brackets, split into the first line and a textblock with the rest of the properties
            $props = $hash['Properties'].Trim("{[ ]}") -split '\r?\n', 2
            # $props[0] is now "[TriggerTime, 8/4/2020 10:59:59 AM], [ScheduleTime, 8/4/2020 11:00:00 AM], [triggerObject, {"
            # parse the TriggerTime and ScheduleTime from that line and add them to the hash
            $temp  = ([regex]'(?i)TriggerTime,\s*([^\]]+)').Matches($props[0]).Groups[1].Value
            if (![string]::IsNullOrWhiteSpace($temp)) { $hash['Properties_TriggerTime'] = $temp }
    
            $temp = ([regex]'(?i)ScheduleTime,\s*([^\]]+)').Matches($props[0]).Groups[1].Value
            if (![string]::IsNullOrWhiteSpace($temp)) { $hash['Properties_ScheduleTime'] = $temp }
    
            if ($props.Count -eq 2) {
                # 2) surround $props[1] with curly brackets, so it will become valid JSON and convert from that
                $props = '{{{0}}}' -f $props[1] | ConvertFrom-Json
                # loop through the properties and add these to the hash with "TriggerObject_" prefix
                foreach($p in $props.PSObject.Properties.name) {
                    $hash["TriggerObject_$p"] = $props.$p
                }
            }
        }
    
        # final test to see if we have managed to capture anything
        # more strict but memory consuming would be 
        # if ($hash.Count -and ![string]::IsNullOrWhiteSpace(-join $hash.Values)) {..}
    
        if ($hash.Count) {
            # convert the completed hash into a PSObject and select the properties you need from it
            [PsCustomObject]$hash | Select-Object 'TriggerRunTimestamp', 'ResourceGroupName', 'DataFactoryName',
                                                  'TriggerName', 'TriggerRunId', 'TriggerType', 'Status', 
                                                  'TriggeredPipeline', 'Properties_TriggerTime', 'Properties_ScheduleTime',
                                                  'TriggerObject_name', 'TriggerObject_startTime', 
                                                  'TriggerObject_endTime', 'TriggerObject_scheduledTime'
        }
    }
    
    # output on screen (won't fit as Table in the console)
    $result
    
    # write to CSV file
    $result | Export-Csv -Path 'D:\Test\result.csv' -Encoding UTF8 -NoTypeInformation -Force
    

    The resulting CSV file will now look like

    "TriggerRunTimestamp","ResourceGroupName","DataFactoryName","TriggerName","TriggerRunId","TriggerType","Status","TriggeredPipeline","Properties_TriggerTime","Properties_ScheduleTime","TriggerObject_name","TriggerObject_startTime","TriggerObject_endTime","TriggerObject_scheduledTime"
    "8/4/2020 10:59:59 AM","DataLake-Gen2","dna-production-gen2","TRG_RP_Optimizely_Import","08586050680855766354964895535CU57","ScheduleTrigger","Succeeded","PL_DATA_OPTIMIZELY_MART","8/4/2020 10:59:59 AM","8/4/2020 11:00:00 AM","Trigger_421B8CAF-BE66-42CF-83DA-E3028693F304","2020-08-04T10:59:59.8982174Z","2020-08-04T10:59:59.8982174Z","2020-08-04T11:00:00Z"
    "8/5/2020 11:00:00 AM","DataLake-Gen2","dna-production-gen2","TRG_RP_Optimizely_Import","08586049816852049265494275953CU24","ScheduleTrigger","Succeeded","PL_DATA_OPTIMIZELY_MART","8/5/2020 11:00:00 AM","8/5/2020 11:00:00 AM","Trigger_421B8CAF-BE66-42CF-83DA-E3028693F304","2020-08-05T11:00:00.2662252Z","2020-08-05T11:00:00.2662252Z","2020-08-05T11:00:00Z"