Search code examples
Tags: linux, bash, shell, awk, terminal

Extract the logs that are logged within 2 timestamps


I wrote a script that extracts the lines logged within a specified time frame from a log file. It works, except that it only prints lines that themselves carry a timestamp inside the time frame. Lines that have no timestamp of their own, but were logged within the time frame (that is, they follow a timestamped line that is in range), are left out. Those lines must be printed too, but I don't know how to achieve this.

Below is the log file:

[17:02:12:161][01-03-2024]some log info here:
step1
step2
step3
[17:02:12:163][01-03-2024]some log here
a
b
c
[17:02:12:185][01-03-2024]Time taken   : 11

start timestamp: [17:02:12:161][01-03-2024] end timestamp: [17:02:12:163][01-03-2024]

With these two timestamps I also want the lines a, b and c to be printed, since they were logged under the end timestamp.

Please note that I have no authority to change this timestamp format; I know it is not a standard format.

Below is the script

#!/bin/bash

# Validate that a string has the full '[HH:MM:SS:SSS][DD-MM-YYYY]' shape.
# Only the digit layout is checked, not whether the values are a real time/date.
# Arguments: $1 - candidate timestamp
# Outputs:   a two-line explanation on stdout when the candidate is rejected
# Exits:     ends the script with status 1 on mismatch; returns 0 on a match
timestamp_pattern_checker() {
    local candidate=$1
    local full_stamp_re='^\[([0-9]{2}:[0-9]{2}:[0-9]{2}:[0-9]{3})\]\[([0-9]{2}-[0-9]{2}-[0-9]{4})\]$'

    # Happy path first: a well-formed timestamp needs no further handling.
    if [[ $candidate =~ $full_stamp_re ]]; then
        return 0
    fi

    printf '%s\n' \
        "Invalid Timestamp pattern" \
        "Timestamps should be in the format '[HH:MM:SS:SSS][DD-MM-YYYY]' or HH"
    exit 1
}

# Convert an hour of the day into the time half of a log timestamp.
# Arguments: $1 - hour as decimal digits, with or without a leading zero
#                 (for example 7, 07 or 23)
# Outputs:   "[HH:00:00:000]" on stdout, without a trailing newline
convert_hours_to_timestamp() {
    local hour=$1
    # Force base 10: printf's %d reads a leading zero as octal, so "08" and
    # "09" would be rejected as invalid octal numbers and printed as 00.
    printf '[%02d:00:00:000]' "$((10#${hour:-0}))"
}

# Check if correct number of arguments are passed
if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <log_file_name> <start_timestamp> <end_timestamp>"
    echo "Timestamps should be in the format '[HH:MM:SS:SSS][DD-MM-YYYY]' or as integers representing hours (00 to 23)"
    exit 1
fi

log_file=$1
start_input=$2
end_input=$3

# Check if log file exists
if [[ ! -f $log_file ]]; then
    echo "File not found: $log_file"
    exit 1
fi

# Parallel arrays: start_timestamps[i] .. end_timestamps[i] is one time window.
start_timestamps=()
end_timestamps=()

# Determine if inputs are hours or full timestamps
if [[ $start_input =~ ^[0-9]{2}$ && $end_input =~ ^[0-9]{2}$ ]]; then
    # Force base 10 before any arithmetic: with a leading zero bash would read
    # "08" and "09" as invalid octal numbers and the range check would fail.
    start_hour=$((10#$start_input))
    end_hour=$((10#$end_input))

    # Two digits can never be negative, so only the upper bound needs a check.
    if (( start_hour > 23 || end_hour > 23 )); then
        echo "Error: Hours must be between 00 and 23."
        exit 1
    fi
    if (( start_hour > end_hour )); then
        echo "Error: start hour must be less than or equal to end hour."
        exit 1
    fi

    # The decimal value is passed on so the helper never sees a leading zero.
    start_timestamp=$(convert_hours_to_timestamp "$start_hour")
    end_timestamp=$(convert_hours_to_timestamp "$end_hour")

    # Build one window per distinct date in the log. Dates are taken only from
    # lines that start with a "[time][date]" prefix, and are read line by line
    # so that no word splitting or globbing is applied to them.
    while IFS= read -r date; do
        start_timestamps+=("${start_timestamp}[$date]")
        end_timestamps+=("${end_timestamp}[$date]")
    done < <(awk -F'[][]' '/^\[[0-9:]+\]\[[0-9-]+\]/ { print $4 }' "$log_file" | sort -u)
else
    start_timestamp=$start_input
    end_timestamp=$end_input
    timestamp_pattern_checker "$start_timestamp"
    timestamp_pattern_checker "$end_timestamp"
    start_timestamps=("$start_timestamp")
    end_timestamps=("$end_timestamp")
fi

# Print every timestamped log line that falls inside one of the requested
# windows (both ends inclusive). starts/ends carry the space-joined parallel
# arrays start_timestamps/end_timestamps. Lines that do not begin with a
# timestamp are never printed by this filter.
awk -v starts="${start_timestamps[*]}" -v ends="${end_timestamps[*]}" '
  # Turn "[HH:MM:SS:mmm][DD-MM-YYYY]" into "YYYY-MM-DDTHH:MM:SS.mmm".
  # Year-first ordering makes plain string comparison chronological; a
  # day-first key would sort 02-01-2024 after 01-03-2024.
  function parsedate(stamp,    f) {
    # The stamp begins with "[", so f[1] is the empty text before it:
    # f[2..5] hold HH MM SS mmm and f[6..8] hold DD MM YYYY.
    split(stamp, f, /[]:[-]+/)
    return f[8] "-" f[7] "-" f[6] "T" f[2] ":" f[3] ":" f[4] "." f[5]
  }
  BEGIN {
    window_count = split(starts, start_arr, " ")
    split(ends, end_arr, " ")
    for (i = 1; i <= window_count; i++) {
      st[i] = parsedate(start_arr[i])
      et[i] = parsedate(end_arr[i])
    }
    log_count = 0
  }
  # Only lines carrying a leading "[time][date]" prefix have a usable key.
  /^\[[0-9:]+\]\[[0-9-]+\]/ {
    p = parsedate($0)
    for (i = 1; i <= window_count; i++) {
      if (p >= st[i] && p <= et[i]) {
        print $0
        log_count = 1
        break   # one match is enough; never print a line twice
      }
    }
  }
  END {
    if (log_count == 0) {
      print "Nothing was logged at this given time frame"
    }
  }' "$log_file"

Solution

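  • I finally figured it out with the help of everyone here who tried; I learned a lot, so thank you for reaching out to help me. The script that works for me is below. The key change is in the awk block: whether a line is in range is only re-evaluated on lines that start with a timestamp, so lines without a timestamp inherit the decision made for the most recent timestamped line above them. Note that this version takes only the log file as an argument and prompts for the start and end timestamps.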

    #!/bin/bash
    
    # Reject any argument that is not shaped like '[HH:MM:SS:SSS][DD-MM-YYYY]'.
    # Only the digit layout is checked, not whether the values form a real date.
    # Arguments: $1 - candidate timestamp
    # Outputs:   a two-line explanation on stdout when the candidate is rejected
    # Exits:     ends the script with status 1 on mismatch; returns 0 on a match
    timestamp_pattern_checker() {
        local stamp_re='^\[([0-9]{2}:[0-9]{2}:[0-9]{2}:[0-9]{3})\]\[([0-9]{2}-[0-9]{2}-[0-9]{4})\]$'
        # Short-circuit on a match so the failure path below reads top to bottom.
        [[ $1 =~ $stamp_re ]] && return 0
        printf '%s\n' "Invalid Timestamp pattern"
        printf '%s\n' "Timestamps should be in the format '[HH:MM:SS:SSS][DD-MM-YYYY]' or HH"
        exit 1
    }
    
    # Function to convert hours to timestamp format
    # Input:  hour as decimal digits, with or without a leading zero (7, 07, 23)
    # Output: "[HH:00:00:000]" on stdout, without a trailing newline
    convert_hours_to_timestamp() {
        local hour=$1
        # Force base 10: printf's %d reads a leading zero as octal, so "08" and
        # "09" would be rejected as invalid octal numbers and printed as 00.
        printf '[%02d:00:00:000]' "$((10#${hour:-0}))"
    }
    
    # Wrap a date string in square brackets, giving the date half of a log
    # timestamp, e.g. 01-03-2024 becomes [01-03-2024].
    # Input:  date (string); it is emitted verbatim, no validation is done
    # Output: the bracketed date on stdout, without a trailing newline
    convert_date_to_timestamp() {
        local raw_date=$1
        local bracketed="[${raw_date}]"
        printf '%s' "$bracketed"
    }
    
    # Main script starts here
    if [[ $# -ne 1 ]]; then
        echo "Usage: $0 <log_file>"
        exit 1
    fi
    
    log_file=$1
    
    # Check if log file exists
    if [[ ! -f $log_file ]]; then
        echo "File not found: $log_file"
        exit 1
    fi
    
    # -r keeps any backslash in the typed input literal.
    read -r -p "Enter start timestamp (HH or [HH:MM:SS:SSS][DD-MM-YYYY]): " start_input
    read -r -p "Enter end timestamp (HH or [HH:MM:SS:SSS][DD-MM-YYYY]): " end_input
    
    # Parallel arrays: start_timestamps[i] .. end_timestamps[i] is one time window.
    start_timestamps=()
    end_timestamps=()
    
    # Determine if inputs are hours or full timestamps
    if [[ $start_input =~ ^[0-9]{2}$ && $end_input =~ ^[0-9]{2}$ ]]; then
        # Force base 10 before any arithmetic: with a leading zero bash would read
        # "08" and "09" as invalid octal numbers and the range check would fail.
        start_hour=$((10#$start_input))
        end_hour=$((10#$end_input))
    
        # Two digits can never be negative, so only the upper bound needs a check.
        if (( start_hour > 23 || end_hour > 23 )); then
            echo "Error: Hours must be between 00 and 23."
            exit 1
        fi
        if (( start_hour > end_hour )); then
            echo "Error: start hour must be less than or equal to end hour."
            exit 1
        fi
    
        # The decimal value is passed on so the helper never sees a leading zero.
        start_timestamp=$(convert_hours_to_timestamp "$start_hour")
        end_timestamp=$(convert_hours_to_timestamp "$end_hour")
    
        # Build one window per distinct date in the log. Dates are taken only from
        # lines that start with a "[time][date]" prefix, and are read line by line
        # so that no word splitting or globbing is applied to them.
        while IFS= read -r date; do
            start_timestamps+=("${start_timestamp}[$date]")
            end_timestamps+=("${end_timestamp}[$date]")
        done < <(awk -F'[][]' '/^\[[0-9:]+\]\[[0-9-]+\]/ { print $4 }' "$log_file" | sort -u)
    else
        timestamp_pattern_checker "$start_input"
        timestamp_pattern_checker "$end_input"
        start_timestamps=("$start_input")
        end_timestamps=("$end_input")
    fi
    
    # Print every log line that belongs to one of the requested windows (both
    # ends inclusive). A line without a leading timestamp belongs to the most
    # recent timestamped line above it. starts/ends carry the space-joined
    # parallel arrays start_timestamps/end_timestamps.
    awk -v starts="${start_timestamps[*]}" -v ends="${end_timestamps[*]}" '
      # Turn "[HH:MM:SS:mmm][DD-MM-YYYY]" into the ISO 8601 style key
      # "YYYY-MM-DDTHH:MM:SS.mmm", which sorts chronologically as a string.
      function parsedate(stamp,    f) {
        # The stamp begins with "[", so f[1] is the empty text before it:
        # f[2..5] hold HH MM SS mmm and f[6..8] hold DD MM YYYY.
        split(stamp, f, /[]:[-]+/)
        return f[8] "-" f[7] "-" f[6] "T" f[2] ":" f[3] ":" f[4] "." f[5]
      }
      BEGIN {
        # The timestamp pattern is spelled out digit by digit rather than with
        # {n} interval expressions, which some awk implementations lack.
        d2 = "[0-9][0-9]"
        d3 = d2 "[0-9]"
        stamp_re = "^\\[" d2 ":" d2 ":" d2 ":" d3 "\\]\\[" d2 "-" d2 "-" d2 d2 "\\]"

        window_count = split(starts, start_arr, " ")
        split(ends, end_arr, " ")
        for (i = 1; i <= window_count; i++) {
          st[i] = parsedate(start_arr[i])
          et[i] = parsedate(end_arr[i])
        }
        log_count = 0
        in_range = 0
      }
      {
        # Re-evaluate the verdict only on timestamped lines; every other line
        # keeps the verdict of the timestamped line it follows.
        if (match($0, stamp_re)) {
          p = parsedate(substr($0, RSTART, RLENGTH))
          in_range = 0
          for (i = 1; i <= window_count; i++) {
            if (p >= st[i] && p <= et[i]) {
              in_range = 1
              break
            }
          }
        }
        if (in_range) {
          print
          log_count = 1
        }
      }
      END {
        if (log_count == 0) {
          print "Nothing was logged at this given time frame"
        }
      }
    ' "$log_file"