Search code examples
regexperlnon-ascii-characters

How can I translate non printable ascii chars to readable text with Perl


I'm trying to test some probes connected via USB on an Linux device using Perl 5.28 and Linux (Debian 8). When I read out a large file buffer of the probe, often none readable ASCII signs occur like \0 or \x02. I want to translate these signs into readable tagged text. I've written a small subroutine, but it seems to me a little bit clunky for large translation list to test every entry. Is there a better way to do that?

Example script

#!/usr/bin/env perl -w

# test-escape.pl --- test none readable chars

use strict;

sub escBuf() {
    my $buf = shift;
    my @numNul = $buf =~ /\0/g;
    my @numCR  = $buf =~ /\r/g;
    $buf =~ s/\r/\n/g;
    $buf =~ s/\x00/<NUL>/g;
    $buf =~ s/\x01/<SOH>/g;
    $buf =~ s/\x02/<STX>/g;
    $buf =~ s/\x03/<ETX>/g;
    $buf =~ s/\x04/<EOT>/g;
    $buf =~ s/\x05/<ENQ>/g;
    $buf =~ s/\x06/<ACK>/g;
    $buf =~ s/\x07/<BEL>/g;
    $buf =~ s/\x08/<BS>/g;
    $buf =~ s/\x0B/<VT>/g;
    $buf =~ s/\x0C/<FF>/g;
    $buf =~ s/\x0E/<SO>/g;
    $buf =~ s/\x0F/<SI>/g;
    my $numNUL = @numNul;
    my $numCR  = @numCR;
    return ($buf, $numNUL, $numCR);
}

# Buffer example
my $buffer = "\x01\r\x02This is a test with\r\n ".
    "sometimes qiurks \0 inside \x0C stuff \0 and regular \x03\r\x04";

# Translate output 
my ($out, $numNUL, $numCR) = &escBuf($buffer); 

# Not printed correctly due to \0
# print "ORG.TEXT: '$buffer' \n\n";

# Result of the translation
print "ESC.TEXT: '$out' \n\n";
print "NUM.NUL:  $numNUL\n";
print "NUM.CR:   $numCR\n\n";

Result

/usr/bin/env perl -w "test-escape.pl"
ESC.TEXT: '<SOH>
<STX>This is a test with

 sometimes qiurks <NUL> inside <FF> stuff <NUL> and regular <ETX>
<EOT>' 

NUM.NUL:  2
NUM.CR:   3

EDIT: Adopted code with proposed solution by ikegami

#!/usr/bin/env perl -w
# test-escape.pl --- test none readable chars

use strict;

# Dictionary of non printable signs
my %NONE_ASC_DICT = (
    "\x00" => "NUL", "\x01" => "SOH", "\x02" => "STX", "\x03" => "ETX",
    "\x04" => "EOT", "\x05" => "ENQ", "\x06" => "ACK", "\x07" => "BEL",
    "\x08" => "BS",
    # Essenital for parsing "\x09" => "TAB" "\x0a" => "LF" 
    "\x0b" => "VT",  "\x0c" => "FF", "\x0d" => "CR",
    "\x0e" => "SO",  "\x0f" => "SI",
    "\x10" => "DLE",
    "\x11" => "DC1", "\x12" => "DC2", "\x13" => "DC3", "\x14" => "DC4",
    "\x15" => "NAK", "\x16" => "SYN", "\x17" => "ETB", "\x18" => "CAN",
    "\x19" => "EM",  "\x1A" => "SUB", "\x1B" => "ESC", "\x1C" => "FS",
    "\x1D" => "GS",  "\x1E" => "RS",  "\x1F" => "US",  "\x7F" => "DEL",
);

# Mapping of the entries and corresponding predefined REGEX
my $NONE_ASC_CLASS = join "", map quotemeta, keys(%NONE_ASC_DICT);
my $NONE_ASC_REGEX = qr/([$NONE_ASC_CLASS])/;

# Translator subroutine
sub escBuffer() {
    my ($buf, $dict, $regex, $prefix, $suffix) = @_;

    # Set default sprefix suffix strings if not present
    $prefix   //= '<';  $suffix   //= '>';

    # Count the real quirks
    my @numNUL = $buf =~ /\0/g;
    my $numNUL = @numNUL;

    # Clean up mixed UNIX / DOS context
    $buf =~ s/\r\n/\n/g; 
    $buf =~ s/\r/\n/g;   # translate all remaining \r to \n 
    
    # Calc resulting number of lines
    my @numLF  = $buf =~ /\n/g; 
    my $numLF  = @numLF;

    # Translate the remaining non printables
    $buf =~ s/$regex/ $prefix.$dict->{$1}.$suffix /eg;

    # Result set translated buffer, count quirks, count lines
    return ($buf, $numNUL, $numCR);
}

# Buffer example
my $buffer = "\x01\r\x02This is a test with\r\n ".
    "sometimes qiurks \0 inside \x0C stuff \0 and regular \x03\r\x04";

# Translate output 
my ($out, $numNUL, $numLF) = &escBuffer
                               ($buffer, \%NONE_ASC_DICT, $NONE_ASC_REGEX); 

# Result of the translation
print "ESC.TEXT: '$out' \n\n";
print "NUM.NUL:  $numNUL\n";
print "NUM.LF:   $numLF\n\n";

Solution

  • Use a table.

    Setup:

    my %map = (
       "\x00" => "<NUL>",
       ...,
    );
    
    my $class = join "", map quotemeta, keys(%map);
    my $re = qr/([$class])/;
    

    Replacing:

    s/$re/$map{$1}/g