Search code examples
perl text comparison mismatch

Find mismatch on 2nd column between 2 text files


I have these 2 text files and I would like to find any mismatch in the 2nd column between them. A mismatch is determined by the types F, P and N, regardless of which lines they occur on. I have 1 F and 3 P in the first file, and 2 P, 1 N and 1 F in the second file. After the comparison, both files should have an equal number of occurrences of each type: 1 F, 3 P and 1 N.

Text1:

f0x11 F
f0x34 P
drx99 
dex67 P
edx43 P
sdx33 

Text2:

1 P
2 N
4 
5 F
6 
7 P

Expected Output:

Text 1 has missing type of N
Text 2 has missing type of P 

What I have tried so far does not produce desired output.

code:

use strict;
# Types seen in Text1 / Text2, stored as type => type (presence only, not a count).
my %ref_data;
my %ref_data2;
open my $fh, '<', 'Text1' or die "Could not open file to read:$!";
# Pass 1: remember every type (second whitespace-separated field) found in Text1.
while (<$fh>) {
  chomp;
    my ($res, $type) = split;
    # Lines with no second field leave $type undefined and are skipped.
    if (defined $type){
             $ref_data{$type} = "$type"; 
            }           
 }
our ($data,$data2);
open $fh, '<', 'Text2' or die "Could not open file to read:$!";
# Pass 2: for each typed line of Text2, look the same type up in the Text1 hash.
while (<$fh>) {
  chomp;
 my ($res, $type) = split;
    if (defined $type){
                $ref_data2{$type}= "$type";
                $data2= $ref_data2{$type};
                # $data is undef when Text1 never contained this type.
                $data = $ref_data{$type};
                # NOTE(review): this fires when the type occurs in Text2 but not
                # in Text1, so the message names the opposite file.
                print "File 2 has missing type of $type\n" unless $data;
         }
  }
# NOTE(review): $data and $data2 only hold the values from the last typed line
# of Text2, so this loop runs once over that single value. Because the hashes
# record presence rather than counts, a difference such as 3 P versus 2 P
# cannot be detected here.
foreach ($data){
print "File 1 has missing type of $_\n" if $data ne $data2;
}

Solution

  • I've refactored your code where you seem to be duplicating the same behavior.

    The output isn't to spec, but should be clear enough for you to understand and finish up yourself.

    I added a close $fh; and use warnings; as well

    #!/usr/bin/perl
    
    use strict;
    use warnings;
    
    # run: tally both input files in order, then print the per-type report
    my %max;    # combined data, filled in by each parse call
    my ($file_data_1, $file_data_2)
        = map { parse_file_into_hash($_, \%max) } qw(text1 text2);
    diff_hashes(\%max, $file_data_1, $file_data_2);
    
    # diff_hashes($max, $h1, $h2)
    #
    # Prints, for every type recorded in the combined $max hash, the combined
    # count followed by the count held in each of the two per-file hashes.
    # A type that is absent from a per-file hash is reported as 0.
    #
    #   $max - hash ref: type => combined count
    #   $h1  - hash ref: type => count for the first file
    #   $h2  - hash ref: type => count for the second file
    #
    # Output goes to the currently selected filehandle. Returns nothing.
    sub diff_hashes {
        my ($max, $h1, $h2) = @_;

        # TODO - do all the comparisons and some error checking (if keys exist etc...) here
        # Keys are sorted because Perl does not guarantee a stable hash order
        # between runs; sorting keeps the report reproducible.
        for my $key (sort keys %$max) {
            print "max/combined: $key = $max->{$key}\n";

            my $h1_print = exists $h1->{$key} ? $h1->{$key} : "0";
            my $h2_print = exists $h2->{$key} ? $h2->{$key} : "0";

            print "h1: $key = $h1_print\n";
            print "h2: $key = $h2_print\n";
        }

        return;
    }
    
    # parse_file_into_hash($file, $max)
    #
    # Counts how often each type (the second whitespace-separated column)
    # occurs in $file. Lines that have no second column are ignored.
    #
    #   $file - path of the text file to read
    #   $max  - hash reference, updated in place: for each type it keeps the
    #           highest per-file count seen so far, so the same hash can be
    #           passed to several calls to combine multiple files
    #
    # Returns a reference to a hash of ( $type => $count_in_this_file ).
    # Dies if $file cannot be opened.
    sub parse_file_into_hash {
        my ($file, $max) = @_;
        my %count_of;

        open my $fh, '<', $file
            or die "Could not open file '$file' to read: $!";
        while (my $line = <$fh>) {
            # split ' ' skips leading whitespace and drops the trailing
            # newline, so an indented line still yields the type as the
            # second field (split /\s+/ would return an empty first field).
            my (undef, $type) = split ' ', $line;

            # Test definedness, not truth, so that a type of "0" is counted.
            next unless defined $type;

            my $seen = ++$count_of{$type};
            if (!exists $max->{$type} || $seen > $max->{$type}) {
                $max->{$type} = $seen;
            }
        }
        close $fh;

        return \%count_of;
    }
    

    Output when run against your example files:

    $ ./example.pl
    max/combined: F = 1
    h1: F = 1
    h2: F = 1
    max/combined: N = 1
    h1: N = 0
    h2: N = 1
    max/combined: P = 3
    h1: P = 3
    h2: P = 2