Search code examples
pythonunicodecsvsplitfixed-width

Need help in splitting a fixed-width string with unprintable characters


I have a particularly nasty fixed-width file to work with. It's not encoded in the formats I thought it would be. In a nutshell, I'm trying to do a variety of things:

  1. Skip all the whitespace before RPLY01.
  2. Remove those wacky \x00* characters. I've looked into removing them but the character that is covered in the asterisk (*) failed to be removed no matter what I have tried.
  3. Split at FDXSPD01/
  4. Split at CHKP01.
  5. Eventually, split every CSV into a nice list and trap all the other crap (RPLY and CHKP) into a parse-able friendly format.

Here is the original output:

�cRPLY01  IREQ    0000011                                                                         
N00    �9FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
�EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"    
�PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
�QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
�PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
�QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
�DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
�LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
�JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
�FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
�BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
�LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
�QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
�EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
�CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
�HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
�9FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
�CHKP01  N0000000170

Here's the same output but through repr:

\x00cRPLY01  IREQ    0000011
N00    \x009FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
\x00EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"
\x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
\x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
\x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
\x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
\x00DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
\x00LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
\x00JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
\x00FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
\x00BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
\x00LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
\x00QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
\x00EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
\x00CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
\x00HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
\x009FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
\x00\x13CHKP01  N0000000170'

Here's my stream of thought thus far:

#!/usr/local/bin/python
import string
import struct
import re
import array

cols = []
splitcols = []

def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)
rgx = r'[\x00-\x20\x22\x2F\x3A\x3C\x3E\x5C]'

fname2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
fname = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
ename2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
ename = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
losfile = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
losfile2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')

filename = '/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt'
filename2 = '/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt'

almost = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
almost2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')

for line in fname.read().split('\t'):
    print repr(line)
    filtered_string = filter(lambda x: x in string.printable, line)
    print "unfilter line: " + line
    print "filtered line: " + filtered_string
    removeNonAscii(line)

print "======================================================================================"

for line in ename.read().split('\t'):
    filtered_string = filter(lambda x: x in string.printable, line)
    print "error unfilter line: " + line
    print "error filtered line: " + filtered_string

print "======================================================================================"

for line in ename2.read().split('\t'):
    for chars in line:
        if chars in string.printable:
            print chars
        else:
            print "!!!!!!!!!!!!!!!"

print "======================================================================================"

#for line in fname2.read().split('\t'):
    #for chars in line:
        #if chars in string.printable:
            #print chars
        #else:
            #print "@@@@@@@@@@@@@@@"

print "======================================================================================"
print "======================================================================================"

testlist = []
testlist2 = []

with open(filename) as f:
    readarray = f.readline()
    for eachitem in readarray[6:]:
        if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
            testlist.append(eachitem)
print repr(''.join(testlist))

with open(filename2) as f:
    readarray2 = f.readline()
    for eachitem in readarray2[6:]:
        testlist2.append(removeNonAscii(eachitem))
print repr(''.join(testlist2))

with open(filename) as f:
    readarray = f.readline()
    print repr(readarray[6:].split(' FDXSPD01'))
    for eachitem in readarray[6:]:
        if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
            testlist.append(eachitem)
print repr(''.join(testlist))
#for eachitem in almost.read():
    #if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
        #testlist.append(eachitem)
#print ''.join(testlist)

#for eachitem in losfile2.read():
    #testlist2.append(removeNonAscii(eachitem))
#print ''.join(testlist2)

Maybe it's lack of sleep but I can't seem to find the right answer. Perhaps someone well versed in Python can show me the way.


Solution

  • Okay, I'm also somewhat asleep, but I think coffee is starting to kick in.

    Going by the repr() I think this just about does what you want, albeit not necessarily efficiently or robustly.

    import csv, re
    
    data="""
    \x00cRPLY01  IREQ    0000011
    N00    \x009FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
    \x00EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"
    \x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
    \x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
    \x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
    \x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
    \x00DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
    \x00LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
    \x00JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
    \x00FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
    \x00BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
    \x00LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
    \x00QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
    \x00EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
    \x00CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
    \x00HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
    \x009FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
    \x00\x13CHKP01  N0000000170'"""
    
    lines = data.split('\x00')
    for line in lines:
        try:
            pos = line.index('"')
            print 'ROW:', next(csv.reader([line[pos:]]))
        except ValueError as e:
            try:
                print 'OTH:', line[next(re.finditer('RPLY|CHKP', line)).start():].split()
            except StopIteration as e:
                print 'XXX:', line
    

    prints

    OTH: XXX: 
    
    OTH: ['RPLY01', 'IREQ', '0000011', 'N00']
    ROW: ['CASH', '', '', '10219575.34', '0.00', '0', '000000000000773']
    ROW: ['CAD', 'CANADA DOLLAR', 'CU', '-14564.52', '0.00', '0', '000000000000773']
    ROW: ['CCTUSD', 'CURRENCY CONTRACT - USD', 'CC', '3644.00', '0.00', '0', '000000000000773']
    ...
    ROW: ['TB5427', 'TREASURY BILL', 'TI', '0.00', '0.00', '0', '000000000000773']
    ROW: ['UB5427', 'JOHN BOND', 'G', '2994000.00', '13281.26', '0', '000000000000773']
    ROW: ['UNITS', 'UNITS', 'S', '0.00', '0.00', '0', '000000000000773']
    OTH: ['CHKP01', "N0000000170'"]