Search code examples
pythonpython-3.xcsvmmap

How to set current position of `mmap.mmap.seek(pos)` to beginning of any Nth line for a text file?


I am trying to read some of the lines from a larger-than-memory CSV file using mmap.

This is what my CSV file looks like (I have separated the lines for readability):

'','InputText',101,102,103,104,105,106,107,108,109,110\n
0,'abcde efgh ijkl mnop',1,0,0,0,0,1,1,0,0,0\n
1,'qwerty uiop asdf',1,0,0,1,0,0,0,0,0,0\n
2,'zxcv',0,1,1,0,0,0,0,1,0,0\n
3,'qazxswedc vfrtgbnhy nhyummjikkig jhguopjfservcs fdtuugdsae dsawruoh',1,0,0,0,0,1,1,1,0,0\n
4,'plmnkoijb vhuygcxf tr r mhjease',1,0,0,0,0,0,0,0,0,1\n

Here's what I have done so far:

# imports
import mmap
import os

# Open the CSV file to obtain a file object (and thus a file descriptor).
fbuff = open("big_file.csv", mode="r", encoding="utf8")
# Memory-map the whole file read-only through that file descriptor; the
# mapping length is the file's size in bytes, starting at offset 0.
f1_mmap = mmap.mmap(fbuff.fileno(), length=os.path.getsize("big_file.csv"),
                      access=mmap.ACCESS_READ, offset=0)

After mapping the file with mmap.mmap(), here's how I am trying to read a line, as described in the Python 3.7 docs:

# According to the Python docs: https://docs.python.org/3.7/library/mmap.html#mmap.mmap.seek
# mmap.mmap.seek() needs to be given a byte position in the file.
# When it is set to position 0 (the beginning of the file) as below,
# readline() returns the entire first line up to and including '\n'.
f1_mmap.seek(0)
f1_mmap.readline()

If I want to read the 102,457th line in the file, I need to find the starting byte position of that line and pass it to mmap.mmap.seek(pos=<this-position>). How do I find that position for any given line of my text file?


Solution

  • Here's how to build an index consisting of a list of offsets to the beginning of each line in the file, and then how to use it to read arbitrary lines as well as rows of the memory-mapped CSV file:

    import csv
    from io import StringIO
    import mmap
    import random
    
    my_csv_dialect = dict(delimiter=',', quotechar="'")
    filepath = 'big_file.csv'

    # mmap operates on the file's raw bytes, so open the file in binary mode
    # (a text-mode encoding would never be applied to the mapped data).
    # A length of 0 maps the whole file.  The context managers close both the
    # map and the file, even if an error occurs part-way through.
    with open(filepath, mode='rb') as fbuff, \
            mmap.mmap(fbuff.fileno(), 0, access=mmap.ACCESS_READ) as f1_mmap:

        # Build list of byte offsets where each line of the file starts.
        # readline() returns b'' at end of file, which stops the iteration.
        print('Index:')
        offsets = [0]  # First line is always at offset 0.
        for line_no, line in enumerate(iter(f1_mmap.readline, b'')):
            # Append where the *next* line would start.  After the last line
            # this is the end-of-file position, so offsets ends up with one
            # more entry than there are lines.
            offsets.append(f1_mmap.tell())
            print(f'{line_no} ({offsets[line_no]:3d}) {line!r}')
        print()

        # Access arbitrary lines in the memory-mapped file by seeking straight
        # to the recorded offset; each line comes back as bytes.
        print('Line access:')
        for line_no in (3, 1, 5):
            f1_mmap.seek(offsets[line_no])
            line = f1_mmap.readline()
            print(f'{line_no}: {line!r}')
        print()

        # Access arbitrary rows of the memory-mapped csv file.
        # NOTE: this assumes every CSV row occupies exactly one physical line
        # (i.e. no quoted field contains an embedded newline).
        print('CSV row access:')
        for line_no in (3, 1, 5):
            f1_mmap.seek(offsets[line_no])
            line = f1_mmap.readline()
            # csv.reader needs text, so decode the bytes of this single line
            # and wrap them in a file-like object.
            b = StringIO(line.decode('utf8'))
            r = csv.reader(b, **my_csv_dialect)
            values = next(r)
            print(f'{line_no}: {values}')
    

    Printed results:

    Index:
    0 (  0) b"'','InputText',101,102,103,104,105,106,107,108,109,110\r\n"
    1 ( 56) b"0,'abcde efgh ijkl mnop',1,0,0,0,0,1,1,0,0,0\r\n"
    2 (102) b"1,'qwerty uiop asdf',1,0,0,1,0,0,0,0,0,0\r\n"
    3 (144) b"2,'zxcv',0,1,1,0,0,0,0,1,0,0\r\n"
    4 (174) b"3,'qazxswedc vfrtgbnhy nhyummjikkig jhguopjfservcs fdtuugdsae dsawruoh',1,0,0,0,0,1,1,1,0,0\r\n"
    5 (267) b"4,'plmnkoijb vhuygcxf tr r mhjease',1,0,0,0,0,0,0,0,0,1\r\n"
    
    Line access:
    3: b"2,'zxcv',0,1,1,0,0,0,0,1,0,0\r\n"
    1: b"0,'abcde efgh ijkl mnop',1,0,0,0,0,1,1,0,0,0\r\n"
    5: b"4,'plmnkoijb vhuygcxf tr r mhjease',1,0,0,0,0,0,0,0,0,1\r\n"
    
    CSV row access:
    3: ['2', 'zxcv', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0']
    1: ['0', 'abcde efgh ijkl mnop', '1', '0', '0', '0', '0', '1', '1', '0', '0', '0']
    5: ['4', 'plmnkoijb vhuygcxf tr r mhjease', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1']