Search code examples
Tags: delphi, ascii, delphi-xe2, pascal

How to read last line in a text file using Delphi


I need to read the last line of some very large text files (to get the timestamp from the data). TStringList would be a simple approach, but it raises an out-of-memory error. I'm trying to use Seek and BlockRead instead, but the characters in the buffer are all nonsense. Does this have something to do with Unicode?

    // Tries to return the last line of the file FileName by stepping backwards
    // from the end of the file, one position at a time, until a CR (#13) is read;
    // the characters collected on the way are then reversed into the result.
    //
    // NOTE(review): this is the code the question is about; it is kept as posted.
    // Likely causes of the "nonsense" buffer in Unicode Delphi (2009+, incl. XE2) - confirm:
    //   * Char is a 2-byte WideChar there, so Buf does not map onto single-byte file data
    //     and SizeOf(Buf) is 16, not 8;
    //   * Reset on an untyped file without a record size presumably uses the default
    //     128-byte record, so FileSize/Seek/BlockRead count records, not bytes.
    Function TForm1.ReadLastLine2(FileName: String): String;
    var
      FileHandle: File;
      s,line: string;
      ok: 0..1;
      Buf: array[1..8] of Char;
      k: longword;
      i,ReadCount: integer;
    begin
      AssignFile (FileHandle,FileName);
      Reset (FileHandle);           // no record size given; for byte-wise access use: Reset (FileHandle,1);
      ok := 0;
      k := FileSize (FileHandle);
      Seek (FileHandle, k-1);       // start at the last position of the file
      s := '';
      while ok<>1 do begin
        BlockRead (FileHandle, buf, SizeOf(Buf)-1, ReadCount);  // signature: BlockRead (var F: File; var Buffer; RecordCount: Integer {; var RecordsRead: Integer}); only buf[1] is used below
        if ord (buf[1]) <>13 then         // anything but CR is part of the line (collected in reverse order)
          s := s + buf[1]
        else
          ok := ok + 1;                   // CR found - leave the loop
        // Step one position back. NOTE(review): the first pass re-seeks to the same
        // position as the initial Seek (k-1), so the last position is read twice; and
        // if no CR exists k would drop below 0, which a longword cannot hold.
        k := k-1;
        seek (FileHandle,k);
      end;
      CloseFile (FileHandle);

      // s holds the line back to front - reverse it into line
      setlength (line,length(s));
      for i:=1 to length(s) do
        line[length(s) - i+1 ] := s[i];
      Result := Line;
    end;

Based on www.delphipages.com/forum/showthread.php?t=102965

The testfile is a simple CSV I created in excel ( this is not the 100MB I ultimately need to read).

    a,b,c,d,e,f,g,h,i,j,blank
    A,B,C,D,E,F,G,H,I,J,blank
    1,2,3,4,5,6,7,8,9,0,blank
    Mary,had,a,little,lamb,His,fleece,was,white,as,snow
    And,everywhere,that,Mary,went,The,lamb,was,sure,to,go

Solution

  • You really have to read the file in LARGE chunks, from the tail towards the head. Since the file is too large to fit in memory, reading it line by line from start to end would be very slow - and with ReadLn, slower still.

    You also have to be prepared for the last line either ending with an EOL or not.

    Personally I would also account for three possible EOL sequences:

    • CR/LF aka #13#10=^M^J - DOS/Windows style
    • CR without LF - just #13=^M - Classic MacOS file
    • LF without CR - just #10=^J - UNIX style, including MacOS version 10

    If you are sure your CSV files will only ever be generated by native Windows programs, it is safe to assume that the full CR/LF pair is used. But if the files may come from other sources - Java programs, non-Windows platforms, mobile apps - I would be less sure. Of course, a bare CR without LF is the least probable case of them all.

    uses System.SysUtils, System.Types, System.Classes, System.Math, System.IOUtils;
    
    // Character and string types matching the encoding of the file on disk.
    // Exactly one of the two type lines below must be active.
    type FileChar = AnsiChar; FileString = AnsiString; // for non-Unicode (single-byte) files
    // type FileChar = WideChar; FileString = UnicodeString; // for UTF-16 and UCS-2 files
    // Size in bytes of one character as stored in the file.
    const FileCharSize = SizeOf(FileChar);
    // somewhere later in the code add: Assert(FileCharSize = SizeOf(FileString[1]));
    
    // Forward declaration of the file-name overload, which is implemented further down.
    function ReadLastLine(const FileName: String): FileString; overload; forward;
    
    // Size in bytes of the first chunk read from the end of the file.
    const PageSize = 4*1024; 
    // 4 KB is the minimal read atom of most modern HDDs and the memory allocation atom of Win32.
    // Since the chances that your file has lines longer than 4 KB are very small, I would not increase it to several atoms.
    // Picks the last line out of an array of already-split lines.
    // Returns '' for an empty array. If the final element is empty (a "ghost"
    // line after a terminating EOL), the element before it is returned instead,
    // provided there is one.
    function ReadLastLine(const Lines: TStringDynArray): FileString; overload;
    var
      LastIdx: Integer;
    begin
      Result := '';
      LastIdx := High(Lines);
      if LastIdx >= Low(Lines) then
      begin
        Result := Lines[LastIdx];
        // an empty final element is the ghost line - fall back to its predecessor
        if (Result = '') and (LastIdx > Low(Lines)) then
          Result := Lines[LastIdx - 1];
      end;
    end;
    
    // Looks, back to front, for the EOL that precedes the last line in Buffer.
    //   Buffer    - the tail of the file read so far
    //   OldRead   - number of characters at the end of Buffer that an earlier
    //               call has already scanned (0 on the first call)
    //   LastChunk - True when Buffer starts at the beginning of the file
    //   Line      - receives the last line, without its trailing CR/LF
    // Returns True when Line was set; False when more of the file is needed
    // (or when there is nothing new to scan).
    function FindLastLine(buffer: TArray<FileChar>; const OldRead : Integer; 
         const LastChunk: Boolean; out Line: FileString): boolean;
    const
      LF = #10;
      CR = #13;
    var
      Top, Idx, Trailing: Integer;
    begin
      Result := False;
      Top := High(buffer);
      if Top < Low(buffer) then Exit; // empty buffer

      // count the EOL characters that terminate the buffer: LF, CR or CR+LF
      Trailing := 0;
      Idx := Top;
      if buffer[Idx] = LF then
      begin
        Inc(Trailing);
        Dec(Idx);
      end;
      if (Idx >= Low(buffer)) and (buffer[Idx] = CR) then
        Inc(Trailing);

      // skip whatever was scanned before, or at least the trailing EOL
      Idx := Top - Max(OldRead, Trailing);
      if Idx < Low(buffer) then Exit; // nothing new to look at

      // a CR/LF pair may straddle the old/new boundary - start one character later
      if OldRead > 0 then Inc(Idx);

      while Idx >= Low(buffer) do
      begin
        if (buffer[Idx] = LF) or (buffer[Idx] = CR) then
        begin
          // everything between this EOL and the trailing EOL is the last line
          SetString(Line, @buffer[Idx + 1], Top - Trailing - Idx);
          Exit(True);
        end;
        Dec(Idx);
      end;

      // No inner EOL found. If the whole file is already in the buffer, the
      // file is a single line; otherwise the caller has to supply more data.
      if LastChunk then
      begin
        SetString(Line, @buffer[Low(buffer)], Length(buffer) - Trailing);
        Result := True;
      end;
    end;
    
    
    // Returns the last line of the file FileName without loading the whole file.
    // Files of at most PageSize bytes are read completely via TFile.ReadAllLines.
    // Larger files are read from the end in a window that doubles in size until
    // FindLastLine locates the last line or the window covers the whole file.
    // Returns '' when no line could be determined. Stream errors (EFOpenError,
    // EReadError, ...) and EOutOfMemory propagate to the caller.
    function ReadLastLine(const FileName: String): FileString; overload;
    var
      Buffer, tmp: TArray<FileChar>;
        // dynamic arrays - ease memory management and protect from stack corruption
      FS: TFileStream;
      FSize, NewPos: Int64;
      OldRead, NewLen: Integer;
      EndOfFile: Boolean;
    begin
      Result := '';
      FS := TFile.OpenRead(FileName);
      try
        FSize := FS.Size;
        if FSize <= PageSize then begin // small file, we can be lazy!
           FreeAndNil(FS);  // free the handle and avoid double-free in finally
           Result := ReadLastLine( TFile.ReadAllLines( FileName, TEncoding.ANSI ));
              // use TEncoding.Unicode instead for UTF-16 files
              // warning - TFile is not share-aware, if the file is being written to by another app
           exit;
        end;

        SetLength( Buffer, PageSize div FileCharSize);
        OldRead := 0;
        repeat
          NewPos := FSize - Length(Buffer)*FileCharSize;
          // The window has reached the head of the file when less than one whole
          // FileChar is left in front of it. Comparing against FileCharSize (not 0)
          // also ends the search for an odd-sized file read as 2-byte characters,
          // where NewPos would otherwise stay at 1 forever.
          EndOfFile := NewPos < FileCharSize;
          if NewPos < 0 then NewPos := 0;
          FS.Position := NewPos;

          // only the new, not yet read head of the window comes from disk;
          // the previously read tail was moved to the end of Buffer below
          FS.ReadBuffer( Buffer[Low(Buffer)], (Length(Buffer) - OldRead)*FileCharSize);

          if FindLastLine(Buffer, OldRead, EndOfFile, Result) then
             exit; // done !

          if EndOfFile then
             break; // whole file scanned - do not grow the buffer for nothing

          tmp := Buffer; Buffer := nil; // flip-flop: preparing to widen the window

          OldRead := Length(tmp); // no need to re-scan the tail again when expanding the range
          NewLen := Min( 2*Length(tmp), FSize div FileCharSize );

          SetLength(Buffer, NewLen); // this may trigger EOutOfMemory...
          Move( tmp[Low(tmp)], Buffer[High(Buffer)-OldRead+1], OldRead*FileCharSize);
          tmp := nil; // free old buffer
        until False;
      finally
        FS.Free;
      end;
    end;
    

    PS. Note one extra special case: if you use Unicode chars (two-byte ones) and are given an odd-length file (3 bytes, 5 bytes, etc.), you will never be able to scan the leading single byte (half a WideChar). Maybe you should add an extra guard there, like Assert( 0 = FS.Size mod FileCharSize)

    PPS. As a rule of thumb, you had better keep those functions out of the form class - why mix them? In general you should separate concerns into small blocks. Reading a file has nothing to do with user interaction, so it is better off in a separate UNIT. Then you will be able to use the functions from that unit in one form or in 10 forms, in the main thread or in a multi-threaded application. Like LEGO parts - they give you flexibility by being small and separate.

    PPPS. Another approach here would be to use memory-mapped files (MMF). Google for MMF implementations for Delphi and for articles about the benefits and problems of the MMF approach. Personally, I think rewriting the code above to use MMF would greatly simplify it, removing several "special cases" and the troublesome memory-copying flip-flop. OTOH it would demand that you be very strict with pointer arithmetic.