Search code examples
encodingvb6filestream

Encoding of Text Files in VB 6.0


I have huge external files with the "ANSI" and "UCS-2 Little Endian" encoding formats.

Now I want to change the file encoding format into UTF-8 using Visual Basic 6.0. I don't want to modify the original file; I just want to change the encoding format alone.

I have searched on the web; but can't understand the VB code, and have no idea how to do it.

I would also like to be able to read lines one at a time from UTF-8 encoded files.


Solution

  • NOTE. This answer has been extensively expanded to fit in with the edited question, which in turn was due to Visual Basic 6 and UTF-8

    The following code wraps up converting ANSI, UTF-16 and UTF-32 encoded strings from a file to UTF-8 strings, in VB6. You have to load in the entire file and output it. Note that if it was truly generic, the LineInputUTF8() method would be LineInput(), and require a code page.

    Option Explicit
    
    Private Declare Function MultiByteToWideChar Lib "Kernel32.dll" ( _
        ByVal CodePage As Long, _
        ByVal dwFlags As Long, _
        ByVal lpMultiByteStr As Long, _
        ByVal cbMultiByte As Long, _
        ByVal lpWideCharStr As Long, _
        ByVal cchWideChar As Long _
    ) As Long
    
    Private Declare Function WideCharToMultiByte Lib "Kernel32.dll" ( _
        ByVal CodePage As Long, _
        ByVal dwFlags As Long, _
        ByVal lpWideCharStr As Long, _
        ByVal cchWideChar As Long, _
        ByVal lpMultiByteStr As Long, _
        ByVal cbMultiByte As Long, _
        ByVal lpDefaultChar As Long, _
        ByVal lpUsedDefaultChar As Long _
    ) As Long
    
    Public Const CP_ACP        As Long = 0          ' Default ANSI code page.
    Public Const CP_UTF8       As Long = 65001      ' UTF8.
    Public Const CP_UTF16_LE   As Long = 1200       ' UTF16 - little endian.
    Public Const CP_UTF16_BE   As Long = 1201       ' UTF16 - big endian.
    Public Const CP_UTF32_LE   As Long = 12000      ' UTF32 - little endian.
    Public Const CP_UTF32_BE   As Long = 12001      ' UTF32 - big endian.
    
    ' Purpose:  Heuristic to determine whether bytes in a file are UTF-8.
    Private Function FileBytesAreUTF8(ByVal the_iFileNo As Integer) As Boolean
    
        Const knSampleByteSize          As Long = 2048
        Dim nLof                        As Long
        Dim nByteCount                  As Long
        Dim nByteIndex                  As Long
        Dim nCharExtraByteCount         As Long
        Dim bytValue                    As Byte
    
        ' We look at the first <knSampleByteSize> bytes of the file. However, if the file is smaller, we will have to
        ' use the smaller size.
        nLof = LOF(the_iFileNo)
        If nLof < knSampleByteSize Then
            nByteCount = nLof
        Else
            nByteCount = knSampleByteSize
        End If
    
        ' Go to the start of the file.
        Seek #the_iFileNo, 1
    
        For nByteIndex = 1 To nByteCount
    
            Get #the_iFileNo, , bytValue
    
            ' If the character we are processing has bytes beyond 1, then we are onto the next character.
            If nCharExtraByteCount = 0 Then
                '
                ' The UTF-8 specification says that the first byte of a character has masking bits which indicate how many bytes follow.
                '
                ' See: http://en.wikipedia.org/wiki/UTF-8#Description
                '
                ' Bytes in
                ' sequence   Byte 1   Byte 2   Byte 3   Byte 4
                ' 1          0xxxxxxx
                ' 2          110xxxxx 10xxxxxx
                ' 3          1110xxxx 10xxxxxx 10xxxxxx
                ' 4          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                '
                If (bytValue And &H80) = &H0 Then
                    nCharExtraByteCount = 0
                ElseIf (bytValue And &HE0) = &HC0 Then
                    nCharExtraByteCount = 1
                ElseIf (bytValue And &HF0) = &HE0 Then
                    nCharExtraByteCount = 2
                ElseIf (bytValue And &HF8) = &HF0 Then
                    nCharExtraByteCount = 3
                Else
                    ' If none of these masks were matched, then this can't be a UTF-8 character.
                    FileBytesAreUTF8 = False
                    Exit Function
                End If
            Else
                ' All following bytes must be masked as in the table above.
                If (bytValue And &HC0) = &H80 Then
                    nCharExtraByteCount = nCharExtraByteCount - 1
                    If nCharExtraByteCount = 0 Then
                        FileBytesAreUTF8 = True
                    End If
                Else
                    ' Not a UTF8 character.
                    FileBytesAreUTF8 = False
                    Exit Function
                End If
            End If
    
        Next nByteIndex
    
    End Function
    
    ' Purpose:  Take a string whose bytes are in the byte array <the_abytCPString>, with code page <the_nCodePage>, convert to a VB string.
    Private Function FromCPString(ByRef the_abytCPString() As Byte, ByVal the_nCodePage As Long) As String
    
        Dim sOutput                     As String
        Dim nValueLen                   As Long
        Dim nOutputCharLen              As Long
    
        ' If the code page says this is already compatible with the VB string, then just copy it into the string. No messing.
        If the_nCodePage = CP_UTF16_LE Then
            FromCPString = the_abytCPString()
        Else
    
            ' Cache the input length.
            nValueLen = UBound(the_abytCPString) - LBound(the_abytCPString) + 1
    
            ' See how big the output buffer will be.
            nOutputCharLen = MultiByteToWideChar(the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, 0&, 0&)
    
            ' Resize output byte array to the size of the UTF-8 string.
            sOutput = Space$(nOutputCharLen)
    
            ' Make this API call again, this time giving a pointer to the output byte array.
            MultiByteToWideChar the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, StrPtr(sOutput), nOutputCharLen
    
            ' Return the array.
            FromCPString = sOutput
    
        End If
    
    End Function
    
    Public Function GetContents(ByVal the_sTextFile As String, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean) As String
    
        Dim iFileNo                     As Integer
        Dim abytFileContents()          As Byte
        Dim nDataSize                   As Long
    
        iFileNo = FreeFile
    
        OpenForInput the_sTextFile, iFileNo, out_nCodePage, the_nDesiredCodePage, out_bContainedBOM
    
        ' We want to read the entire contents of the file (not including any BOM value).
        ' After calling OpenForInput(), the file pointer should be positioned after any BOM.
        ' So size file contents buffer to <file size> - <current position> + 1.
        nDataSize = LOF(iFileNo) - Seek(iFileNo) + 1
        ReDim abytFileContents(1 To nDataSize)
        Get #iFileNo, , abytFileContents()
    
        Close iFileNo
    
        ' Now we must convert this to UTF-8. But we have to first convert to the Windows NT standard UTF-16 LE.
        GetContents = FromCPString(abytFileContents(), out_nCodePage)
    
    End Function
    
    ' Purpose:  Reads up to the end of the current line of the file, repositions to the beginning of the next line, if any, and
    '           outputs all characters found.
    ' Inputs:   the_nFileNo     The number of the file.
    ' Outputs:  out_sLine       The line from the current position in the file.
    ' Return:   True if there is more data.
    Public Function LineInputUTF8(ByVal the_nFileNo As Integer, ByRef out_sLine As String) As Boolean
    
        Dim bytValue            As Byte
        Dim abytLine()          As Byte
        Dim nStartOfLinePos     As Long
        Dim nEndOfLinePos       As Long
        Dim nStartOfNextLine    As Long
        Dim nLineLen            As Long
    
        ' Save the current file position as the beginning of the line, and cache this value.
        nStartOfLinePos = Seek(the_nFileNo)
    
        ' Retrieves the first byte from the current position.
        Get #the_nFileNo, , bytValue
    
        ' Loop until the end of file is encountered.
        Do Until EOF(the_nFileNo)
    
            ' Check whether this byte represents a carriage return or line feed character (indicating new line).
            If bytValue = 13 Or bytValue = 10 Then
                ' By this point, the current position is *after* the CR or LF character, so to get the position of the
                ' last byte in the line, we must go back two bytes.
                nEndOfLinePos = Seek(the_nFileNo) - 2
    
                ' If this is a carriage return, then we must check the next character.
                If bytValue = 13 Then
                    Get #the_nFileNo, , bytValue
                    ' Is this a line feed?
                    If bytValue = 10 Then
                    ' Yes. Assume that CR-LF counts as a single NewLine. So the start of the next line should skip over the line feed.
                        nStartOfNextLine = nEndOfLinePos + 3
                    Else
                    ' No. The start of the next line is the current position.
                        nStartOfNextLine = nEndOfLinePos + 2
                    End If
                ElseIf bytValue = 10 Then
                ' If this is a line feed, then the start of the next line is the current position.
                    nStartOfNextLine = nEndOfLinePos + 2
                End If
    
                ' Since we have processed all the bytes in the line, exit the loop.
                Exit Do
            End If
    
            ' Get the next byte.
            Get #the_nFileNo, , bytValue
        Loop
    
        ' Check to see if there was an end of line.
        If nEndOfLinePos = 0 Then
        ' No, this is the end of the file - so use all the remaining characters.
            nLineLen = Seek(the_nFileNo) - nStartOfLinePos - 1
        Else
        ' Yes - so use all the characters up to the end of line position.
            nLineLen = nEndOfLinePos - nStartOfLinePos + 1
        End If
    
        ' Is this line empty?
        If nLineLen = 0 Then
        ' Yes - just return an empty string.
            out_sLine = vbNullString
        Else
        ' No - pull all the bytes from the beginning to the end of the line into a byte array, and then convert that from UTF-8 to a VB string.
            ReDim abytLine(1 To nLineLen)
            Get #the_nFileNo, nStartOfLinePos, abytLine()
            out_sLine = FromCPString(abytLine(), CP_UTF8)
        End If
    
        ' If there is a line afterwards, then move to the beginning of the line, and return True.
        If nStartOfNextLine > 0 Then
            Seek #the_nFileNo, nStartOfNextLine
            LineInputUTF8 = True
        End If
    
    End Function
    
    ' Purpose:  Analogue of 'Open "fileName" For Input As #fileNo' - but also return what type of text this is via a Code Page value.
    ' Inputs:   the_sFileName
    '           the_iFileNo
    '           (the_nDesiredCodePage)  The code page that you want to use with this file.
    '                                   If this value is set to the default, -1, this indicates that the code page will be ascertained from the file.
    ' Outputs:  out_nCodePage           There are only six valid values that are returned if <the_nDesiredCodePage> was set to -1.
    '               CP_ACP        ANSI code page
    '               CP_UTF8       UTF-8
    '               CP_UTF16LE    UTF-16 Little Endian (VB and NT default string encoding)
    '               CP_UTF16BE    UTF-16 Big Endian
    '               CP_UTF32LE    UTF-32 Little Endian
    '               CP_UTF32BE    UTF-32 Big Endian
    '           (out_bContainedBOM)     If this was set to True, then the file started with a BOM (Byte Order Marker).
    Public Sub OpenForInput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean)
    
        ' Note if we want to take account of every case, we should read in the first four bytes, and check for UTF-32 low and high endian BOMs, check
        ' the first three bytes for the UTF-8 BOM, and finally check the first two bytes for UTF-16 low and hight endian BOMs.
        Dim abytBOM(1 To 4)             As Byte
        Dim nCodePage                   As Long
    
        ' By default, there is no BOM.
        out_bContainedBOM = False
    
        Open the_sFilename For Binary Access Read As #the_iFileNo
    
        ' We are interested in -1 (ascertain code page), and then various UTF encodings.
        Select Case the_nDesiredCodePage
        Case -1, CP_UTF8, CP_UTF16_BE, CP_UTF16_LE, CP_UTF32_BE, CP_UTF32_LE
    
            ' Default code page.
            nCodePage = CP_ACP
    
            ' Pull in the first four bytes to determine the BOM (byte order marker).
            Get #the_iFileNo, , abytBOM()
    
            ' The following are the BOMs for text files:
            '
            ' FF FE         UTF-16, little endian
            ' FE FF         UTF-16, big endian
            ' EF BB BF      UTF-8
            ' FF FE 00 00   UTF-32, little endian
            ' 00 00 FE FF   UTF-32, big-endian
            '
            ' Work out the code page from this information.
    
            Select Case abytBOM(1)
            Case &HFF
                If abytBOM(2) = &HFE Then
                    If abytBOM(3) = 0 And abytBOM(4) = 0 Then
                        nCodePage = CP_UTF32_LE
                    Else
                        nCodePage = CP_UTF16_LE
                    End If
                End If
            Case &HFE
                If abytBOM(2) = &HFF Then
                    nCodePage = CP_UTF16_BE
                End If
            Case &HEF
                If abytBOM(2) = &HBB And abytBOM(3) = &HBF Then
                    nCodePage = CP_UTF8
                End If
            Case &H0
                If abytBOM(2) = &H0 And abytBOM(3) = &HFE And abytBOM(4) = &HFF Then
                    nCodePage = CP_UTF32_BE
                End If
            End Select
    
            ' Did we match any BOMs?
            If nCodePage = CP_ACP Then
            ' No - we are still defaulting to the ANSI code page.
                ' Special check for UTF-8. The BOM is not specified in the standard for UTF-8, but according to Wikipedia (which is always right :-) ),
                ' only Microsoft includes this marker at the beginning of files.
                If FileBytesAreUTF8(the_iFileNo) Then
                    out_nCodePage = CP_UTF8
                Else
                    out_nCodePage = CP_ACP
                End If
            Else
            ' Yes - we have worked out the code page from the BOM.
                ' If no code page was suggested, we now return the code page we found.
                If the_nDesiredCodePage = -1 Then
                    out_nCodePage = nCodePage
                End If
    
                ' Inform the caller that a BOM was found.
                out_bContainedBOM = True
            End If
    
            ' Reset the file pointer to the beginning of the file data.
            If out_bContainedBOM Then
                ' Note that if the code page found was one of the two UTF-32 values, then we are already in the correct position.
                ' Otherwise, we have to move to just after the end of the BOM.
                Select Case nCodePage
                Case CP_UTF16_BE, CP_UTF16_LE
                    Seek #the_iFileNo, 3
                Case CP_UTF8
                    Seek #the_iFileNo, 4
                End Select
            Else
                ' There is no BOM, so simply go the beginning of the file.
                Seek #the_iFileNo, 1
            End If
    
        Case Else
            out_nCodePage = the_nDesiredCodePage
        End Select
    
    End Sub
    
    ' Purpose:  Analogue of 'Open "fileName" For Append As #fileNo'
    Public Sub OpenForAppend(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
    
        ' Open the file and move to the end of the file.
        Open the_sFilename For Binary Access Write As #the_iFileNo
        Seek the_iFileNo, LOF(the_iFileNo) + 1
    
        If the_bPrefixWithBOM Then
            WriteBOM the_iFileNo, the_nCodePage
        End If
    
    End Sub
    
    ' Purpose:  Analogue of 'Open "fileName" For Output As #fileNo'
    Public Sub OpenForOutput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
    
        ' Ensure we overwrite the file by deleting it ...
        On Error Resume Next
        Kill the_sFilename
        On Error GoTo 0
    
        ' ... before creating it.
        Open the_sFilename For Binary Access Write As #the_iFileNo
    
        If the_bPrefixWithBOM Then
            WriteBOM the_iFileNo, the_nCodePage
        End If
    
    End Sub
    
    ' Purpose:  Analogue of the 'Print #fileNo, value' statement. But only one value allowed.
    '           Setting <the_bAppendNewLine> = False is analagous to 'Print #fileNo, value;'.
    Public Sub Print_(ByVal the_iFileNo As Integer, ByRef the_sValue As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bAppendNewLine As Boolean = True)
    
        Const kbytNull                  As Byte = 0
        Const kbytCarriageReturn        As Byte = 13
        Const kbytNewLine               As Byte = 10
    
        Put #the_iFileNo, , ToCPString(the_sValue, the_nCodePage)
    
        If the_bAppendNewLine Then
            Select Case the_nCodePage
            Case CP_UTF16_BE
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytCarriageReturn
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNewLine
            Case CP_UTF16_LE
                Put #the_iFileNo, , kbytCarriageReturn
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNewLine
                Put #the_iFileNo, , kbytNull
            Case CP_UTF32_BE
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytCarriageReturn
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNewLine
            Case CP_UTF32_LE
                Put #the_iFileNo, , kbytCarriageReturn
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNewLine
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
                Put #the_iFileNo, , kbytNull
            Case Else
                Put #the_iFileNo, , kbytCarriageReturn
                Put #the_iFileNo, , kbytNewLine
            End Select
        End If
    
    End Sub
    
    Public Sub PutContents(ByRef the_sFilename As String, ByRef the_sFileContents As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional the_bPrefixWithBOM As Boolean)
    
        Dim iFileNo                     As Integer
    
        iFileNo = FreeFile
        OpenForOutput the_sFilename, iFileNo, the_nCodePage, the_bPrefixWithBOM
        Print_ iFileNo, the_sFileContents, the_nCodePage, False
        Close iFileNo
    
    End Sub
    
    ' Purpose:  Converts a VB string (UTF-16) to UTF8 - as a binary array.
    Private Function ToCPString(ByRef the_sValue As String, ByVal the_nCodePage As Long) As Byte()
    
        Dim abytOutput()                As Byte
        Dim nValueLen                   As Long
        Dim nOutputByteLen              As Long
    
        If the_nCodePage = CP_UTF16_LE Then
            ToCPString = the_sValue
        Else
    
            ' Cache the input length.
            nValueLen = Len(the_sValue)
    
            ' See how big the output buffer will be.
            nOutputByteLen = WideCharToMultiByte(the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, 0&, 0&, 0&, 0&)
    
            If nOutputByteLen > 0 Then
                ' Resize output byte array to the size of the UTF-8 string.
                ReDim abytOutput(1 To nOutputByteLen)
    
                ' Make this API call again, this time giving a pointer to the output byte array.
                WideCharToMultiByte the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, VarPtr(abytOutput(1)), nOutputByteLen, 0&, 0&
            End If
    
            ' Return the array.
            ToCPString = abytOutput()
    
        End If
    
    End Function
    
    Private Sub WriteBOM(ByVal the_iFileNo As Integer, ByVal the_nCodePage As Long)
    
        ' FF FE         UTF-16, little endian
        ' FE FF         UTF-16, big endian
        ' EF BB BF      UTF-8
        ' FF FE 00 00   UTF-32, little endian
        ' 00 00 FE FF   UTF-32, big-endian
    
        Select Case the_nCodePage
        Case CP_UTF8
            Put #the_iFileNo, , CByte(&HEF)
            Put #the_iFileNo, , CByte(&HBB)
            Put #the_iFileNo, , CByte(&HBF)
        Case CP_UTF16_LE
            Put #the_iFileNo, , CByte(&HFF)
            Put #the_iFileNo, , CByte(&HFE)
        Case CP_UTF16_BE
            Put #the_iFileNo, , CByte(&HFE)
            Put #the_iFileNo, , CByte(&HFF)
        Case CP_UTF32_LE
            Put #the_iFileNo, , CByte(&HFF)
            Put #the_iFileNo, , CByte(&HFE)
            Put #the_iFileNo, , CByte(&H0)
            Put #the_iFileNo, , CByte(&H0)
        Case CP_UTF32_BE
            Put #the_iFileNo, , CByte(&H0)
            Put #the_iFileNo, , CByte(&H0)
            Put #the_iFileNo, , CByte(&HFE)
            Put #the_iFileNo, , CByte(&HFF)
        End Select
    
    End Sub
    

    The following code was added to a Form which had a VSFlexGrid control with Lucida Console font - purely to provide a way to display as many characters as possible:

    Option Explicit
    
    Private Sub Command_Click()
        Example_ConvertFileToUTF8
    End Sub
    
    Private Sub Command2_Click()
        Example_IterateUTF8Lines
    End Sub
    
    Private Sub Command3_Click()
        Example_ReadWriteUTF8Lines
    End Sub
    
    Private Sub Form_Load()
        VSFlexGrid.ColWidth(0) = 7000!
    End Sub
    
    ' Purpose:  Converts *any* pure text file (UTF16, ASCII, ANSI) to UTF8.
    Private Sub Example_ConvertFileToUTF8()
    
        Dim nCodePage                   As Long
        Dim bContainedBOM               As Boolean
        Dim sFileContents               As String
    
        ' Read in contents.
        sFileContents = TextFile.GetContents("C:\MysteryEncoding.txt", nCodePage, , bContainedBOM)
    
        ' And then convert to UTF8.
        TextFile.PutContents "C:\output.txt", sFileContents, CP_UTF8, bContainedBOM
    
    End Sub
    
    ' Purpose:  Iterates through each line of a UTF-8 text file, and adds it to a control which can display VB strings containing non-ANSI characters.
    '           In this case, I am adding items to a FlexGrid with Font = "Lucida Console".
    Private Sub Example_IterateUTF8Lines()
    
        Dim iFileNo             As Integer
        Dim lCodePage           As Long
        Dim sLine               As String
    
        iFileNo = FreeFile
    
        TextFile.OpenForInput "C:\UTF8.txt", iFileNo, lCodePage
    
        If lCodePage = CP_UTF8 Then
            Do While TextFile.LineInputUTF8(iFileNo, sLine)
                VSFlexGrid.AddItem sLine
            Loop
            VSFlexGrid.AddItem sLine
        Else
            MsgBox "This is not a UTF8 file."
        End If
    
        Close #iFileNo
    
    End Sub
    
    Private Sub Example_ReadWriteUTF8Lines()
    
        Dim iFileNoInput        As Integer
        Dim iFileNoOutput       As Integer
        Dim lCodePage           As Long
        Dim bBOM                As Boolean
        Dim sLine               As String
    
        iFileNoInput = FreeFile
        TextFile.OpenForInput "C:\UTF8.txt", iFileNoInput, lCodePage, , bBOM
    
        If lCodePage = CP_UTF8 Then
    
            iFileNoOutput = FreeFile
            TextFile.OpenForOutput "C:\output.txt", iFileNoOutput, lCodePage, bBOM
    
            Do While TextFile.LineInputUTF8(iFileNoInput, sLine)
                TextFile.Print_ iFileNoOutput, sLine, lCodePage
            Loop
            TextFile.Print_ iFileNoOutput, sLine, lCodePage, False
    
            Close #iFileNoOutput
    
        Else
            MsgBox "This is not a UTF8 file."
        End If
    
        Close #iFileNoInput
    
    End Sub