Results 1 to 5 of 5

Thread: [RESOLVED] windows-1251 to utf-8

  1. #1

    Thread Starter
    Addicted Member
    Join Date
    Mar 2006
    Posts
    186

    Resolved [RESOLVED] windows-1251 to utf-8

    Is there a way to do it in VB6 like in PHP?
    PHP Code:
    $str iconv('windows-1251''utf-8//IGNORE'$str); 

  2. #2
    VB6, XHTML & CSS hobbyist Merri's Avatar
    Join Date
    Oct 2002
    Location
    Finland
    Posts
    6,654

    Re: windows-1251 to utf-8

    VB has StrConv, however it does not do UTF-8. VB6 also stores strings in UTF-16 format internally, however tons of automatic conversions are done behind the scenes when handling stuff (such as handling files, doing API calls with string parameters...).

    Below is some fast code for handling UTF-8 in VB6:
    Code:
    Option Explicit
    
    Private Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
    Private Declare Sub PutMem4 Lib "msvbvm60" (Destination As Any, Value As Any)
    Private Declare Function SysAllocStringByteLen Lib "oleaut32" (ByVal OleStr As Long, ByVal bLen As Long) As Long
    Private Declare Function SysAllocStringLen Lib "oleaut32" (ByVal OleStr As Long, ByVal bLen As Long) As Long
    Private Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, ByVal lpDefaultChar As Long, lpUsedDefaultChar As Long) As Long
    
    Public Function IsUTF8(ByRef Text As String, Optional ByVal lngReadSize As Long = 2048) As Boolean
        Dim bytArray() As Byte, lngArraySize As Long, lngBytes As Long, lngPos As Long, lngUTF8 As Long
        lngArraySize = LenB(Text)
        If lngArraySize Then
            If lngReadSize > 0 Then
                ' see if we take the entire string or just a part of it
                If lngReadSize > lngArraySize Then
                    ' take everything
                    lngReadSize = lngArraySize
                    bytArray = Text
                Else
                    ' take just the part of it
                    bytArray = LeftB$(Text, lngReadSize)
                End If
                ' now keep going until we have gone through the entire buffer
                Do While lngPos < lngReadSize
                    ' the code here is just standard UTF-8 decoding
                    If bytArray(lngPos) < &H80 Then
                        lngPos = lngPos + 1
                    ElseIf bytArray(lngPos) < &HC0 Then
                        Exit Function
                    ElseIf (bytArray(lngPos) >= &HC0) And (bytArray(lngPos) <= &HFD) Then
                        If (bytArray(lngPos) And &HFC) = &HFC Then
                            lngBytes = 5
                        ElseIf (bytArray(lngPos) And &HF8) = &HF8 Then
                            lngBytes = 4
                        ElseIf (bytArray(lngPos) And &HF0) = &HF0 Then
                            lngBytes = 3
                        ElseIf (bytArray(lngPos) And &HE0) = &HE0 Then
                            lngBytes = 2
                        ElseIf (bytArray(lngPos) And &HC0) = &HC0 Then
                            lngBytes = 1
                        End If
                        For lngPos = (lngPos + 1) To (lngPos + lngBytes)
                            If Not ((bytArray(lngPos) >= &H80) And (bytArray(lngPos) <= &HBF)) Then
                                Exit Function
                            ElseIf lngPos >= lngReadSize Then
                                Exit Do
                            End If
                        Next lngPos
                        'lngUTF8 = lngUTF8 + 1
                    Else
                        lngPos = lngPos + 1
                    End If
                Loop
                IsUTF8 = True 'lngUTF8 > 0
            End If
        End If
    End Function
    
    Public Sub UTF16toUTF8(ByRef Text As String, ByRef TextOut As String)
        Dim lngBufferLen As Long, lngBufferPtr As Long, lngTextLen As Long, strBuffer As String
        ' first check text length
        lngTextLen = Len(Text)
        If lngTextLen Then
            ' calculate maximum output size
            lngBufferLen = lngTextLen * 4
            ' create the buffer string to hold output data
            lngBufferPtr = SysAllocStringByteLen(0, lngBufferLen)
            PutMem4 ByVal VarPtr(strBuffer), ByVal lngBufferPtr
            ' convert to UTF-8
            lngBufferLen = WideCharToMultiByte(65001, 0&, StrPtr(Text), lngTextLen, lngBufferPtr, lngBufferLen, 0&, ByVal 0&)
            ' output as is best fit
            If lngBufferLen > 0 Then
                TextOut = LeftB$(strBuffer, lngBufferLen)
            Else
                TextOut = vbNullString
            End If
            ' clean buffer
            strBuffer = vbNullString
        Else
            TextOut = vbNullString
        End If
    End Sub
    
    Public Sub UTF8toUTF16(ByRef Text As String, ByRef TextOut As String)
        Dim lngBufferLen As Long, lngBufferPtr As Long, lngTextLen As Long, strBuffer As String
        ' first check text length
        lngTextLen = LenB(Text)
        If lngTextLen Then
            ' calculate maximum output size
            lngBufferLen = lngTextLen * 2
            ' create the buffer string to hold output data
            lngBufferPtr = SysAllocStringByteLen(0, lngBufferLen)
            PutMem4 ByVal VarPtr(strBuffer), ByVal lngBufferPtr
            ' convert from UTF-8
            lngBufferLen = MultiByteToWideChar(65001, 0&, StrPtr(Text), lngTextLen, lngBufferPtr, lngBufferLen)
            ' output as is best fit
            If lngBufferLen > 0 Then
                TextOut = Left$(strBuffer, lngBufferLen)
            Else
                TextOut = vbNullString
            End If
            ' clean buffer
            strBuffer = vbNullString
        Else
            TextOut = vbNullString
        End If
    End Sub
    With that code you would do something like this:
    Code:
    Dim strOut As String
    
    ' take regular VB6 string and convert it to UTF-8
    UTF16toUTF8 "&#197;&#196;&#214;", strOut
    
    ' because VB6 does some conversions automatically with strings,
    '  we have to do the opposite to see the results...
    Debug.Print StrConv(strOut, vbUnicode)

  3. #3
    PowerPoster dilettante's Avatar
    Join Date
    Feb 2006
    Posts
    24,487

    Re: windows-1251 to utf-8

    Starting with ADO 2.5, there is a Stream object that can do conversions among various character encodings. The Stream object can handle data using two conversion types: binary and text.

    When Type = adTypeBinary you can move data into or out of the Stream literally using Read and Write methods with Byte arrays.

    When adTypeText you can use ReadText and WriteText methods with VB6 Strings. In this state the Stream can also use various Charset property settings, that cause ReadText to convert from and WriteText to convert to different character encodings within the Stream content.

    Thus:
    Code:
    stm.Type = adTypeText
    stm.Charset = "utf-8"
    stm.LineSeparator = adLF
    stm.WriteText "Abcdefg", adWriteLine
    Puts "Abcdefg" and a Line Feed into the Stream encoded as UTF-8. You could then move the Position property to 0 and change Type to adTypeBinary as:
    Code:
    stm.Position = 0
    stm.Type = adTypeBinary
    bytArray = stm.Read(adReadAll)
    Where bytArray is a dynamic Byte array.

    The result is that bytArray now contains your UTF-8 character stream, with a UTF-8 BOM. You can also skip over the BOM before calling the Read method. For UTF-8 this is 3 bytes:
    Code:
    stm.Position = 0
    stm.Type = adTypeBinary
    stm.Position = 3
    bytArray = stm.Read(adReadAll)
    The attached example loads a Stream with some UTF-8 Russian and converts it to a Byte array of KOI8-R characters, which it dumps to a RichTextBox in hex. Then it takes the KIO8-R Byte array and uses a Stream to convert it back to UTF-8 and saves the Stream contents back into a second file.

    Each conversion is done via an intermediate UTF-16 String value, because the Stream only converts to/from Windows Native (UTF-16) encoding and an alternative encoding. It does not convert directly between alternatives.

    The example can easily be modified to convert to/from Windows-1251 instead of KOI8-R.


    While you could byte-stuff a VB6 String to contain the 8-bit data this is a bad idea. Odd numbers of characters could result in a good deal of confusion and potential for errors.
    Attached Files Attached Files

  4. #4

    Thread Starter
    Addicted Member
    Join Date
    Mar 2006
    Posts
    186

    Re: windows-1251 to utf-8

    I've done some tests on this, and looks like i'm ready to dump MSXML parser and write my own to parse windows-1251 encoded XML. Here's what i've been doing:
    Code:
    XML_file = MSHFlexGrid1.TextMatrix(PageRow, 1)
    DoEvents
    XML_data = OpenURL(XML_file, 0)
    DoEvents
    
    XML_data = Replace(XML_data, Chr(13), "")
    XML_data = Replace(XML_data, Chr(10), "")
    XML_data = Replace(XML_data, vbCrLf, "")
    XML_data = Replace(XML_data, "\", "")
    XML_data = Replace(XML_data, "'", "\'")
    
    xmlDoc.loadXML XML_data
    myOK = "0"
    
    Set dNodeList = xmlDoc.selectNodes("//MASTER_PAD_VERSION_INFO")
    For Each dNode In dNodeList
        DoEvents
        If dNode.selectSingleNode("MASTER_PAD_VERSION").Text >= 2 Then myOK = "1"
    Next
    
    If myOK = "1" Then
    Set dNodeList = xmlDoc.selectNodes("//Company_Info")
    For Each dNode In dNodeList
        DoEvents
        MSHFlexGrid1.TextMatrix(PageRow, 2) = cleanSTR(dNode.selectSingleNode("Company_Name").Text)
        MSHFlexGrid1.TextMatrix(PageRow, 3) = dNode.selectSingleNode("Address_1").Text
        MSHFlexGrid1.TextMatrix(PageRow, 4) = dNode.selectSingleNode("Address_2").Text
        MSHFlexGrid1.TextMatrix(PageRow, 5) = dNode.selectSingleNode("City_Town").Text
        MSHFlexGrid1.TextMatrix(PageRow, 6) = dNode.selectSingleNode("State_Province").Text
        MSHFlexGrid1.TextMatrix(PageRow, 7) = dNode.selectSingleNode("Zip_Postal_Code").Text
        MSHFlexGrid1.TextMatrix(PageRow, 8) = dNode.selectSingleNode("Country").Text
        MSHFlexGrid1.TextMatrix(PageRow, 9) = dNode.selectSingleNode("Company_WebSite_URL").Text
    Next
    OpenURL() uses Winsock.bas to fetch the XML file, but nevermind that. It looks like MSXML parser automatically converts encoded text in nodes into utf-16: http://msdn.microsoft.com/en-us/library/aa468560.aspx
    Is there any way to set MSXML to convert to utf-8 instead of utf-16? Otherwise I guess will need a utf-16 to utf-8 encoding when i save text extracted from XML. Confusion

  5. #5

    Thread Starter
    Addicted Member
    Join Date
    Mar 2006
    Posts
    186

    Re: windows-1251 to utf-8

    Problem with char encoding was caused by Winsock.bas, as all got sorted out correctly when i downloaded XML with Inet control. Go figure! I close this topic, thank you all for help

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  



Click Here to Expand Forum to Full Width