Merri
Oct 14th, 2005, 11:51 PM
Yes and no. For yes, VB strings can hold 16-bit characters and is thus Unicode compatible. The problems come from several directions:
Strings passed to API are converted to ANSI and vice versa
Reading a file to a string is done by automatically converting ANSI to Unicode
The same applies when saving: string is converted to ANSI
Visual Basic controls are not Unicode aware
Luckily we have byte arrays to help us with the reading and writing business: they are easy to convert to strings and string contents can be copied to byte arrays easily with native VB code. Thus when working with Unicode, a byte array is your best bet.
Where are the controls?
The worst news is that VB6 controls use ANSI. The VB propertybags can't hold Unicode data and VB runtime also passes the data to the controls after ANSI conversion. Thus none of the default controls can be used for Unicode. The only free choice left is to code the controls by yourself or to seek for controls done by others on the internet. The easy solution costs money: people are selling Unicode aware controls and there is no free version available for many. Personally, I've made a UniLabel and a UniCommand and both are available for free (link (http://www.vbforums.com/showthread.php?p=3084062#post3084062)).
Switching codepages
This code is still a work in progress: it is a simple module that allows to switch between codepages. This includes UTF-8 conversions, which might be handy. The functions take in and return a byte array. This far I haven't found an error with the current code. Use GetACP to find out the default codepage the system has in use (this codepage is used when a file is loaded by VB into a string).
Option Explicit
Public Enum KnownCodePage
CP_UNKNOWN = -1
CP_ACP = 0
CP_OEMCP = 1
CP_MACCP = 2
CP_THREAD_ACP = 3
CP_SYMBOL = 42
' ARABIC
CP_AWIN = 101 ' Bidi Windows codepage
CP_709 = 102 ' MS-DOS Arabic Support CP 709
CP_720 = 103 ' MS-DOS Arabic Support CP 720
CP_A708 = 104 ' ASMO 708
CP_A449 = 105 ' ASMO 449+
CP_TARB = 106 ' MS Transparent Arabic
CP_NAE = 107 ' Nafitha Enhanced Arabic Char Set
CP_V4 = 108 ' Nafitha v 4.0
CP_MA2 = 109 ' Mussaed Al Arabi (MA/2) CP 786
CP_I864 = 110 ' IBM Arabic Supplement CP 864
CP_A437 = 111 ' Ansi 437 codepage
CP_AMAC = 112 ' Macintosh Code Page
' HEBREW
CP_HWIN = 201 ' Bidi Windows codepage
CP_862I = 202 ' IBM Hebrew Supplement CP 862
CP_7BIT = 203 ' IBM Hebrew Supplement CP 862 Folded
CP_ISO = 204 ' ISO Hebrew 8859-8 Character Set
CP_H437 = 205 ' Ansi 437 codepage
CP_HMAC = 206 ' Macintosh Code Page
' CODE PAGES
CP_OEM_437 = 437
CP_ARABICDOS = 708
CP_DOS720 = 720
CP_DOS737 = 737
CP_DOS775 = 775
CP_IBM850 = 850
CP_IBM852 = 852
CP_DOS861 = 861
CP_DOS862 = 862
CP_IBM866 = 866
CP_DOS869 = 869
CP_THAI = 874
CP_EBCDIC = 875
CP_JAPAN = 932
CP_CHINA = 936
CP_KOREA = 949
CP_TAIWAN = 950
' UNICODE
CP_UNICODELITTLE = 1200
CP_UNICODEBIG = 1201
' CODE PAGES
CP_EASTEUROPE = 1250
CP_RUSSIAN = 1251
CP_WESTEUROPE = 1252
CP_GREEK = 1253
CP_TURKISH = 1254
CP_HEBREW = 1255
CP_ARABIC = 1256
CP_BALTIC = 1257
CP_VIETNAMESE = 1258
' KOREAN
CP_JOHAB = 1361
' MAC
CP_MAC_ROMAN = 10000
CP_MAC_JAPAN = 10001
CP_MAC_ARABIC = 10004
CP_MAC_GREEK = 10006
CP_MAC_CYRILLIC = 10007
CP_MAC_LATIN2 = 10029
CP_MAC_TURKISH = 10081
' CODE PAGES
CP_CHINESECNS = 20000
CP_CHINESEETEN = 20002
CP_IA5WEST = 20105
CP_IA5GERMAN = 20106
CP_IA5SWEDISH = 20107
CP_IA5NORWEGIAN = 20108
CP_ASCII = 20127
CP_RUSSIANKOI8R = 20866
CP_RUSSIANKOI8U = 21866
CP_ISOLATIN1 = 28591
CP_ISOEASTEUROPE = 28592
CP_ISOTURKISH = 28593
CP_ISOBALTIC = 28594
CP_ISORUSSIAN = 28595
CP_ISOARABIC = 28596
CP_ISOGREEK = 28597
CP_ISOHEBREW = 28598
CP_ISOTURKISH2 = 28599
CP_ISOLATIN9 = 28605
CP_HEBREWLOG = 38598
CP_USER = 50000
CP_AUTOALL = 50001
CP_JAPANNHK = 50220
CP_JAPANESC = 50221
CP_JAPANISO = 50222
CP_KOREAISO = 50225
CP_TAIWANISO = 50227
CP_CHINAISO = 50229
CP_AUTOJAPAN = 50932
CP_AUTOCHINA = 50936
CP_AUTOKOREA = 50949
CP_AUTOTAIWAN = 50950
CP_AUTORUSSIAN = 51251
CP_AUTOGREEK = 51253
CP_AUTOARABIC = 51256
CP_JAPANEUC = 51932
CP_CHINAEUC = 51936
CP_KOREAEUC = 51949
CP_TAIWANEUC = 51950
CP_CHINAHZ = 52936
CP_GB18030 = 54936
' UNICODE
CP_UTF7 = 65000
CP_UTF8 = 65001
End Enum
' Flags
Public Const MB_PRECOMPOSED = &H1
Public Const MB_COMPOSITE = &H2
Public Const MB_USEGLYPHCHARS = &H4
Public Const MB_ERR_INVALID_CHARS = &H8
Public Const WC_DEFAULTCHECK = &H100 ' check for default char
Public Const WC_COMPOSITECHECK = &H200 ' convert composite to precomposed
Public Const WC_DISCARDNS = &H10 ' discard non-spacing chars
Public Const WC_SEPCHARS = &H20 ' generate separate chars
Public Const WC_DEFAULTCHAR = &H40 ' replace with default char
Public Declare Function GetACP Lib "kernel32" () As Long
Private Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, _
ByVal dwFlags As Long, ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, _
ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Private Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, _
ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, _
ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, ByVal lpDefaultChar As Long, _
lpUsedDefaultChar As Long) As Long
Public Function ANSItoUTF16(ByRef Text() As Byte, Optional ByVal cPage As KnownCodePage = CP_UNKNOWN, _
Optional lFlags As Long) As Byte()
Static tmpArr() As Byte, textStr As String
Dim tmpLen As Long, textLen As Long, A As Long
If (Not Text) = True Then Exit Function
' set code page to a valid one
If cPage = CP_UNKNOWN Then cPage = GetACP
If cPage = CP_ACP Or cPage = CP_WESTEUROPE Then
textLen = UBound(Text)
tmpLen = textLen + textLen + 1
If (Not tmpArr) = True Then ReDim Preserve tmpArr(tmpLen)
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
For A = 0 To UBound(Text)
tmpArr(A + A) = Text(A)
Next A
Else
textStr = CStr(Text) & "|"
textLen = LenB(textStr)
tmpLen = textLen + textLen
ReDim Preserve tmpArr(tmpLen + 1)
' get the new string to tmpArr
tmpLen = MultiByteToWideChar(CLng(cPage), lFlags, ByVal StrPtr(textStr), -1, _
ByVal VarPtr(tmpArr(0)), tmpLen)
If tmpLen = 0 Then Exit Function
tmpLen = tmpLen + tmpLen - 5
'If tmpArr(tmpLen - 1) = 0 And tmpArr(tmpLen) = 0 Then tmpLen = tmpLen - 2
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
End If
' return the result
ANSItoUTF16 = tmpArr
End Function
Public Function UTF16toANSI(ByRef Text() As Byte, Optional ByVal cPage As KnownCodePage = CP_UNKNOWN, _
Optional lFlags As Long) As Byte()
Static tmpArr() As Byte
Dim tmpLen As Long, textLen As Long, A As Long
If (Not Text) = True Then Exit Function
' set code page to a valid one
If cPage = CP_UNKNOWN Then cPage = GetACP
If cPage = CP_ACP Or cPage = CP_WESTEUROPE Then
textLen = UBound(Text)
tmpLen = (textLen + 1) \ 2 - 1
If (Not tmpArr) = True Then ReDim Preserve tmpArr(tmpLen)
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
For A = 0 To tmpLen
tmpArr(A) = Text(A + A)
Next A
Else
textLen = (UBound(Text) + 1) \ 2
' at maximum ANSI can be four bytes per character in new Chinese encoding GB18030–2000
tmpLen = textLen + textLen + textLen + textLen + 1
ReDim Preserve tmpArr(tmpLen - 1)
' get the new string to tmpArr
tmpLen = WideCharToMultiByte(CLng(cPage), lFlags, ByVal VarPtr(Text(0)), textLen, ByVal VarPtr(tmpArr(0)), _
tmpLen, ByVal 0&, ByVal 0&)
If tmpLen = 0 Then Exit Function
' a hopeless try to correct a weird error?
ReDim Preserve tmpArr(tmpLen - 1)
End If
' return the result
UTF16toANSI = tmpArr
End Function
Strings passed to API are converted to ANSI and vice versa
Reading a file to a string is done by automatically converting ANSI to Unicode
The same applies when saving: string is converted to ANSI
Visual Basic controls are not Unicode aware
Luckily we have byte arrays to help us with the reading and writing business: they are easy to convert to strings and string contents can be copied to byte arrays easily with native VB code. Thus when working with Unicode, a byte array is your best bet.
Where are the controls?
The worst news is that VB6 controls use ANSI. The VB propertybags can't hold Unicode data and VB runtime also passes the data to the controls after ANSI conversion. Thus none of the default controls can be used for Unicode. The only free choice left is to code the controls by yourself or to seek for controls done by others on the internet. The easy solution costs money: people are selling Unicode aware controls and there is no free version available for many. Personally, I've made a UniLabel and a UniCommand and both are available for free (link (http://www.vbforums.com/showthread.php?p=3084062#post3084062)).
Switching codepages
This code is still a work in progress: it is a simple module that allows to switch between codepages. This includes UTF-8 conversions, which might be handy. The functions take in and return a byte array. This far I haven't found an error with the current code. Use GetACP to find out the default codepage the system has in use (this codepage is used when a file is loaded by VB into a string).
Option Explicit
Public Enum KnownCodePage
CP_UNKNOWN = -1
CP_ACP = 0
CP_OEMCP = 1
CP_MACCP = 2
CP_THREAD_ACP = 3
CP_SYMBOL = 42
' ARABIC
CP_AWIN = 101 ' Bidi Windows codepage
CP_709 = 102 ' MS-DOS Arabic Support CP 709
CP_720 = 103 ' MS-DOS Arabic Support CP 720
CP_A708 = 104 ' ASMO 708
CP_A449 = 105 ' ASMO 449+
CP_TARB = 106 ' MS Transparent Arabic
CP_NAE = 107 ' Nafitha Enhanced Arabic Char Set
CP_V4 = 108 ' Nafitha v 4.0
CP_MA2 = 109 ' Mussaed Al Arabi (MA/2) CP 786
CP_I864 = 110 ' IBM Arabic Supplement CP 864
CP_A437 = 111 ' Ansi 437 codepage
CP_AMAC = 112 ' Macintosh Code Page
' HEBREW
CP_HWIN = 201 ' Bidi Windows codepage
CP_862I = 202 ' IBM Hebrew Supplement CP 862
CP_7BIT = 203 ' IBM Hebrew Supplement CP 862 Folded
CP_ISO = 204 ' ISO Hebrew 8859-8 Character Set
CP_H437 = 205 ' Ansi 437 codepage
CP_HMAC = 206 ' Macintosh Code Page
' CODE PAGES
CP_OEM_437 = 437
CP_ARABICDOS = 708
CP_DOS720 = 720
CP_DOS737 = 737
CP_DOS775 = 775
CP_IBM850 = 850
CP_IBM852 = 852
CP_DOS861 = 861
CP_DOS862 = 862
CP_IBM866 = 866
CP_DOS869 = 869
CP_THAI = 874
CP_EBCDIC = 875
CP_JAPAN = 932
CP_CHINA = 936
CP_KOREA = 949
CP_TAIWAN = 950
' UNICODE
CP_UNICODELITTLE = 1200
CP_UNICODEBIG = 1201
' CODE PAGES
CP_EASTEUROPE = 1250
CP_RUSSIAN = 1251
CP_WESTEUROPE = 1252
CP_GREEK = 1253
CP_TURKISH = 1254
CP_HEBREW = 1255
CP_ARABIC = 1256
CP_BALTIC = 1257
CP_VIETNAMESE = 1258
' KOREAN
CP_JOHAB = 1361
' MAC
CP_MAC_ROMAN = 10000
CP_MAC_JAPAN = 10001
CP_MAC_ARABIC = 10004
CP_MAC_GREEK = 10006
CP_MAC_CYRILLIC = 10007
CP_MAC_LATIN2 = 10029
CP_MAC_TURKISH = 10081
' CODE PAGES
CP_CHINESECNS = 20000
CP_CHINESEETEN = 20002
CP_IA5WEST = 20105
CP_IA5GERMAN = 20106
CP_IA5SWEDISH = 20107
CP_IA5NORWEGIAN = 20108
CP_ASCII = 20127
CP_RUSSIANKOI8R = 20866
CP_RUSSIANKOI8U = 21866
CP_ISOLATIN1 = 28591
CP_ISOEASTEUROPE = 28592
CP_ISOTURKISH = 28593
CP_ISOBALTIC = 28594
CP_ISORUSSIAN = 28595
CP_ISOARABIC = 28596
CP_ISOGREEK = 28597
CP_ISOHEBREW = 28598
CP_ISOTURKISH2 = 28599
CP_ISOLATIN9 = 28605
CP_HEBREWLOG = 38598
CP_USER = 50000
CP_AUTOALL = 50001
CP_JAPANNHK = 50220
CP_JAPANESC = 50221
CP_JAPANISO = 50222
CP_KOREAISO = 50225
CP_TAIWANISO = 50227
CP_CHINAISO = 50229
CP_AUTOJAPAN = 50932
CP_AUTOCHINA = 50936
CP_AUTOKOREA = 50949
CP_AUTOTAIWAN = 50950
CP_AUTORUSSIAN = 51251
CP_AUTOGREEK = 51253
CP_AUTOARABIC = 51256
CP_JAPANEUC = 51932
CP_CHINAEUC = 51936
CP_KOREAEUC = 51949
CP_TAIWANEUC = 51950
CP_CHINAHZ = 52936
CP_GB18030 = 54936
' UNICODE
CP_UTF7 = 65000
CP_UTF8 = 65001
End Enum
' Flags
Public Const MB_PRECOMPOSED = &H1
Public Const MB_COMPOSITE = &H2
Public Const MB_USEGLYPHCHARS = &H4
Public Const MB_ERR_INVALID_CHARS = &H8
Public Const WC_DEFAULTCHECK = &H100 ' check for default char
Public Const WC_COMPOSITECHECK = &H200 ' convert composite to precomposed
Public Const WC_DISCARDNS = &H10 ' discard non-spacing chars
Public Const WC_SEPCHARS = &H20 ' generate separate chars
Public Const WC_DEFAULTCHAR = &H40 ' replace with default char
Public Declare Function GetACP Lib "kernel32" () As Long
Private Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, _
ByVal dwFlags As Long, ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, _
ByVal lpWideCharStr As Long, ByVal cchWideChar As Long) As Long
Private Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, _
ByVal dwFlags As Long, ByVal lpWideCharStr As Long, ByVal cchWideChar As Long, _
ByVal lpMultiByteStr As Long, ByVal cchMultiByte As Long, ByVal lpDefaultChar As Long, _
lpUsedDefaultChar As Long) As Long
Public Function ANSItoUTF16(ByRef Text() As Byte, Optional ByVal cPage As KnownCodePage = CP_UNKNOWN, _
Optional lFlags As Long) As Byte()
Static tmpArr() As Byte, textStr As String
Dim tmpLen As Long, textLen As Long, A As Long
If (Not Text) = True Then Exit Function
' set code page to a valid one
If cPage = CP_UNKNOWN Then cPage = GetACP
If cPage = CP_ACP Or cPage = CP_WESTEUROPE Then
textLen = UBound(Text)
tmpLen = textLen + textLen + 1
If (Not tmpArr) = True Then ReDim Preserve tmpArr(tmpLen)
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
For A = 0 To UBound(Text)
tmpArr(A + A) = Text(A)
Next A
Else
textStr = CStr(Text) & "|"
textLen = LenB(textStr)
tmpLen = textLen + textLen
ReDim Preserve tmpArr(tmpLen + 1)
' get the new string to tmpArr
tmpLen = MultiByteToWideChar(CLng(cPage), lFlags, ByVal StrPtr(textStr), -1, _
ByVal VarPtr(tmpArr(0)), tmpLen)
If tmpLen = 0 Then Exit Function
tmpLen = tmpLen + tmpLen - 5
'If tmpArr(tmpLen - 1) = 0 And tmpArr(tmpLen) = 0 Then tmpLen = tmpLen - 2
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
End If
' return the result
ANSItoUTF16 = tmpArr
End Function
Public Function UTF16toANSI(ByRef Text() As Byte, Optional ByVal cPage As KnownCodePage = CP_UNKNOWN, _
Optional lFlags As Long) As Byte()
Static tmpArr() As Byte
Dim tmpLen As Long, textLen As Long, A As Long
If (Not Text) = True Then Exit Function
' set code page to a valid one
If cPage = CP_UNKNOWN Then cPage = GetACP
If cPage = CP_ACP Or cPage = CP_WESTEUROPE Then
textLen = UBound(Text)
tmpLen = (textLen + 1) \ 2 - 1
If (Not tmpArr) = True Then ReDim Preserve tmpArr(tmpLen)
If UBound(tmpArr) <> tmpLen Then ReDim Preserve tmpArr(tmpLen)
For A = 0 To tmpLen
tmpArr(A) = Text(A + A)
Next A
Else
textLen = (UBound(Text) + 1) \ 2
' at maximum ANSI can be four bytes per character in new Chinese encoding GB18030–2000
tmpLen = textLen + textLen + textLen + textLen + 1
ReDim Preserve tmpArr(tmpLen - 1)
' get the new string to tmpArr
tmpLen = WideCharToMultiByte(CLng(cPage), lFlags, ByVal VarPtr(Text(0)), textLen, ByVal VarPtr(tmpArr(0)), _
tmpLen, ByVal 0&, ByVal 0&)
If tmpLen = 0 Then Exit Function
' a hopeless try to correct a weird error?
ReDim Preserve tmpArr(tmpLen - 1)
End If
' return the result
UTF16toANSI = tmpArr
End Function