This is just a simple core sample of how to open a XHTML file using MSXML. The code is wrapped to class module and the HTML file structure is pushed to a listbox.

I made this code only due to interest of trying out MSXML.DLL; though I probably won't code this thing any further, it seems MSXML is very slow for some reason (and it didn't load the full sample file I used using .load function, only the first line).

Hopefully this will be of some help for a person who wants to parse XML and/or XHTML and doesn't know how to get started with it.

Instructions:
  • Start a new project
  • Add a reference to Microsoft XML using Project > References
  • Add a new class module, rename it to clsXHTML and add the code below
  • Add a listbox to Form1 and paste the second code below


It should work from there. You also need to have some XHTML file on your harddisk to test.

Code:
' clsXHTML
Option Explicit

Private Const DOC_STRICT = "<!doctype html public ""-//w3c//dtd xhtml 1.0 strict//en"" ""http://www.w3.org/tr/xhtml1/dtd/xhtml1-strict.dtd"">"
Private Const DOC_TRANS = "<!doctype html public ""-//w3c//dtd xhtml 1.0 transitional//en"" ""http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd"">"
Private Const DOC_FRAME = "<!doctype html public ""-//w3c//dtd xhtml 1.0 frameset//en"" ""http://www.w3.org/tr/xhtml1/dtd/xhtml1-frameset.dtd"">"
Private Const DOC_11 = "<!doctype html public ""-//w3c//dtd xhtml 1.1//en"" ""http://www.w3.org/tr/xhtml11/dtd/xhtml11.dtd"">"

Public Enum XHTML_DOCTYPE
    XHTMLunvalid
    XHTML10Strict
    XHTML10Transitional
    XHTML10Frameset
    XHTML11
End Enum

Dim MainChild() As IXMLDOMNode
Dim MainChildCount As Integer
Dim XHTML As DOMDocument

Public Function Doctype() As XHTML_DOCTYPE
    Dim intA As Integer, strDoctype As String
    For intA = 0 To MainChildCount - 1
        If Left$(LCase$(MainChild(intA).xml), 9) = "<!doctype" Then
            strDoctype = MainChild(intA).xml
            Exit For
        End If
    Next intA
    If intA = MainChildCount Then Exit Function
    Select Case LCase$(strDoctype)
        Case DOC_STRICT
            Doctype = XHTML10Strict
        Case DOC_TRANS
            Doctype = XHTML10Transitional
        Case DOC_FRAME
            Doctype = XHTML10Frameset
        Case DOC_11
            Doctype = XHTML11
    End Select
End Function
Public Sub FileClose()
    Erase MainChild
    MainChildCount = 0
    Set XHTML = Nothing
End Sub
Public Function FileOpen(ByRef Filename As String) As Boolean
    Dim bytFile As Byte, strXML As String
    If LenB(Dir$(Filename)) = 0 Then Exit Function
    If LenB(Dir$) > 0 Then Exit Function
    bytFile = FreeFile
    Open Filename For Input As #bytFile
        strXML = input(FileLen(Filename), #bytFile)
    Close #bytFile
    Set XHTML = New DOMDocument
    FileOpen = XHTML.loadXML(strXML)
    If Not FileOpen Then FileClose
    GetMainChilds
End Function
Private Sub GetMainChilds()
    Dim Child As IXMLDOMNode, intCount As Integer, intA As Integer
    If XHTML Is Nothing Then Exit Sub
    If XHTML.firstChild Is Nothing Then
        Erase MainChild
        MainChildCount = 0
        Exit Sub
    End If
    If XHTML.firstChild Is XHTML.lastChild Then
        ReDim MainChild(0)
        Set MainChild(0) = XHTML.firstChild
        MainChildCount = 1
    Else
        Set Child = XHTML.firstChild.nextSibling
        Do Until Child Is Nothing
            intCount = intCount + 1
            Set Child = Child.nextSibling
        Loop
        ReDim MainChild(intCount)
        Set MainChild(0) = XHTML.firstChild
        For intA = 1 To intCount
            Set MainChild(intA) = MainChild(intA - 1).nextSibling
        Next intA
        MainChildCount = intCount + 1
    End If
End Sub
Public Sub ListElements(ByRef List As ListBox)
    Dim intA As Integer, HTML As IXMLDOMNode
    For intA = 0 To MainChildCount - 1
        If MainChild(intA).baseName = "html" And Left$(LCase$(MainChild(intA).xml), 9) <> "<!doctype" Then Exit For
    Next intA
    If intA = MainChildCount Then Exit Sub
    Set HTML = MainChild(intA)
    OutputChildNodes HTML, List
    Set HTML = Nothing
End Sub
Public Sub OutputChildNodes(ByRef ParentNode As IXMLDOMNode, ByRef List As ListBox, Optional ByVal Depth As Integer)
    Dim Child As IXMLDOMNode
    If LenB(ParentNode.baseName) > 0 Then
        If Depth > 0 Then
            List.AddItem Str$(Depth) & String$(Depth, vbTab) & ParentNode.baseName
        Else
            List.AddItem ParentNode.baseName
        End If
    ElseIf LenB(ParentNode.Text) > 0 Then
        If Depth > 0 Then
            List.AddItem Str$(Depth) & String$(Depth, vbTab) & ParentNode.Text
        Else
            List.AddItem ParentNode.Text
        End If
    Else
        If Depth > 0 Then
            List.AddItem Str$(Depth) & String$(Depth, vbTab) & "(I have no idea what this is!)"
        Else
            List.AddItem "(I have no idea what this is!)"
        End If
    End If
    If ParentNode.hasChildNodes Then
        Set Child = ParentNode.firstChild
        Do Until Child Is Nothing
            OutputChildNodes Child, List, Depth + 1
            Set Child = Child.nextSibling
        Loop
    End If
End Sub
Private Sub Class_Terminate()
    If Not XHTML Is Nothing Then FileClose
End Sub
VB Code:
  1. ' Form1
  2. Option Explicit
  3.  
  4. Private Sub Form_Load()
  5.     Dim XHTML As clsXHTML
  6.     Set XHTML = New clsXHTML
  7.    
  8.     XHTML.FileOpen "C:\test.html"
  9.     XHTML.ListElements List1
  10.     XHTML.FileClose
  11.    
  12.     Set XHTML = Nothing
  13. End Sub
  14.  
  15. Private Sub Form_Resize()
  16.     If Me.WindowState = vbMinimized Then Exit Sub
  17.     List1.Move 0, 0, Me.ScaleWidth, Me.ScaleHeight
  18. End Sub