What would be the best way to parse html into elements?
Printable View
What would be the best way to parse html into elements?
What does this have to do with maths? Wrong forum I think...
Parsing html would be an algorithm.Quote:
Maths Forum (3 Viewing)
By popular request, a place for you to discuss Maths of all forms. Somewhere to think about algorithms and the applications of maths to programming too.
Anyways I was thinking about reading every character 1 by 1 to get it into 3 pieces.
whitespaces/Tags(NonParsed)/Text
Then parse it into how the xml parser is. Then possibly how javascript dom is.
What you highlighted is just part of the sentence, and while it isn't as clear as it could be, actually means "algorithms of maths" rather than general algorithms (which could be applied to a large percentage of threads on this site).
This isn't really related to maths, so thread moved to 'VB6 and Earlier' forum.
To clarify, this is VB6, right?
The easiest way would probably be using the HTML object library which allows you to loop through a document and treat it as an object.
MarkT has posted a few samples of doing this. Here is one:
http://www.vbforums.com/showthread.php?t=519339
It will add a dependency to your project, but is worth it. If you want to do it with pure VB6 code, it would be a combination of the string manipulation functions (InStr, Mid$(), etc.).
Also, what exactly is it you are trying to do with the HTML file? Giving some examples of what you want to do would help.
Well if this shouldn't be in the math area can you plop it in the C# section?
Thread moved to C#
So this is what I have right now.
Thats how its going to be structured.Code:public enum HtmlTypes
{
None = 0,
Text = 1,
WhiteSpace = 2,
Tag = 3,
EndTag = 4
}
public struct HtmlNode
{
int NodeType;
string NodeText;
}
List<HtmlNode> Nodes = new List<HtmlNode>();
I was thinking the most efficient way to read the html char by char.
I'll post some code of the loop later when I get it working and see if someone can help me improve it.
Thats what I have so far. Going to take all the nodes and break them down into elements.Code:public class Html_Parser
{
public enum ParseTypes
{
None = 0,
Text = 1,
WhiteSpace = 2,
Tag = 3,
EndTag = 4
}
public struct HtmlNode
{
public ParseTypes NodeType;
public string NodeText;
public HtmlNode(ParseTypes type, string text)
{
NodeType = type;
NodeText = text;
}
}
public List<HtmlNode> Nodes = new List<HtmlNode>();
private ParseTypes GetType(char ch)
{
if (ch == ' ' || ch == '\r' || ch == '\n')
{
return ParseTypes.WhiteSpace;
}
else if (ch == '<')
{
return ParseTypes.Tag;
}
else if (ch == '>')
{
return ParseTypes.EndTag;
}
return ParseTypes.Text;
}
private void AddNode(ParseTypes type, string text)
{
Nodes.Add(new HtmlNode(type, text));
}
public int LoadHtml(string html)
{
Nodes.Clear();
char[] chars = html.ToCharArray();
string t = "";
ParseTypes CurType = ParseTypes.None;
char CurChar = 'a';
for (int i = 0; i < chars.Length; i++)
{
CurChar = chars[i];
if (CurType == ParseTypes.None)
{
CurType = GetType(CurChar);
t = "";
t += CurChar;
}
else
{
switch (GetType(CurChar))
{
case (ParseTypes.WhiteSpace):
if (CurType != ParseTypes.WhiteSpace)
{
AddNode(CurType, t);
CurType = ParseTypes.WhiteSpace;
t = "";
t += CurChar;
}
else
{
t += CurChar;
}
break;
case (ParseTypes.Text):
if (CurType == ParseTypes.Tag)
{
t += CurChar;
}
else if (CurType != ParseTypes.Text)
{
AddNode(CurType, t);
CurType = ParseTypes.Text;
t = "";
t += CurChar;
}
else
{
t += CurChar;
}
break;
case (ParseTypes.Tag):
if (CurType != ParseTypes.Tag)
{
AddNode(CurType, t);
CurType = ParseTypes.Tag;
t = "";
t += CurChar;
}
else
{
t += CurChar;
}
break;
case(ParseTypes.EndTag):
if (CurType == ParseTypes.Tag)
{
t += CurChar;
AddNode(CurType, t);
CurType = ParseTypes.None;
}
else
{
CurType = ParseTypes.None;
}
break;
}
}
}
return 0;
}
}
Anything wrong with it or something I can do to improve it?
I now have all the tags separated but now I need to to link the start/end of a tag.
I was thinking of doing a Stack were when it finds a start it adds it to the stack, when it finds an end it takes 1 off the stack and pairs them.
Thats the most efficient way I can think of.
Here is the class so far (going to add more and neaten up)
Works very nicely XD.Code:using System;
using System.Collections.Generic;
using System.Text;
public class Html_Parser
{
protected string _version = "1.0";
string[] TagExceptions = new string[] { "meta","br","nobr" };
public string Version
{
get { return _version; }
}
public char[] HTML;
public enum ParseTypes
{
None = 0,
Text = 1,
Tag = 2,
EndTag = 3,
FullTag = 4
}
public struct HtmlNode
{
public ParseTypes NodeType;
public string NodeText;
public HtmlNode(ParseTypes type, string text)
{
NodeType = type;
NodeText = text;
}
}
public class attribute
{
public string Name = "";
public string Value = "";
public attribute(string n, string v)
{
Name = n;
Value = v;
}
}
public class Element
{
public char[] HTML;
public Element Parent = null;
public List<Element> Children = new List<Element>();
int innerstart = -1;
int innerend = -1;
public void innerLoc(int start,int end)
{
if (start != -1)
innerstart = start;
if (end != -1)
innerend = end;
}
protected string _innerHTML;
public string innerHTML
{
get
{
if (innerstart == -1 || innerend == -1)
return "";
return (new string(HTML)).Substring(innerstart, innerend - innerstart);
}
}
public string tagName = "";
public List<attribute> attributes = new List<attribute>();
public string getAttribute(string name)
{
for (int i = 0; i < attributes.Count; i++)
{
if (name == attributes[i].Name)
{
return attributes[i].Value;
}
}
return "";
}
public bool StartAndEndTag = true;
public bool Linked = false; //Has an endtag?
}
public List<HtmlNode> Nodes = new List<HtmlNode>();
public List<Element> Elements = new List<Element>();
private ParseTypes GetType(char ch)
{
if (ch == '<')
{
return ParseTypes.Tag;
}
return ParseTypes.Text;
}
private void AddNode(ParseTypes type, string text)
{
Nodes.Add(new HtmlNode(type, text));
}
int NextNonWhiteSpace(char[] chars,int idx)
{
while (idx < chars.Length && (chars[idx] == ' ' || chars[idx] == '\n' || chars[idx] == '\r' || chars[idx] == '\t'))
{
idx++;
}
return idx;
}
bool WhiteSpace(char chr)
{
if (chr == ' ' || chr == '\n' || chr == '\t' || chr == '\r')
{
return true;
}
return false;
}
int GetPrevStartTag(int idx)
{
while (idx > -1 && (Elements[idx].StartAndEndTag == false || Elements[idx].Linked == true))
{
idx--;
}
return idx;
}
List<int> ParseTags()
{
List<int> ret = new List<int>();
Element Cur = null;
Element Peek = null;
HtmlNode CurNode;
int q = 0;
int x = 0;
string t = "";
string n = "";
string v = "";
char del = 'a';
int idx = -1;
for (int i = 0; i < Nodes.Count; i++)
{
CurNode = Nodes[i];
if (CurNode.NodeType == ParseTypes.Tag)
{
q += CurNode.NodeText.Length;
Cur = new Element();
Cur.HTML = HTML;
char[] chars = Nodes[i].NodeText.ToCharArray();
x = 1;
t = "";
while (!WhiteSpace(chars[x]) && chars[x] != '>')
{
t+= chars[x];
x++;
}
Cur.tagName = t;
while (x < chars.Length)
{
n = "";
v = "";
x = NextNonWhiteSpace(chars, x);
if (x >= chars.Length)
break;
if (chars[x] == '/' || chars[x] == '>')
{
x++;
continue;
}
while (x < chars.Length && chars[x] != '=')
{
n += chars[x];
x++;
}
x++;
if (x > chars.Length && n != "")
{
Cur.attributes.Add(new attribute(n, ""));
break;
}
if (chars[x] == '\'' || chars[x] == '\"')
{
del = chars[x];
x++;
while (x < chars.Length && chars[x] != del)
{
v += chars[x];
x++;
}
x++;
}
else
{
while (x < chars.Length && !WhiteSpace(chars[x]))
{
v += chars[x];
x++;
}
}
Cur.attributes.Add(new attribute(n, v));
}
for (int z = 0; z < TagExceptions.Length; z++)
{
if (Cur.tagName.ToLower() == TagExceptions[z].ToLower())
Cur.StartAndEndTag = false;
}
Cur.innerLoc(q, -1);
Elements.Add(Cur);
idx = Elements.Count - 1;
}
else if (CurNode.NodeType == ParseTypes.EndTag)
{
if (Elements.Count < 1)
{
ret.Add(1);
}; //Found EndTag Before StartTag would be caused from </test>
idx = GetPrevStartTag(idx);
if (idx < 0)
{
ret.Add(3);
}; //Errors finding starttag Would be caused from <test></test></test2>
Peek = Elements[idx];
t = "";
x = 0;
char[] chrs = CurNode.NodeText.ToCharArray();
while (chrs[x] != '/')
{
x++;
}
x++;
x = NextNonWhiteSpace(chrs, x);
while (!WhiteSpace(chrs[x]) && chrs[x] != '>')
{
t += chrs[x];
x++;
}
if (Peek.tagName != t)
{
ret.Add(2);
//Error, example of how to get <test><test2></test></test2>
}
Cur.innerLoc(-1, q);
for (int z = idx; z < Elements.Count-1; z++)
{
Cur.Children.Add(Elements[z]);
}
Cur.Linked = true;
q += CurNode.NodeText.Length;
idx = GetPrevStartTag(idx);
}
else if (CurNode.NodeType == ParseTypes.FullTag)
{
Cur = new Element();
Cur.HTML = HTML;
Cur.StartAndEndTag = false;
char[] chars = Nodes[i].NodeText.ToCharArray();
x = 1;
t = "";
while (!WhiteSpace(chars[x]) && chars[x] != '>')
{
t += chars[x];
x++;
}
Cur.tagName = t;
while (x < chars.Length)
{
x = NextNonWhiteSpace(chars, x);
if (x >= chars.Length)
break;
if (chars[x] == '/' || chars[x] == '>')
{
x++;
continue;
}
n = "";
while (chars[x] != '=')
{
n += chars[x];
x++;
}
x++;
v = "";
if (chars[x] == '\'' || chars[x] == '\"')
{
del = chars[x];
x++;
while (x < chars.Length && chars[x] != del)
{
v += chars[x];
x++;
}
x++;
}
else
{
while (x < chars.Length && !WhiteSpace(chars[x]))
{
v += chars[x];
x++;
}
}
Cur.attributes.Add(new attribute(n, v));
}
Cur.Linked = true;
q += CurNode.NodeText.Length;
Elements.Add(Cur);
}
else if (CurNode.NodeType == ParseTypes.Text)
{
q += CurNode.NodeText.Length;
}
}
return ret;
}
}
Loaded google.com into it and only got 6 errors(all errorcode 2 from bad html)
Also what are the tags called that are like <br/>? Where it doesn't have an end tag.
also LoadHtml() is in the post below. Code is too long XD.
Code:public int LoadHtml(string html)
{
HTML = html.ToCharArray();
Nodes.Clear();
char[] chars = HTML;
string t = "";
ParseTypes CurType = ParseTypes.None;
char CurChar = 'a';
for (int i = 0; i < chars.Length; i++)
{
CurChar = chars[i];
if (CurChar == '<')
{
if (CurType != ParseTypes.None)
{
AddNode(CurType, t);
t = "";
}
CurType = ParseTypes.Tag;
t = "";
while (chars[i] != '>' && i < chars.Length)
{
t += chars[i];
i++;
}
t += chars[i];
if (t.Substring(1).StartsWith("/"))
{
CurType = ParseTypes.EndTag;
}
else if (t.Substring(0, t.Length - 1).EndsWith("/"))
{
CurType = ParseTypes.FullTag;
}
AddNode(CurType, t);
t = "";
CurType = ParseTypes.None;
continue;
}
else
{
if (CurType == ParseTypes.None) CurType = ParseTypes.Text;
t += CurChar;
}
}
if (CurType != ParseTypes.None)
{
AddNode(CurType,t);
}
List<int> ret = ParseTags();
return 0;
}