The trouble with my code ?

  • Thread starter Thread starter Just Me
  • Start date Start date
J

Just Me

Any ideas on this? I am trying to loop through an XML document to remove
attributes, but I'm having so much trouble. Any help is appreciated.

//THIS IS THE EXCEPTION ( SEE CODE LINE WHERE FAILURE OCCURS

'//Unexpected XML declaration. The XML declaration must be the first node in
the document, and no white space characters are allowed to appear before it.
Line 13, position 11.

//THE XHTML TEXT WHICH IS BEING LOOOKED AT

<table cellspacing="0" rules="all" border="1" id="dgArticles"
style="font-family:Arial;font-size:8pt;width:762px;border-collapse:collapse;">
<tr style="color:White;background-color:Blue;">
<td>&nbsp;</td><td style="width:0.75cm;">ID</td><td
style="width:7cm;">Title</td><td style="width:13cm;">Summary</td><td
style="width:1cm;">Published</td>
</tr><tr valign="Top">
<td><a href='Articles/Art226/Art226.html'
target=_blank>Open</a></td><td>226</td><td>SQL Server 2005
Permissions</td><td>See this article for a handy reference to the complete
list of permissons on SQL Server 2005 </td><td>28/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art223/Art223.html'
target=_blank>Open</a></td><td>223</td><td>SQL Schemas In SQL
2005</td><td>Want to know a little more about schemas in SQL Server 2005,
take a look at this quick overview. </td><td>25/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art224/Art224.html'
target=_blank>Open</a></td><td>224</td><td>SQL Server 2005 - Must_Change
option</td><td>When de-checking Enforce Password Policy, SQL Security
responds with an error and refers to Must_Change being in force. This
article shows you how to reverse this. </td><td>27/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art220/Art220.html'
target=_blank>Open</a></td><td>220</td><td>Installing Adventureworks
Sample</td><td>If you dont install the samples for Adventureworks first
time, getting them on can be a little tricky. This article explains.
</td><td>23/12/2006</td>
</tr>
</table>

'// THE CODE WHICH PROCESSES THE xhtml



' Click handler: cleans up the HTML held in sourcetextBox with a series of
' regex replacements, loads the result into an XmlDocument, then walks the same
' stream with an XmlTextReader attempting to strip attributes, and finally
' copies the stream's text into targetTextBox.
Private Sub useXmlDocButton_Click(ByVal sender As System.Object, ByVal e As
System.EventArgs) Handles useXmlDocButton.Click

' Request a garbage collection before doing any work.
GC.Collect()

' Clear any previous message.

Me.messageTextBox.Text = String.Empty

Dim xmlString As String

' Pre-processing: lower-case the whole input. This affects element content and
' attribute values as well as tag names.

xmlString = Me.sourcetextBox.Text.ToLower

' Remove the literal text "&nbsp;".

xmlString = Regex.Replace(xmlString, "&nbsp;", "")

' Remove any remaining "&name;" style entity references.

xmlString = Regex.Replace(xmlString, "&[a-zA-Z0-9]*;", "")

' Remove an unquoted attribute (name=value) that sits directly before the
' closing ">" of a tag; only a leading space character is matched.

xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]*>", ">")

' Remove an unquoted attribute that is followed by a space. The replacement is
' the empty string, so both the leading and trailing space are removed as well.

xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]* ", "")

' Finally prepend an XML declaration (followed by one space) to the text.

xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString

' The processed text, declaration included, is written back to the source box.
Me.sourcetextBox.Text = xmlString

' Copy the XML into an in-memory stream as UTF-8 bytes.
' NOTE(review): the count passed to Write is xmlString.Length (characters),
' while the array holds UTF-8 bytes; the two differ when the text contains
' non-ASCII characters - confirm.

Dim stream As New System.IO.MemoryStream

stream.Write((New System.Text.UTF8Encoding).GetBytes(xmlString), 0,
xmlString.Length)

' Rewind so the document can be loaded from the start of the stream.
stream.Position = 0

Dim xDoc As New System.Xml.XmlDocument

' Parse the stream contents into xDoc.
xDoc.Load(stream)

stream.Position = 0

' A second, independent reader over the same stream.
Dim xreader As New System.Xml.XmlTextReader(stream)

Dim xNode As System.Xml.XmlNode

stream.Position = 0

' Walk the reader; each element found is materialised as an XmlNode and has its
' attributes cleared.
While xreader.Read()

If xreader.NodeType = Xml.XmlNodeType.Element Then

' The node returned by ReadNode is never inserted into xDoc by this code.
' NOTE(review): ReadNode itself advances the reader, so the next Read() call
' moves past the node the reader stopped on - confirm this is intended.
xNode = xDoc.ReadNode(xreader) '//************* THIS IS WHERE IT FAILS //

xNode.Attributes.RemoveAll()

End If

End While



' Read the stream back from the start. Nothing above writes to the stream after
' the initial Write call, so this yields the text that was written then.
Dim sr As New System.IO.StreamReader(stream)

stream.Position = 0

targetTextBox.Text = sr.ReadToEnd

' Release the readers and the stream.
sr.Close()

sr.Dispose()

xreader.Close()

stream.Close()

stream.Dispose()
 
"Just Me" <news.microsoft.com> wrote in message
: Any ideas on this. I am trying to loop through an xml document to
: remove attributes, but Im having so much trouble, any help is
: appreciated
:
: //THIS IS THE EXCEPTION ( SEE CODE LINE WHERE FAILURE OCCURS
:
: '//Unexpected XML declaration. The XML declaration must be the first
: node in the document, and no white space characters are allowed to
: appear before it. Line 13, position 11.
:
: //THE XHTML TEXT WHICH IS BEING LOOOKED AT
:
: <table cellspacing="0" rules="all" border="1" id="dgArticles"
: style="font-family:Arial;font-size:8pt;width:762px;border-collapse
: :collapse;">
: <tr style="color:White;background-color:Blue;">
: <td>&nbsp;</td><td style="width:0.75cm;">ID</td><td
: style="width:7cm;">Title</td><td style="width:13cm;">Summary</td><td
: style="width:1cm;">Published</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art226/Art226.html'
: target=_blank>Open</a></td><td>226</td><td>SQL Server 2005
: Permissions</td><td>See this article for a handy reference to the
: complete list of permissons on SQL Server 2005
: </td><td>28/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art223/Art223.html'
: target=_blank>Open</a></td><td>223</td><td>SQL Schemas In SQL
: 2005</td><td>Want to know a little more about schemas in SQL Server
: 2005, take a look at this quick overview.
: </td><td>25/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art224/Art224.html'
: target=_blank>Open</a></td><td>224</td><td>SQL Server 2005 -
: Must_Change option</td><td>When de-checking Enforce Password Policy,
: SQL Security responds with an error and refers to Must_Change being
: in force. This article shows you how to reverse this.
: </td><td>27/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art220/Art220.html'
: target=_blank>Open</a></td><td>220</td><td>Installing Adventureworks
: Sample</td><td>If you dont install the samples for Adventureworks
: first time, getting them on can be a little tricky. This article
: explains.
: </td><td>23/12/2006</td>
: </tr>
: </table>
:
: '// THE CODE WHICH PROCESSES THE xhtml
:
:
:
: Private Sub useXmlDocButton_Click(ByVal sender As System.Object,
: ByVal e As System.EventArgs) Handles useXmlDocButton.Click
:
: GC.Collect()
:
: 'Clear message
:
: Me.messageTextBox.Text = String.Empty
:
: Dim xmlString As String
:
: '//Some pre-processing here
:
: xmlString = Me.sourcetextBox.Text.ToLower
:
: '//Remove nbsp
:
: xmlString = Regex.Replace(xmlString, "&nbsp;", "")
:
: '//Remove any explorer codes
:
: xmlString = Regex.Replace(xmlString, "&[a-zA-Z0-9]*;", "")
:
: '//Remove any unquoted attributes which appear at the end of a tag
:
: xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]*>",
: ">")
:
: '//Remove any unquoted attributes which before end of tag
:
: xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]* ",
: "")
:
: 'Finally prepend the cml declaration needed
:
: xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString
:
: Me.sourcetextBox.Text = xmlString
:
: 'Get the xml into a stream
:
: Dim stream As New System.IO.MemoryStream
:
: stream.Write((New System.Text.UTF8Encoding).GetBytes(xmlString), 0,
: xmlString.Length)
:
: stream.Position = 0
:
: Dim xDoc As New System.Xml.XmlDocument
:
: xDoc.Load(stream)
:
: stream.Position = 0
:
: Dim xreader As New System.Xml.XmlTextReader(stream)
:
: Dim xNode As System.Xml.XmlNode
:
: stream.Position = 0
:
: While xreader.Read()
:
: If xreader.NodeType = Xml.XmlNodeType.Element Then
:
: xNode = xDoc.ReadNode(xreader) '//************* THIS IS WHERE IT
: FAILS //
:
: xNode.Attributes.RemoveAll()
:
: End If
:
: End While
:
:
:
: Dim sr As New System.IO.StreamReader(stream)
:
: stream.Position = 0
:
: targetTextBox.Text = sr.ReadToEnd
:
: sr.Close()
:
: sr.Dispose()
:
: xreader.Close()
:
: stream.Close()
:
: stream.Dispose()


Try something along these lines instead (VB.NET 2.0):

' Pre-process the HTML so that it can be parsed as XML: lower-case everything,
' then strip entity references and unquoted attributes.
Dim xmlString As String = Me.sourcetextBox.Text.ToLower
xmlString = Regex.Replace(xmlString, _
    "&nbsp;", "")
xmlString = Regex.Replace(xmlString, _
    "&[a-zA-Z0-9]*;", "")
xmlString = Regex.Replace(xmlString, _
    " [A-Za-z0-9]*=[A-Za-z0-9_]*>", ">")
xmlString = Regex.Replace(xmlString, _
    " [A-Za-z0-9]*=[A-Za-z0-9_]* ", "")

'NOT SURE WHY YOU'D WANT THIS BUT NO HARM IN IT
xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString

' Work on the DOM directly: load the text, strip the attributes in place and
' serialise the modified document.
Dim tmpDoc As New XmlDocument
tmpDoc.LoadXml(xmlString)

' SelectSingleNode returns Nothing when the root element is not <table>;
' only recurse when there is a node to process.
Dim tableNode As XmlNode = tmpDoc.SelectSingleNode("/table")
If tableNode IsNot Nothing Then
    ZapAttributes(tableNode)
End If
Me.targetTextBox.Text = tmpDoc.InnerXml

[...]

''' <summary>
''' Recursively removes every attribute from <paramref name="xNode"/> and from
''' all of its descendant nodes.
''' </summary>
''' <param name="xNode">Root of the subtree to strip. Nothing is ignored.</param>
Private Sub ZapAttributes(ByVal xNode As XmlNode)
    ' Callers pass the result of SelectSingleNode, which is Nothing when the
    ' XPath matches no node; treat that as "nothing to do" rather than throwing.
    If xNode Is Nothing Then
        Return
    End If

    ' Attributes is Nothing for nodes that are not elements (text, comments...).
    If xNode.Attributes IsNot Nothing Then
        xNode.Attributes.RemoveAll()
    End If

    For Each child As XmlNode In xNode.ChildNodes
        ZapAttributes(child)
    Next
End Sub

Ralf
 
Thanks for your help, but it doesn't really answer my question about my own
failing code. Where am I going wrong? This is important for me to learn, as I
need to know why it's failing.

Many Thanks



_AnonCoward said:
"Just Me" <news.microsoft.com> wrote in message
: Any ideas on this. I am trying to loop through an xml document to
: remove attributes, but Im having so much trouble, any help is
: appreciated
:
: //THIS IS THE EXCEPTION ( SEE CODE LINE WHERE FAILURE OCCURS
:
: '//Unexpected XML declaration. The XML declaration must be the first
: node in the document, and no white space characters are allowed to
: appear before it. Line 13, position 11.
:
: //THE XHTML TEXT WHICH IS BEING LOOOKED AT
:
: <table cellspacing="0" rules="all" border="1" id="dgArticles"
: style="font-family:Arial;font-size:8pt;width:762px;border-collapse
: :collapse;">
: <tr style="color:White;background-color:Blue;">
: <td>&nbsp;</td><td style="width:0.75cm;">ID</td><td
: style="width:7cm;">Title</td><td style="width:13cm;">Summary</td><td
: style="width:1cm;">Published</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art226/Art226.html'
: target=_blank>Open</a></td><td>226</td><td>SQL Server 2005
: Permissions</td><td>See this article for a handy reference to the
: complete list of permissons on SQL Server 2005
: </td><td>28/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art223/Art223.html'
: target=_blank>Open</a></td><td>223</td><td>SQL Schemas In SQL
: 2005</td><td>Want to know a little more about schemas in SQL Server
: 2005, take a look at this quick overview.
: </td><td>25/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art224/Art224.html'
: target=_blank>Open</a></td><td>224</td><td>SQL Server 2005 -
: Must_Change option</td><td>When de-checking Enforce Password Policy,
: SQL Security responds with an error and refers to Must_Change being
: in force. This article shows you how to reverse this.
: </td><td>27/12/2006</td>
: </tr><tr valign="Top">
: <td><a href='Articles/Art220/Art220.html'
: target=_blank>Open</a></td><td>220</td><td>Installing Adventureworks
: Sample</td><td>If you dont install the samples for Adventureworks
: first time, getting them on can be a little tricky. This article
: explains.
: </td><td>23/12/2006</td>
: </tr>
: </table>
:
: '// THE CODE WHICH PROCESSES THE xhtml
:
:
:
: Private Sub useXmlDocButton_Click(ByVal sender As System.Object,
: ByVal e As System.EventArgs) Handles useXmlDocButton.Click
:
: GC.Collect()
:
: 'Clear message
:
: Me.messageTextBox.Text = String.Empty
:
: Dim xmlString As String
:
: '//Some pre-processing here
:
: xmlString = Me.sourcetextBox.Text.ToLower
:
: '//Remove nbsp
:
: xmlString = Regex.Replace(xmlString, "&nbsp;", "")
:
: '//Remove any explorer codes
:
: xmlString = Regex.Replace(xmlString, "&[a-zA-Z0-9]*;", "")
:
: '//Remove any unquoted attributes which appear at the end of a tag
:
: xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]*>",
: ">")
:
: '//Remove any unquoted attributes which before end of tag
:
: xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]* ",
: "")
:
: 'Finally prepend the cml declaration needed
:
: xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString
:
: Me.sourcetextBox.Text = xmlString
:
: 'Get the xml into a stream
:
: Dim stream As New System.IO.MemoryStream
:
: stream.Write((New System.Text.UTF8Encoding).GetBytes(xmlString), 0,
: xmlString.Length)
:
: stream.Position = 0
:
: Dim xDoc As New System.Xml.XmlDocument
:
: xDoc.Load(stream)
:
: stream.Position = 0
:
: Dim xreader As New System.Xml.XmlTextReader(stream)
:
: Dim xNode As System.Xml.XmlNode
:
: stream.Position = 0
:
: While xreader.Read()
:
: If xreader.NodeType = Xml.XmlNodeType.Element Then
:
: xNode = xDoc.ReadNode(xreader) '//************* THIS IS WHERE IT
: FAILS //
:
: xNode.Attributes.RemoveAll()
:
: End If
:
: End While
:
:
:
: Dim sr As New System.IO.StreamReader(stream)
:
: stream.Position = 0
:
: targetTextBox.Text = sr.ReadToEnd
:
: sr.Close()
:
: sr.Dispose()
:
: xreader.Close()
:
: stream.Close()
:
: stream.Dispose()


Try something along these lines instead (VB.NET 2.0):

' Pre-process the HTML so that it can be parsed as XML: lower-case everything,
' then strip entity references and unquoted attributes.
Dim xmlString As String = Me.sourcetextBox.Text.ToLower
xmlString = Regex.Replace(xmlString, _
    "&nbsp;", "")
xmlString = Regex.Replace(xmlString, _
    "&[a-zA-Z0-9]*;", "")
xmlString = Regex.Replace(xmlString, _
    " [A-Za-z0-9]*=[A-Za-z0-9_]*>", ">")
xmlString = Regex.Replace(xmlString, _
    " [A-Za-z0-9]*=[A-Za-z0-9_]* ", "")

'NOT SURE WHY YOU'D WANT THIS BUT NO HARM IN IT
xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString

' Work on the DOM directly: load the text, strip the attributes in place and
' serialise the modified document.
Dim tmpDoc As New XmlDocument
tmpDoc.LoadXml(xmlString)

' SelectSingleNode returns Nothing when the root element is not <table>;
' only recurse when there is a node to process.
Dim tableNode As XmlNode = tmpDoc.SelectSingleNode("/table")
If tableNode IsNot Nothing Then
    ZapAttributes(tableNode)
End If
Me.targetTextBox.Text = tmpDoc.InnerXml

[...]

''' <summary>
''' Recursively removes every attribute from <paramref name="xNode"/> and from
''' all of its descendant nodes.
''' </summary>
''' <param name="xNode">Root of the subtree to strip. Nothing is ignored.</param>
Private Sub ZapAttributes(ByVal xNode As XmlNode)
    ' Callers pass the result of SelectSingleNode, which is Nothing when the
    ' XPath matches no node; treat that as "nothing to do" rather than throwing.
    If xNode Is Nothing Then
        Return
    End If

    ' Attributes is Nothing for nodes that are not elements (text, comments...).
    If xNode.Attributes IsNot Nothing Then
        xNode.Attributes.RemoveAll()
    End If

    For Each child As XmlNode In xNode.ChildNodes
        ZapAttributes(child)
    Next
End Sub

Ralf
--
--
----------------------------------------------------------
* ^~^ ^~^ *
* _ {~ ~} {~ ~} _ *
* /_``>*< >*<''_\ *
* (\--_)++) (++(_--/) *
 
"Just Me" <news.microsoft.com> wrote in message
:
: Thanks for your help. But it doesent really answer my question about
: my own failing code. Where am I going wrong, this is important for
: me to learn as I need to know why its failing.
:
: Many Thanks

<snip>

Well, at first glance it would appear that the problem is here:

=============================
xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString
=============================

This is the Xml Declaration the exception is referring to. However, if
you remove this line you just end up with a different exception -
"There are multiple root elements" - so in reality, the xml
declaration isn't the actual problem.

What these two exceptions have in common is that they are reporting
the underlying xml as being malformed, and I think that is an important
clue. I'm not an expert with the memory stream object, so I cannot
give you a specific answer as to what is happening, but it appears that
when the xml reader gets to the end of the memory stream, it is
looping back on itself. What the xml text reader object therefore ends
up seeing is something like this:

<?xml version='1.0'?>
<table>
<tr>
[...]
</tr>
</table>
<?xml version='1.0'?>
<table>
<tr>
[...]
</tr>
</table>

In the first exception message, it's objecting because it thinks it's
seeing the <?xml...?> declaration embedded in the complete document.
In the second exception, it's objecting to what it thinks is a second
root element.

As I've stated, I'm not familiar with the memory stream object, so I
don't know for a fact that this is what is happening, but it certainly
strikes me as plausible. This argument is reinforced when you consider
that if you copy the xml into a text file and make the following
change, the xml exceptions go away:

'Dim xreader As New System.Xml.XmlTextReader(stream)
Dim xreader As New System.Xml.XmlTextReader("xhtmldoc.xml")

Ralf
 
Ok Ralf

Thanks for your insight into this problem. I find this whole area a little
confusing; there seem to be so many ways of skinning the same cat. You have
the XPath stuff, the XmlDocument itself, the XmlReader, the streams.

Blows my head off sometimes.

I am trying to alter the code you gave me so that I can re-apply specific
class attributes to the first row and another to the tables cells and one
for the table tag itself.

I seem to have almost got it, but not quite.

Thanks anyway for your help.



_AnonCoward said:
"Just Me" <news.microsoft.com> wrote in message
:
: Thanks for your help. But it doesent really answer my question about
: my own failing code. Where am I going wrong, this is important for
: me to learn as I need to know why its failing.
:
: Many Thanks

<snip>

Well, at first glance it would appear that the problem is here:

=============================
xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString
=============================

This is the Xml Declaration the exception is referring to. However, if
you remove this line you just end up with a different exception -
"There are multiple root elements" - so in reality, the xml
declaration isn't the actual problem.

What these two exceptions have in common is that they are reporting
the underlying xml as being malformed and I think that is an important
clue. I'm not an expert with the memory stream object, so I cannot
give you a specific answer as to what is happening but it appears that
the when the xml reader gets to the end of the memory stream, it is
looping back on itself. What the xml text reader object therefore ends
up seeing is something like this:

<?xml version='1.0'?>
<table>
<tr>
[...]
</tr>
</table>
<?xml version='1.0'?>
<table>
<tr>
[...]
</tr>
</table>

In the first exception message, it's objecting because it thinks it's
seeing the <?xml...?> declaration embedded in the complete document.
In second exception, it's objecting to the what it thinks is a second
root element.

As I've stated, I'm not familiar with the memory stream object so I
don't know in fact that this what is happening, but this certainly
strikes me as plausible. This argument is reinforced when you consider
that if you copy the xml into a text file and make the following
change, the xmlexceptions go away:

'Dim xreader As New System.Xml.XmlTextReader(stream)
Dim xreader As New System.Xml.XmlTextReader("xhtmldoc.xml")

Ralf
--
--
----------------------------------------------------------
* ^~^ ^~^ *
* _ {~ ~} {~ ~} _ *
* /_``>*< >*<''_\ *
* (\--_)++) (++(_--/) *
 
Just Me wrote :
<backposted/>

If what you want is to extract the contents of the html in a structured
way, then I suggest you use a tool to convert html to xml first --
there are so many details on dealing with html that any ad hoc approach
is sure to leave something out.

It seems HTML Tidy is such a tool (I have never used it, so I can't say
anything about it).

Another approach you may consider is using the WebBrowser control to
"navigate" the document structure. Maybe its easier than your current
approach:

<aircode>
Private WithEvents WB As WebBrowser
Private mText As String

' Starts an extraction: clears the previous result, creates the WebBrowser
' control on first use and hands it the HTML text to load. The result is
' produced later, in the DocumentCompleted handler.
Sub ExtractText(ByVal Text As String)
    mText = String.Empty

    If WB Is Nothing Then
        WB = New WebBrowser
    End If

    WB.DocumentText = Text
End Sub

' Runs when the WebBrowser raises DocumentCompleted: maps the children of the
' document body into text, stores the result in mText and prints it to the
' debug output.
Private Sub WB_DocumentCompleted( _
        ByVal sender As System.Object, _
        ByVal E As WebBrowserDocumentCompletedEventArgs _
        ) Handles WB.DocumentCompleted

    Dim output As New System.Text.StringBuilder

    ' Level 0: the body's direct children are written without indentation.
    MapHtmlItems(WB.Document.Body.Children, output, 0)

    mText = output.ToString()
    Debug.Print(mText)
End Sub

' Maps every element of the collection, in order, by delegating to MapHtmlItem
' with the same builder and nesting level.
Sub MapHtmlItems(ByVal Items As HtmlElementCollection, _
        ByVal Builder As System.Text.StringBuilder, _
        ByVal Level As Integer)

    For Each child As HtmlElement In Items
        MapHtmlItem(child, Builder, Level)
    Next child

End Sub

' Writes one element to Builder, indented two spaces per Level.
' - Elements that cannot have children are ignored.
' - table / tr / td elements are written with their tag; any other element
'   contributes only its text or its children.
' - An element without child elements and with non-empty InnerText is written
'   on a single line; otherwise its children are mapped one level deeper.
Sub MapHtmlItem(ByVal Element As HtmlElement, _
        ByVal Builder As System.Text.StringBuilder, _
        ByVal Level As Integer)

    If Not Element.CanHaveChildren Then
        Return
    End If

    ' Text is only taken from elements that have no child elements.
    Dim leafText As String = Nothing
    If Element.Children.Count = 0 Then
        leafText = Element.InnerText
    End If

    ' Only the table structure tags are echoed to the output.
    Dim tagName As String = Nothing
    Select Case Element.TagName.ToLower
        Case "table", "tr", "td"
            tagName = Element.TagName
    End Select
    Dim hasTag As Boolean = Not String.IsNullOrEmpty(tagName)

    Dim indent As String = New String(" "c, Level * 2)

    If Not String.IsNullOrEmpty(leafText) Then
        ' Leaf with text: a single line, wrapped in the tag when there is one.
        If hasTag Then
            Builder.AppendLine(String.Format("{0}<{1}>{2}</{1}>", indent, tagName, leafText))
        Else
            Builder.AppendLine(String.Format("{0}{1}", indent, leafText))
        End If
        Return
    End If

    ' Container (or empty leaf): opening tag, children, closing tag.
    If hasTag Then
        Builder.AppendLine(String.Format("{0}<{1}>", indent, tagName))
    End If

    MapHtmlItems(Element.Children, Builder, Level + 1)

    If hasTag Then
        Builder.AppendLine(String.Format("{0}</{1}>", indent, tagName))
    End If

End Sub

</aircode>

The previous code will extract all table structures from the HTML text
you provide. To do this, just pass the text to ExtractText(); the result
will be saved in the mText field. Maybe this can give you new
ideas. ;-)

HTH.

Regards,

Branco.
Any ideas on this. I am trying to loop through an xml document to remove
attributes, but Im having so much trouble, any help is appreciated

//THIS IS THE EXCEPTION ( SEE CODE LINE WHERE FAILURE OCCURS

'//Unexpected XML declaration. The XML declaration must be the first node in
the document, and no white space characters are allowed to appear before it.
Line 13, position 11.

//THE XHTML TEXT WHICH IS BEING LOOOKED AT

<table cellspacing="0" rules="all" border="1" id="dgArticles"
style="font-family:Arial;font-size:8pt;width:762px;border-collapse:collapse;">
<tr style="color:White;background-color:Blue;">
<td>&nbsp;</td><td style="width:0.75cm;">ID</td><td
style="width:7cm;">Title</td><td style="width:13cm;">Summary</td><td
style="width:1cm;">Published</td>
</tr><tr valign="Top">
<td><a href='Articles/Art226/Art226.html'
target=_blank>Open</a></td><td>226</td><td>SQL Server 2005
Permissions</td><td>See this article for a handy reference to the complete
list of permissons on SQL Server 2005 </td><td>28/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art223/Art223.html'
target=_blank>Open</a></td><td>223</td><td>SQL Schemas In SQL
2005</td><td>Want to know a little more about schemas in SQL Server 2005,
take a look at this quick overview. </td><td>25/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art224/Art224.html'
target=_blank>Open</a></td><td>224</td><td>SQL Server 2005 - Must_Change
option</td><td>When de-checking Enforce Password Policy, SQL Security
responds with an error and refers to Must_Change being in force. This
article shows you how to reverse this. </td><td>27/12/2006</td>
</tr><tr valign="Top">
<td><a href='Articles/Art220/Art220.html'
target=_blank>Open</a></td><td>220</td><td>Installing Adventureworks
Sample</td><td>If you dont install the samples for Adventureworks first
time, getting them on can be a little tricky. This article explains.
</td><td>23/12/2006</td>
</tr>
</table>

'// THE CODE WHICH PROCESSES THE xhtml



' Click handler: cleans up the HTML held in sourcetextBox with a series of
' regex replacements, loads the result into an XmlDocument, then walks the same
' stream with an XmlTextReader attempting to strip attributes, and finally
' copies the stream's text into targetTextBox.
Private Sub useXmlDocButton_Click(ByVal sender As System.Object, ByVal e As
System.EventArgs) Handles useXmlDocButton.Click

' Request a garbage collection before doing any work.
GC.Collect()

' Clear any previous message.

Me.messageTextBox.Text = String.Empty

Dim xmlString As String

' Pre-processing: lower-case the whole input. This affects element content and
' attribute values as well as tag names.

xmlString = Me.sourcetextBox.Text.ToLower

' Remove the literal text "&nbsp;".

xmlString = Regex.Replace(xmlString, "&nbsp;", "")

' Remove any remaining "&name;" style entity references.

xmlString = Regex.Replace(xmlString, "&[a-zA-Z0-9]*;", "")

' Remove an unquoted attribute (name=value) that sits directly before the
' closing ">" of a tag; only a leading space character is matched.

xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]*>", ">")

' Remove an unquoted attribute that is followed by a space. The replacement is
' the empty string, so both the leading and trailing space are removed as well.

xmlString = Regex.Replace(xmlString, " [A-Za-z0-9]*=[A-Za-z0-9_]* ", "")

' Finally prepend an XML declaration (followed by one space) to the text.

xmlString = "<?xml version='1.0' encoding='utf-8'?> " & xmlString

' The processed text, declaration included, is written back to the source box.
Me.sourcetextBox.Text = xmlString

' Copy the XML into an in-memory stream as UTF-8 bytes.
' NOTE(review): the count passed to Write is xmlString.Length (characters),
' while the array holds UTF-8 bytes; the two differ when the text contains
' non-ASCII characters - confirm.

Dim stream As New System.IO.MemoryStream

stream.Write((New System.Text.UTF8Encoding).GetBytes(xmlString), 0,
xmlString.Length)

' Rewind so the document can be loaded from the start of the stream.
stream.Position = 0

Dim xDoc As New System.Xml.XmlDocument

' Parse the stream contents into xDoc.
xDoc.Load(stream)

stream.Position = 0

' A second, independent reader over the same stream.
Dim xreader As New System.Xml.XmlTextReader(stream)

Dim xNode As System.Xml.XmlNode

stream.Position = 0

' Walk the reader; each element found is materialised as an XmlNode and has its
' attributes cleared.
While xreader.Read()

If xreader.NodeType = Xml.XmlNodeType.Element Then

' The node returned by ReadNode is never inserted into xDoc by this code.
' NOTE(review): ReadNode itself advances the reader, so the next Read() call
' moves past the node the reader stopped on - confirm this is intended.
xNode = xDoc.ReadNode(xreader) '//************* THIS IS WHERE IT FAILS //

xNode.Attributes.RemoveAll()

End If

End While



' Read the stream back from the start. Nothing above writes to the stream after
' the initial Write call, so this yields the text that was written then.
Dim sr As New System.IO.StreamReader(stream)

stream.Position = 0

targetTextBox.Text = sr.ReadToEnd

' Release the readers and the stream.
sr.Close()

sr.Dispose()

xreader.Close()

stream.Close()

stream.Dispose()
 
In the end I was able to get just what I needed. Here you go!

Imports System.xml

Imports System.Text.RegularExpressions

Private idNo As Integer

Private rowCount As Integer



''' <summary>
''' Click handler: pre-processes the HTML in sourceTextbox so it parses as XML,
''' loads it into an XmlDocument, replaces the attributes of the table via
''' ZapAttributes and shows the resulting markup in targetTextBox.
''' </summary>
''' <param name="sender">Control that raised the event (unused).</param>
''' <param name="e">Event data (unused).</param>
Private Sub processButton_Click(ByVal sender As System.Object, _
        ByVal e As System.EventArgs) Handles processButton.Click

    Dim xmlString As String

    idNo = 0

    ' ZapAttributes increments rowCount for every <tr> and uses "rowCount = 1"
    ' to recognise header cells, so the counter has to start from zero on every
    ' click; otherwise a second run styles every cell as a body cell.
    rowCount = 0

    ' Pre-processing: lower-case the whole input (tag names, but also element
    ' content and attribute values).
    xmlString = Me.sourceTextbox.Text.ToLower

    ' Remove the literal text "&nbsp;".
    xmlString = Regex.Replace(xmlString, "&nbsp;", "")

    ' Remove any remaining "&name;" style entity references.
    xmlString = Regex.Replace(xmlString, "&[a-zA-Z0-9]*;", "")

    ' Remove an unquoted attribute that sits directly before the closing ">".
    ' NOTE(review): "\sp*" matches one whitespace character followed by zero or
    ' more literal "p" characters; "\s*" or "\s+" was probably intended - confirm.
    xmlString = Regex.Replace(xmlString, "\sp*[A-Za-z0-9]*=[A-Za-z0-9_]*>", ">")

    ' Remove an unquoted attribute that is followed by a space.
    xmlString = Regex.Replace(xmlString, "\sp*[A-Za-z0-9]*=[A-Za-z0-9_]* ", "")

    Dim tmpDoc As New XmlDocument

    Try

        tmpDoc.LoadXml(xmlString)

        ' SelectSingleNode returns Nothing when the root element is not
        ' <table>; only process the tree when there is a node to start from.
        Dim tableNode As XmlNode = tmpDoc.SelectSingleNode("/table")
        If tableNode IsNot Nothing Then
            ZapAttributes(tableNode, tmpDoc)
        End If

        Me.targetTextBox.Text = tmpDoc.InnerXml

    Catch ex As XmlException

        ' The input could not be parsed as XML. Show the parser's message
        ' instead of silently leaving the previous output in place.
        Me.targetTextBox.Text = ex.Message

    End Try

End Sub

''' <summary>
''' Recursively removes all attributes below <paramref name="xNode"/> and then
''' re-applies a "class" attribute to selected elements:
''' table -> "ArticleTableTag"; td in the first row -> "ArticleTableHeader";
''' td in later rows -> "ArticleTableCells".
''' </summary>
''' <param name="xNode">Root of the subtree to process. Nothing is ignored.</param>
''' <param name="xd">Document used to create the new attributes.</param>
''' <remarks>
''' Rows are counted in the rowCount field, which this method only ever
''' increments; the count therefore carries over between top-level calls.
''' </remarks>
Private Sub ZapAttributes(ByVal xNode As XmlNode, _
        ByVal xd As System.Xml.XmlDocument)

    ' Callers pass the result of SelectSingleNode, which is Nothing when the
    ' XPath matches no node; treat that as "nothing to do" rather than throwing.
    If xNode Is Nothing Then
        Return
    End If

    ' Attributes is Nothing for nodes that are not elements (text, comments...).
    If xNode.Attributes IsNot Nothing Then

        xNode.Attributes.RemoveAll()

        ' Decide which CSS class, if any, this element receives.
        Dim className As String = Nothing

        Select Case xNode.Name

            Case "table"
                className = "ArticleTableTag"

            Case "tr"
                rowCount += 1

            Case "td"
                If rowCount = 1 Then
                    className = "ArticleTableHeader"
                ElseIf rowCount > 1 Then
                    className = "ArticleTableCells"
                End If

        End Select

        If className IsNot Nothing Then
            Dim xAttr As System.Xml.XmlAttribute = xd.CreateAttribute("class")
            xAttr.Value = className
            xNode.Attributes.Append(xAttr)
        End If

    End If

    For Each child As XmlNode In xNode.ChildNodes
        ZapAttributes(child, xd)
    Next

End Sub

End Class
 
Back
Top