S
Seven Stars
Hey guys, I have the code for this HTML content parser; see below.
My program should go to Google and extract all links referencing
the link I insert.
So if I ask my app to find all links to www.mysite.com then it should
go to google with this parameter: "links: www.mysite.com".
The problem is that this parser always extracts links only from the
first page of Google results.
What if Google suggests over 1000 links? How do I switch from page to
page? How do I know when I have reached the end?
This is, (believe it or not) a problem for me.
7*
Imports System.IO
Imports System.Net
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
' Downloads a page and extracts the link (<a href>) and image (<img src>)
' URLs it contains, resolving relative URLs against the page's own URL.
Public Class HTMLContentParser

    ' Both patterns capture the attribute value in group 1. The value may be
    ' double-quoted, single-quoted or unquoted. "[^>]*?" keeps the match inside
    ' a single tag, so several tags on the same line are each found.
    ' The regexes are built once and shared because RegexOptions.Compiled is
    ' expensive to construct.
    Private Shared ReadOnly rLinkRegEx As New Regex("<a\b[^>]*?\shref\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
    Private Shared ReadOnly rImageRegEx As New Regex("<img\b[^>]*?\ssrc\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)

    ' Fetches sURL and returns the response body as text.
    ' On any failure the exception message is returned instead of the page
    ' content, so a caller cannot tell an error from a page by type alone.
    ' The body is decoded with StreamReader's default encoding (UTF-8).
    Function Return_HTMLContent(ByVal sURL As String) As String
        Dim URLRes As HttpWebResponse = Nothing
        Dim sReader As StreamReader = Nothing
        Try
            Dim URLReq As HttpWebRequest = CType(WebRequest.Create(sURL), HttpWebRequest)
            URLRes = CType(URLReq.GetResponse(), HttpWebResponse)
            sReader = New StreamReader(URLRes.GetResponseStream())
            Return sReader.ReadToEnd()
        Catch ex As Exception
            Return ex.Message
        Finally
            ' Always release the reader and the response; an unclosed response
            ' keeps its connection busy and later requests can stall.
            If Not sReader Is Nothing Then sReader.Close()
            If Not URLRes Is Nothing Then URLRes.Close()
        End Try
    End Function

    ' Returns an ArrayList of String: the target of every <a href=...> found
    ' in sHTMLContent, made absolute against sURL (the address of the page).
    Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return CollectURLs(rLinkRegEx, sHTMLContent, sURL)
    End Function

    ' Returns an ArrayList of String: the source of every <img src=...> found
    ' in sHTMLContent, made absolute against sURL (the address of the page).
    Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return CollectURLs(rImageRegEx, sHTMLContent, sURL)
    End Function

    ' Runs rPattern over the HTML and resolves group 1 of every match.
    ' Returns an empty list when there is no content to scan.
    Private Function CollectURLs(ByVal rPattern As Regex, ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Dim aMatch As New ArrayList
        If sHTMLContent Is Nothing Then Return aMatch

        Dim mMatch As Match = rPattern.Match(sHTMLContent)
        While mMatch.Success
            aMatch.Add(ProcessURL(mMatch.Groups(1).Value, sURL))
            mMatch = mMatch.NextMatch()
        End While
        Return aMatch
    End Function

    ' Cleans one captured attribute value (sInput) and, when it is not already
    ' an absolute http/https URL, appends it to the directory part of sURL.
    Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String) As String
        ' A bare "http://host" has no slash after the domain name (the search
        ' starts at position 8, just past "http://"); give it one so the
        ' trimming below has a slash to stop at.
        If InStr(8, sURL, "/") = 0 Then
            sURL += "/"
        End If

        ' Keep the base only up to and including its last slash, i.e. drop the
        ' page name and keep the directory.
        Dim iSlash As Integer = sURL.LastIndexOf("/"c)
        If iSlash >= 0 Then sURL = sURL.Substring(0, iSlash + 1)

        ' Cut the value at the first ">" in case the capture ran past the end
        ' of the tag, then drop stray markup and quote characters.
        Dim iTagEnd As Integer = sInput.IndexOf(">"c)
        If iTagEnd >= 0 Then sInput = sInput.Substring(0, iTagEnd)
        sInput = sInput.Replace("<", "").Replace("'", "")

        ' Absolute links are returned untouched. Only a leading scheme counts,
        ' so "https://" is recognised and a relative link that merely contains
        ' "http://" in its query string is still resolved.
        If sInput.StartsWith("http://", StringComparison.OrdinalIgnoreCase) OrElse sInput.StartsWith("https://", StringComparison.OrdinalIgnoreCase) Then
            Return sInput
        End If

        ' sURL always ends with "/" here, so avoid doubling the slash.
        ' NOTE(review): a link starting with "/" is appended to the page's
        ' directory, not to the site root - confirm this is what callers want.
        If sInput.StartsWith("/") Then
            Return sURL.Substring(0, sURL.Length - 1) + sInput
        End If
        Return sURL + sInput
    End Function
End Class
My program should go to Google and extract all links referencing
the link I insert.
So if I ask my app to find all links to www.mysite.com then it should
go to google with this parameter: "links: www.mysite.com".
The problem is that this parser always extracts links only from the
first page of Google results.
What if Google suggests over 1000 links? How do I switch from page to
page? How do I know when I have reached the end?
This is, (believe it or not) a problem for me.
7*
Imports System.IO
Imports System.Net
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
' Downloads a page and extracts the link (<a href>) and image (<img src>)
' URLs it contains, resolving relative URLs against the page's own URL.
Public Class HTMLContentParser

    ' Both patterns capture the attribute value in group 1. The value may be
    ' double-quoted, single-quoted or unquoted. "[^>]*?" keeps the match inside
    ' a single tag, so several tags on the same line are each found.
    ' The regexes are built once and shared because RegexOptions.Compiled is
    ' expensive to construct.
    Private Shared ReadOnly rLinkRegEx As New Regex("<a\b[^>]*?\shref\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
    Private Shared ReadOnly rImageRegEx As New Regex("<img\b[^>]*?\ssrc\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)

    ' Fetches sURL and returns the response body as text.
    ' On any failure the exception message is returned instead of the page
    ' content, so a caller cannot tell an error from a page by type alone.
    ' The body is decoded with StreamReader's default encoding (UTF-8).
    Function Return_HTMLContent(ByVal sURL As String) As String
        Dim URLRes As HttpWebResponse = Nothing
        Dim sReader As StreamReader = Nothing
        Try
            Dim URLReq As HttpWebRequest = CType(WebRequest.Create(sURL), HttpWebRequest)
            URLRes = CType(URLReq.GetResponse(), HttpWebResponse)
            sReader = New StreamReader(URLRes.GetResponseStream())
            Return sReader.ReadToEnd()
        Catch ex As Exception
            Return ex.Message
        Finally
            ' Always release the reader and the response; an unclosed response
            ' keeps its connection busy and later requests can stall.
            If Not sReader Is Nothing Then sReader.Close()
            If Not URLRes Is Nothing Then URLRes.Close()
        End Try
    End Function

    ' Returns an ArrayList of String: the target of every <a href=...> found
    ' in sHTMLContent, made absolute against sURL (the address of the page).
    Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return CollectURLs(rLinkRegEx, sHTMLContent, sURL)
    End Function

    ' Returns an ArrayList of String: the source of every <img src=...> found
    ' in sHTMLContent, made absolute against sURL (the address of the page).
    Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return CollectURLs(rImageRegEx, sHTMLContent, sURL)
    End Function

    ' Runs rPattern over the HTML and resolves group 1 of every match.
    ' Returns an empty list when there is no content to scan.
    Private Function CollectURLs(ByVal rPattern As Regex, ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Dim aMatch As New ArrayList
        If sHTMLContent Is Nothing Then Return aMatch

        Dim mMatch As Match = rPattern.Match(sHTMLContent)
        While mMatch.Success
            aMatch.Add(ProcessURL(mMatch.Groups(1).Value, sURL))
            mMatch = mMatch.NextMatch()
        End While
        Return aMatch
    End Function

    ' Cleans one captured attribute value (sInput) and, when it is not already
    ' an absolute http/https URL, appends it to the directory part of sURL.
    Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String) As String
        ' A bare "http://host" has no slash after the domain name (the search
        ' starts at position 8, just past "http://"); give it one so the
        ' trimming below has a slash to stop at.
        If InStr(8, sURL, "/") = 0 Then
            sURL += "/"
        End If

        ' Keep the base only up to and including its last slash, i.e. drop the
        ' page name and keep the directory.
        Dim iSlash As Integer = sURL.LastIndexOf("/"c)
        If iSlash >= 0 Then sURL = sURL.Substring(0, iSlash + 1)

        ' Cut the value at the first ">" in case the capture ran past the end
        ' of the tag, then drop stray markup and quote characters.
        Dim iTagEnd As Integer = sInput.IndexOf(">"c)
        If iTagEnd >= 0 Then sInput = sInput.Substring(0, iTagEnd)
        sInput = sInput.Replace("<", "").Replace("'", "")

        ' Absolute links are returned untouched. Only a leading scheme counts,
        ' so "https://" is recognised and a relative link that merely contains
        ' "http://" in its query string is still resolved.
        If sInput.StartsWith("http://", StringComparison.OrdinalIgnoreCase) OrElse sInput.StartsWith("https://", StringComparison.OrdinalIgnoreCase) Then
            Return sInput
        End If

        ' sURL always ends with "/" here, so avoid doubling the slash.
        ' NOTE(review): a link starting with "/" is appended to the page's
        ' directory, not to the site root - confirm this is what callers want.
        If sInput.StartsWith("/") Then
            Return sURL.Substring(0, sURL.Length - 1) + sInput
        End If
        Return sURL + sInput
    End Function
End Class