I am writing a .net web spider. while it works great on one of my sites (about 20 pages) it bombs with a System.StackOverflowException with another site i manage (about 500 pages).
I am developing this on a win7 64bit i3 laptop with 8g of ram, a 128g hyperx ssd and no swap file.
My question is.... am i getting this exception thrown because i have no swap file?
cpu usage (vs2010 debug process) only gets to around 34% with only 74-75m ram usage.
if this is the case, how can i ensure that it wont happen?
This is with no recursion.
Code:
Imports System.Reflection
Imports System.Net
Imports Superstar.Html.Linq
Public Class Downloader
Implements IDisposable
''' <summary>
''' Get the returned downloaded string
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnString As String
Get
Return _StrReturn
End Get
End Property
Private Property _StrReturn As String
''' <summary>
''' Get the returned downloaded byte array
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnBytes As Byte()
Get
Return _FSReturn
End Get
End Property
Private Property _FSReturn As Byte()
Private Property _UserAgent As String = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"
Private Property DataReceived As Boolean = False
''' <summary>
''' Download a string, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadString(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadStringAsync(New System.Uri(_Path))
AddHandler .DownloadStringCompleted, AddressOf StringDownloaded
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a file, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadFile(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadDataAsync(New System.Uri(_Path))
AddHandler .DownloadDataCompleted, AddressOf FileStreamDownload
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a parsable HDocument, for using HtmlToLinq
''' </summary>
''' <param name="_Path"></param>
''' <returns></returns>
''' <remarks></remarks>
Public Function DownloadHDoc(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As HDocument
Try
'
'
'
'
'
'
'StackOverFlowException Occurring Here!
DownloadString(_Path, _Worker)
Return HDocument.Parse(_StrReturn)
Catch soex As StackOverflowException
'put some logging in here, with the path attempted
Return Nothing
Catch ex As Exception
SetAllowUnsafeHeaderParsing20()
Return HDocument.Load(_Path)
End Try
End Function
#Region "Internals"
Private Sub SetAllowUnsafeHeaderParsing20()
Dim a As New System.Net.Configuration.SettingsSection
Dim aNetAssembly As System.Reflection.Assembly = Assembly.GetAssembly(a.GetType)
Dim aSettingsType As Type = aNetAssembly.GetType("System.Net.Configuration.SettingsSectionInternal")
Dim args As Object() = Nothing
Dim anInstance As Object = aSettingsType.InvokeMember("Section", BindingFlags.Static Or BindingFlags.GetProperty Or BindingFlags.NonPublic, Nothing, Nothing, args)
Dim aUseUnsafeHeaderParsing As FieldInfo = aSettingsType.GetField("useUnsafeHeaderParsing", BindingFlags.NonPublic Or BindingFlags.Instance)
aUseUnsafeHeaderParsing.SetValue(anInstance, True)
End Sub
Private Sub FileStreamDownload(ByVal sender As Object, ByVal e As DownloadDataCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_FSReturn = DirectCast(e.Result, Byte())
Else
_FSReturn = Nothing
End If
End Sub
Private Sub StringDownloaded(ByVal sender As Object, ByVal e As DownloadStringCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_StrReturn = DirectCast(e.Result, String)
Else
_StrReturn = String.Empty
End If
End Sub
#End Region
#Region "IDisposable Support"
Private disposedValue As Boolean ' To detect redundant calls
' IDisposable
Protected Overridable Sub Dispose(disposing As Boolean)
If Not Me.disposedValue Then
If disposing Then
End If
_StrReturn = String.Empty
_FSReturn = Nothing
End If
Me.disposedValue = True
End Sub
Public Sub Dispose() Implements IDisposable.Dispose
Dispose(True)
GC.SuppressFinalize(Me)
End Sub
#End Region
End Class
And the code that calls this where the stackoverflow is happenning
Private Function PopulateSEOList(Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As List(Of Typing.SEO)
Dim _L = LinkList, _Ct As Long = 0
Dim _NL As New List(Of Typing.SEO)
Dim _EL As Typing.SEO.Elements = Nothing
Dim _Doc As HDocument = Nothing, _Keywords As String = String.Empty, _Description As String = String.Empty, _Content As HElement = Nothing
For i As Long = 0 To _L.Count - 1
Try
_Ct += 1
Using _HDoc As New Downloader
With _HDoc
_Doc = .DownloadHDoc(_L(i).SiteUrl)
End With
End Using
Tasks.Parallel.Invoke(Sub()
'Keywords
For Each Item In _Doc.Descendants("meta")
If Item.Attribute("name") = "keywords" Then
_Keywords = Item.Attribute("content").Value
'Exit For
End If
Next
End Sub,
Sub()
'Description
For Each Item In _Doc.Descendants("meta")
If Item.Attribute("name") = "description" Then
_Description = Item.Attribute("content").Value
'Exit For
End If
Next
End Sub,
Sub()
If _Doc.Descendants("body") IsNot Nothing Then
_Content = _Doc.Descendants("body").FirstOrDefault
End If
End Sub,
Sub()
_EL = New Typing.SEO.Elements() With {
.H1 = If(_Doc.Descendants("h1") IsNot Nothing, (From n In _Doc.Descendants("h1").AsParallel()
Select n.Value).ToList(), Nothing),
.H2 = If(_Doc.Descendants("h2") IsNot Nothing, (From n In _Doc.Descendants("h2").AsParallel()
Select n.Value).ToList(), Nothing),
.H3 = If(_Doc.Descendants("h3") IsNot Nothing, (From n In _Doc.Descendants("h3").AsParallel()
Select n.Value).ToList(), Nothing),
.H4 = If(_Doc.Descendants("h4") IsNot Nothing, (From n In _Doc.Descendants("h4").AsParallel()
Select n.Value).ToList(), Nothing),
.H5 = If(_Doc.Descendants("h5") IsNot Nothing, (From n In _Doc.Descendants("h5").AsParallel()
Select n.Value).ToList(), Nothing),
.H6 = If(_Doc.Descendants("h6") IsNot Nothing, (From n In _Doc.Descendants("h6").AsParallel()
Select n.Value).ToList(), Nothing),
.UL = If(_Doc.Descendants("ul") IsNot Nothing, (From n In _Doc.Descendants("ul").AsParallel()
Select n.Value).ToList(), Nothing),
.OL = If(_Doc.Descendants("ol") IsNot Nothing, (From n In _Doc.Descendants("ol").AsParallel()
Select n.Value).ToList(), Nothing),
.STRONG = If(_Doc.Descendants("strong") IsNot Nothing OrElse _Doc.Descendants("b") IsNot Nothing,
(From n In _Doc.Descendants("strong").AsParallel()
Select n.Value).Union(From n In _Doc.Descendants("b").AsParallel()
Select n.Value).ToList(), Nothing),
.BLOCKQUOTE = If(_Doc.Descendants("blockquote") IsNot Nothing, (From n In _Doc.Descendants("blockquote").AsParallel()
Select n.Value).ToList(), Nothing),
.EM = If(_Doc.Descendants("em") IsNot Nothing OrElse _Doc.Descendants("i") IsNot Nothing,
(From n In _Doc.Descendants("em").AsParallel()
Select n.Value).Union(From n In _Doc.Descendants("i").AsParallel()
Select n.Value).ToList(), Nothing),
.A = If(_Doc.Descendants("a") IsNot Nothing, (From n In _Doc.Descendants("a").AsParallel()
Select New Typing.SEO.Elements.Links() With {
.Content = n.Value,
.Title = If(n.Attribute("title") IsNot Nothing,
n.Attribute("title").Value,
Nothing),
.Target = If(n.Attribute("target") IsNot Nothing,
n.Attribute("target").Value,
Nothing),
.Rel = If(n.Attribute("rel") IsNot Nothing,
n.Attribute("rel").Value,
Nothing),
.Href = If(n.Attribute("href") IsNot Nothing,
n.Attribute("href").Value,
Nothing)
}).ToList(), Nothing),
.IMG = If(_Doc.Descendants("img") IsNot Nothing,
(From n In _Doc.Descendants("img").AsParallel()
Select New Typing.SEO.Elements.Images() With {
.Alt = If(n.Attribute("alt") IsNot Nothing,
n.Attribute("alt").Value,
Nothing),
.Source = If(n.Attribute("src") IsNot Nothing,
n.Attribute("src").Value,
Nothing),
.Title = If(n.Attribute("title") IsNot Nothing,
n.Attribute("title").Value,
Nothing)
}).ToList(),
Nothing)
}
End Sub)
_NL.Add(New Typing.SEO() With {
.Link = _L(i).SiteUrl,
.Title = _Doc.Descendants("title").First().Value,
.Keywords = _Keywords,
.Description = _Description,
.Content = _Content,
.ContentElements = _EL
})
_L.RemoveAt(i)
_EL = Nothing : _Doc = Nothing
ReportProgress((_Ct / _L.Count) * 100, _Worker)
Catch ex As Exception
'Put logging in here
End Try
Next
Return _NL
End Function
As you probably know, this error is most likely due to a bug in code causing infinite looping in a recursive algorithm. Although you say you don't use recursion, you probably have recursion happening inadvertently.
The easiest way to figure out what's causing it is to attach the debugger, configure Visual Studio to break on exceptions, and trigger the error in your application.
When the error occurs and the debugger breaks, have a look at the call stack - hopefully you'll see what the problem is there.