Search code examples
.net · vb.net · http · cookies · httpwebrequest

Automate picture downloads from website with authentication, part two


This question is derived from this other question: Automate picture downloads from website with authentication, where I asked how to download a picture from a specific website that requires a login.

There are two websites from the same company, cgwallpapers.com and gamewallpapers.com. With the help of the user who answered the other question I finally managed to automate the downloads for one of the websites (cgwallpapers.com), but I'm not able to reproduce the same steps on the gamewallpapers.com website.

I may be wrong about some of the things I'm going to say, due to my inexperience with web requests, so if a helper/expert has the time, please verify that the parameters and other details really are as I describe them.

In cgwallpapers.com, I basically set the query like this to download a wallpaper:

http://www.cgwallpapers.com/members/getwallpaper.php?id=100&res=1920x1080

But I found that on gamewallpapers.com I cannot use the same query string, because the download URL seems to look like this:

http://www.gamewallpapers.com/members/getwallpaper.php?wallpaper=wallpaper_ancient_space_01_1920x1080.jpg&keystr=1423106012&retry=

On cgwallpapers.com it is easier, because I can just use an incremental For loop over the ids with a specific wallpaper resolution, but for the gamewallpapers.com site I can't figure out how to automate the wallpaper downloads; it seems to need a totally different treatment, if I'm not wrong.

So, I don't know what to try or even how to do it.

After logging into gamewallpapers.com, this is how I'm trying to download a wallpaper. Of course this does not work, because I'm not using the proper query, but this code worked for the cgwallpapers.com site, so I'll show it in case it helps:

NOTE: WallpaperInfo is a non-relevant object that I use to return the downloaded wallpaper image stream, it is much code so I skipped it.

''' <summary>
''' Tries to download the specified wallpaper from GameWallpapers server.
''' </summary>
''' <param name="id">The wallpaper id.</param>
''' <param name="res">The wallpaper resolution.</param>
''' <param name="cookieCollection">
''' The cookie collection of the logged-in session. May be <c>Nothing</c>,
''' in which case the request is sent without cookies.
''' </param>
''' <returns>
''' A <see cref="WallpaperInfo"/> instance containing the wallpaper info and the image stream,
''' or <c>Nothing</c> when the response carries no Content-Disposition header
''' (that is, there is no image to download).
''' </returns>
''' <remarks>
''' Exceptions raised while requesting or reading the response are not handled here;
''' they propagate to the caller unchanged.
''' </remarks>
Private Function GetWallpaperMethod(ByVal id As String,
                                    ByVal res As String,
                                    ByRef cookieCollection As CookieCollection) As WallpaperInfo

    Dim url As String = String.Format("http://www.gamewallpapers.com/members/getwallpaper.php?id={0}&res={1}", id, res)

    Dim request As HttpWebRequest = DirectCast(WebRequest.Create(url), HttpWebRequest)
    With request
        .Method = "GET"
        .Headers.Add("Accept-Language", "en-US,en;q=0.5")
        ' Let the framework advertise gzip/deflate AND decode the response body.
        ' Adding the Accept-Encoding header by hand (as before) asks the server for a
        ' compressed body but leaves it compressed, which would corrupt the image stream.
        .AutomaticDecompression = DecompressionMethods.GZip Or DecompressionMethods.Deflate
        .Headers.Add("Keep-Alive", "300")
        .Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        .AllowAutoRedirect = False
        .UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0"
        .KeepAlive = True
    End With

    If cookieCollection IsNot Nothing Then
        ' Pass cookie info so that we remain logged in.
        request.CookieContainer = Me.SetCookieContainer(url, cookieCollection)
    End If

    ' The Using blocks dispose the response and its stream on every exit path,
    ' so no explicit Close calls or Finally block are needed.
    Using response As HttpWebResponse = DirectCast(request.GetResponse(), HttpWebResponse)

        Dim contentDisposition As String = response.Headers("Content-Disposition")

        If String.IsNullOrEmpty(contentDisposition) Then
            ' No attachment header: there is no image to download.
            Return Nothing
        End If

        ' Everything after the first "=" is taken as the file name. Surrounding spaces,
        ' a trailing ";" and the optional double quotes around the value are stripped.
        Dim filename As String = contentDisposition.Substring(contentDisposition.IndexOf("=") + "=".Length).
                                 TrimStart(" "c, """"c).TrimEnd(" "c, ";"c, """"c)

        ' Buffer the whole image in memory. The stream position is left at the end of
        ' the written data, exactly as the previous implementation left it.
        Dim imageStream As New MemoryStream
        Using responseStream As Stream = response.GetResponseStream()
            responseStream.CopyTo(imageStream)
        End Using

        ' The returned object stores the url, the wallpaper id, the wallpaper resolution,
        ' the wallpaper filename and the downloaded MemoryStream (the wallpaper image stream).
        Return New WallpaperInfo(url:=url,
                                 id:=id,
                                 resolution:=res,
                                 filename:=filename,
                                 imageStream:=imageStream)

    End Using

End Function

''' <summary>
''' Builds a new <see cref="CookieContainer"/> from the given cookies, re-scoped to the host of <paramref name="url"/>.
''' </summary>
''' <param name="url">The absolute url whose host becomes the domain of every copied cookie.</param>
''' <param name="cookieCollection">The cookies to copy.</param>
''' <returns>A container holding a copy (name, value, domain, non-secure) of each accepted cookie.</returns>
''' <remarks>
''' Cookies whose value can be parsed as a date are left out
''' (presumably expiry-style entries; confirm against the login response).
''' </remarks>
Private Function SetCookieContainer(ByVal url As String,
                                    ByVal cookieCollection As CookieCollection) As CookieContainer

    Dim container As New CookieContainer

    For Each sourceCookie As Cookie In cookieCollection

        Dim parsedDate As Date

        ' Skip any cookie whose value is a date.
        If DateTime.TryParse(sourceCookie.Value, parsedDate) Then
            Continue For
        End If

        container.Add(New Cookie With {
            .Name = sourceCookie.Name,
            .Value = sourceCookie.Value,
            .Domain = New Uri(url).Host,
            .Secure = False
        })

    Next sourceCookie

    Return container

End Function

Here is the full source of what I'm trying to achieve, with an example usage of how I expected it to work (a For loop incrementing the wallpaper ids to automate the downloads). It works perfectly when CHANGING the base url name from gamewallpapers.com to cgwallpapers.com, because this source only works for cgwallpapers.com, but I'm trying it here with the gamewallpapers.com url:

http://pastebin.com/eyBxHmnJ


Solution

  • Update:

    As promised, I have come up with a "proper" solution to your question for gamewallpapers.com using the Telerik Testing Framework.

    You must change the sUsername and sPassword variables to your own username/password to successfully log into the site.

    Optional variables that you may want to change:

    • sResolutionString: Defaults to 1920x1080 which is what you specified in your original question. Change this value to any of the accepted resolution values on the website. Just a warning that I am not 100% sure if all images have the same resolutions so changing this value may cause some images to be skipped if they do not have an image in the desired resolution.
    • sDownloadPath: Currently set to the same folder as the application exe. Change this to the path where you want to download your images.
    • sUserAgent: Defaults to the user agent for Internet Explorer 11 for Windows 7. Since the Telerik Testing Framework controls a real browser (whatever IE version you have installed on your pc in this case), it uses the "real" user agent when sending requests. This variable user agent string is only used when downloading wallpapers using HttpWebRequest and the default is most likely unnecessary since the included code will capture the user agent used by Telerik and save it for later use.
    • nMaxSkippedFilesInSuccession: Set to 10 by default. When trying to download a wallpaper image, the app will check if the filename already exists in your download directory. If it exists then the file will not be downloaded and a skip counter will be incremented. If the skip counter reaches the value of nMaxSkippedFilesInSuccession then the app stops as it assumes you have downloaded the rest of the files in a previous session. Note: In theory this value could even be set to 1 or 2 as the filenames are very unique and therefore would never overlap. The problem is that the toplist.php page is sorted by date and if in the middle of you running this app they add x new images then when you go to the next page the images will be shifted by x. If x is greater than nMaxSkippedFilesInSuccession then you will most likely find that the app will end prematurely as you will be trying to download a number of the same images over again because of the shift.
    • nCurrentPageID: Set to 0 by default. The list page toplist.php accepts a query string argument called Start which tells the page which index to start from depending on your chosen search arguments. The list shows 24 images per page so the nCurrentPageID variable must be divisible by 24 or else you may end up skipping images. Depending on time and circumstances you may not be able to download all images in one session. If this is the case you can remember which nCurrentPageID you left off on and update this variable accordingly to start on a different id next time (keep in mind that the images may get shifted as new wallpapers are added to the site since the list page is sorted by wallpaper date).

    To use the Telerik Testing Framework you only need to install the setup file and then include a reference to ArtOfTest.WebAii.dll.

    One quirk about using the testing framework (at least with internet explorer) is that it doesn't allow you to start the browser as a hidden process. I have talked to telerik support about this and they claim that it is not possible to do although other web scraping frameworks like Watin do support this feature (I personally still prefer Watin for this and other reasons but it is quite old now and not updated since 2011). Since it is nice to run web scraping tasks in the background without bothering you from using your computer, this example starts the browser minimized (which telerik does support) and then uses windows api calls to hide the browser process. This is a bit of a hack but it is useful and works well in my experience.

    In my original answer I mentioned that you would most likely have to crawl the toplist.php page by clicking links and building the download url but I was able to get this to work without clicking into any pages other than toplist.php. This is only possible because the wallpaper filename (which is basically the id that you need to download with) is partially contained in the preview image. I also originally thought that the keystr query string parameter was some kind of id that "protected" the download but it is actually not required at all to get the wallpaper.

    One last thing to mention is that the toplist.php page can be sorted by rating or date. Rating is very volatile and subject to change at any moment as people vote for images so this is not a good sort method for this type of work. We use the date in this case because it works well for sorting and should always have the images in the same order as before but there is a small issue: It doesn't seem to allow you to sort in the reverse order. Therefore the newest images always appear at the top on the first page. This causes images to shift over in the list and will most likely cause you to re-test the same images over again when this happens. For cgwallpapers.com this is not a problem because new images will receive a new (higher) id value and we can just remember the last id that we left off on and test the next id in succession to see if there are new images. For gamewallpapers.com we always re-run from pageid 0 and keep going until we reach a certain number of skipped files to know when we have found the end of the images since last download.

    Here is the code. Let me know if you have questions:

    Imports ArtOfTest.WebAii.Core
    Imports System.Runtime.InteropServices
    
    Public Class Form1
        ' Site credentials. Replace the placeholders with your own login before running.
        Const sUsername As String = "USERNAMEHERE"
        Const sPassword As String = "PASSWORDHERE"
        ' Page that is loaded first; used for the login and as the source of the session cookies.
        Const sMainURL As String = "http://www.gamewallpapers.com"
        ' List page that is crawled for preview images.
        Const sListURL As String = "http://www.gamewallpapers.com/members/toplist.php"
        ' Fixed search arguments (sorted by date); the page start index is appended after "start=".
        Const sListQueryString As String = "?action=go&title=&maxage=0&latestnr=0&platform=&resolution=&cyberbabes=&membersonly2=&rating=0&minimumvotes2=0&sort=date&start="
        ' Download endpoint; the wallpaper file name is appended after "wallpaper=".
        Const sDownloadURL As String = "http://www.gamewallpapers.com/members/getwallpaper.php?wallpaper="
        ' Resolution suffix inserted into every requested wallpaper file name.
        Const sResolutionString As String = "1920x1080"
        ' Folder the wallpapers are saved to (defaults to the application's startup folder).
        Private sDownloadPath As String = Application.StartupPath
        Private sUserAgent As String = "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko"    ' Defaults to the IE11 user agent; RequestHandler replaces it with the one captured from the browser
        ' Cookies copied from the browser session and attached to every HttpWebRequest download.
        Private oCookieContainerObject As New System.Net.CookieContainer
        ' Number of consecutive already-downloaded files after which the scrape stops.
        Private nMaxSkippedFilesInSuccession As Int32 = 10
        Private nCurrentPageID As Int32 = 0 ' Only increment this value in steps of 24 or else you may miss some images
    
        ' Outcome of a single DownloadImage call.
        Private Enum oDownloadResult
            Failed = 0
            Success = 1
            Skipped = 2
        End Enum
    
        ''' <summary>
        ''' Runs the scrape as soon as the form has loaded.
        ''' </summary>
        Private Sub Form1_Load(sender As Object, e As EventArgs) Handles MyBase.Load
            Me.StartScrape()
        End Sub
    
        ''' <summary>
        ''' Logs into the site through a hidden Internet Explorer instance, then walks the
        ''' date-sorted list pages and downloads each wallpaper found, in the resolution
        ''' given by <c>sResolutionString</c>.
        ''' </summary>
        ''' <remarks>
        ''' The crawl stops when a list page contains no preview images, or when
        ''' <c>nMaxSkippedFilesInSuccession</c> already-downloaded files are met in a row.
        ''' Any exception is shown in a message box, and the browser is always cleaned up.
        ''' </remarks>
        Private Sub StartScrape()
            Dim oBrowser As Manager = Nothing

            Try
                ' Start Internet Explorer

                Dim oSettings As New Settings

                oSettings.Web.DefaultBrowser = BrowserType.InternetExplorer
                oSettings.DisableDialogMonitoring = False
                oSettings.UnexpectedDialogAction = UnexpectedDialogAction.DoNotHandle
                oSettings.Web.UseHttpProxy = True   ' This must be enabled for us to get the headers being sent and know what the user agent is dynamically

                oBrowser = New Manager(oSettings)

                oBrowser.Start()
                oBrowser.LaunchNewBrowser(oSettings.Web.DefaultBrowser, True, ProcessWindowStyle.Minimized) ' Start minimized

                ' Set up a proxy so that we can capture the request headers

                Dim li As New ArtOfTest.WebAii.Messaging.Http.RequestListenerInfo(AddressOf RequestHandler)

                oBrowser.Http.AddBeforeRequestListener(li)  ' Add proxy listener

                ' Hide the browser window

                HideBrowser(oBrowser)

                ' Load the main url

                oBrowser.ActiveBrowser.NavigateTo(sMainURL)
                oBrowser.ActiveBrowser.WaitUntilReady()

                oBrowser.Http.RemoveBeforeRequestListener(li)   ' Remove proxy listener
                oBrowser.ActiveBrowser.RefreshDomTree()

                Dim bLoggedIn As Boolean = False

                ' Wait for the main logo image to show so that we know we have the right page

                oBrowser.ActiveBrowser.WaitForElement(New HtmlFindExpression("Tagname=div", "Id=clickable_logo"), 30000, False)
                Threading.Thread.Sleep(3000)    ' Wait 3 seconds to prevent loading pages too quickly
                oBrowser.ActiveBrowser.RefreshDomTree()

                ' Check if we are logged in already or if we need to log in

                If oBrowser.ActiveBrowser.Find.ByExpression("Tagname=div", "Id=logout", "InnerText=Logout") IsNot Nothing Then
                    ' The logout button is present, therefore we are already logged in
                    bLoggedIn = True
                ElseIf oBrowser.ActiveBrowser.Find.ByExpression("Tagname=input", "Name=email") IsNot Nothing AndAlso oBrowser.ActiveBrowser.Find.ByExpression("Tagname=input", "Name=wachtwoord") IsNot Nothing Then
                    ' Log in. The form elements are looked up again after the refresh on purpose,
                    ' so that the actions run against the current DOM tree.

                    oBrowser.ActiveBrowser.RefreshDomTree()
                    oBrowser.ActiveBrowser.Actions.SetText(oBrowser.ActiveBrowser.Find.ByExpression("Tagname=input", "Name=email"), sUsername)
                    oBrowser.ActiveBrowser.Actions.SetText(oBrowser.ActiveBrowser.Find.ByExpression("Tagname=input", "Name=wachtwoord"), sPassword)
                    oBrowser.ActiveBrowser.Actions.Click(oBrowser.ActiveBrowser.Find.ByExpression("Tagname=div", "Id=login", "InnerText=Login"))

                    ' Wait for page to load

                    oBrowser.ActiveBrowser.WaitUntilReady()
                    oBrowser.ActiveBrowser.WaitForElement(New HtmlFindExpression("Tagname=div", "Id=logout", "InnerText=Logout"), 30000, False)   ' Wait until Logout button is loaded
                    bLoggedIn = True
                Else
                    ' Didn't find any controls that we were looking for. Maybe the page was updated recently?

                    MessageBox.Show("Error loading page. Maybe the html changed?")
                End If

                If bLoggedIn Then
                    Dim bStop As Boolean = False
                    Dim oURI As Uri = New Uri(sMainURL)
                    Dim nSkippedFiles As Int32 = 0

                    ' Save cookies from browser to use with HttpWebRequest later.
                    ' The browser is asked for its cookies once, instead of three times per cookie.

                    Dim oBrowserCookies = oBrowser.ActiveBrowser.Cookies.GetCookies(oURI.Scheme & Uri.SchemeDelimiter & oURI.Host)

                    For c As Int32 = 0 To oBrowserCookies.Count - 1
                        Dim oCookie As New System.Net.Cookie
                        oCookie.Name = oBrowserCookies(c).Name
                        oCookie.Value = oBrowserCookies(c).Value
                        oCookie.Domain = oURI.Host
                        oCookie.Secure = False
                        oCookieContainerObject.Add(oCookie)
                    Next

                    Threading.Thread.Sleep(3000)    ' Wait 3 seconds to prevent loading pages too quickly

                    Do Until bStop
                        ' Browse to the list url

                        Dim sCurrentListURL As String = sListURL & sListQueryString & nCurrentPageID

                        oBrowser.ActiveBrowser.NavigateTo(sCurrentListURL)
                        oBrowser.ActiveBrowser.WaitUntilReady()

                        ' Get all preview images on the page with a single DOM query.
                        ' The browser does not navigate while the images are processed, so one
                        ' lookup per page replaces the former lookup per image.

                        Dim oPreviewImages = oBrowser.ActiveBrowser.Find.AllByExpression("Tagname=img", "Class=toggleTooltip")

                        If oPreviewImages.Count > 0 Then
                            For i As Int32 = 0 To oPreviewImages.Count - 1
                                ' Convert the preview image browser element into an HtmlImage

                                Dim oHtmlImage As ArtOfTest.WebAii.Controls.HtmlControls.HtmlImage = oPreviewImages(i).[As](Of ArtOfTest.WebAii.Controls.HtmlControls.HtmlImage)()

                                ' Extract the filename and extension from the preview image

                                Dim sPreviewImageFilename As String = System.IO.Path.GetFileNameWithoutExtension(oHtmlImage.Src)
                                Dim sPreviewImageFileExtension As String = System.IO.Path.GetExtension(oHtmlImage.Src)

                                ' Create a proper download url using the preview image filename and download the file in the resolution that we want using HttpWebRequest

                                Select Case DownloadImage(sDownloadURL & sPreviewImageFilename & "_" & sResolutionString & sPreviewImageFileExtension, sCurrentListURL)
                                    Case oDownloadResult.Success
                                        nSkippedFiles = 0   ' Reset skipped files back to zero
                                    Case oDownloadResult.Skipped
                                        nSkippedFiles += 1  ' Increment skipped files by one since we have already downloaded this file previously
                                    Case oDownloadResult.Failed
                                        ' The image didn't download properly.
                                        ' Do whatever error handling in here that you want to
                                        ' Maybe save the filename to a log file so you know which file(s) failed and download them again later?
                                End Select

                                If nSkippedFiles >= nMaxSkippedFilesInSuccession Then
                                    ' We have skipped the maximum amount of files in a row so we must have downloaded them all (This should only ever happen on the 2nd+ run)
                                    bStop = True
                                    Exit For
                                Else
                                    Threading.Thread.Sleep(3000)    ' Wait 3 seconds to prevent loading pages too quickly
                                End If
                            Next

                            ' Increment the 'Start' querystring value by 24 to simulate clicking the 'Next' button and load the next 24 images
                            nCurrentPageID += 24
                        Else
                            ' No more images were found so we stop the application
                            bStop = True
                        End If
                    Loop
                End If
            Catch ex As Exception
                MessageBox.Show(ex.Message)
            Finally
                ' Ensure browser is closed when we exit
                CleanupBrowser(oBrowser)
            End Try
        End Sub
    
        ''' <summary>
        ''' Before-request listener that records the User-Agent header the browser sends,
        ''' so that later HttpWebRequest downloads can present the same user agent.
        ''' </summary>
        ''' <param name="sender">The event source (not used).</param>
        ''' <param name="e">The intercepted request whose headers are inspected.</param>
        Private Sub RequestHandler(sender As Object, e As ArtOfTest.WebAii.Messaging.Http.HttpRequestEventArgs)
            Dim sCapturedUserAgent As String = e.Request.Headers("User-Agent")

            ' Only overwrite the stored value when the request actually carried a user agent.
            ' Otherwise a request without that header would replace the built-in default
            ' with an empty value.
            If Not String.IsNullOrEmpty(sCapturedUserAgent) Then
                sUserAgent = sCapturedUserAgent
            End If
        End Sub
    
        ''' <summary>
        ''' Downloads one wallpaper with HttpWebRequest, using the cookies and user agent
        ''' captured from the browser session, and saves it into <c>sDownloadPath</c>.
        ''' </summary>
        ''' <param name="sPage">The full download url of the wallpaper.</param>
        ''' <param name="sReferer">The url sent as the Referer header; ignored when empty.</param>
        ''' <returns>
        ''' <c>Success</c> when the file was saved, <c>Skipped</c> when a file with the same
        ''' name already exists, and <c>Failed</c> when the response names no attachment
        ''' or a <see cref="System.Net.WebException"/> occurred.
        ''' </returns>
        ''' <remarks>
        ''' Exceptions other than <see cref="System.Net.WebException"/> propagate to the caller.
        ''' </remarks>
        Private Function DownloadImage(ByVal sPage As String, sReferer As String) As oDownloadResult
            Dim oReturn As oDownloadResult = oDownloadResult.Failed
            Dim sPartialPath As String = Nothing    ' Set only while a download is in progress

            Try
                Dim req As System.Net.HttpWebRequest = DirectCast(System.Net.WebRequest.Create(sPage), System.Net.HttpWebRequest)
                req.Method = "GET"
                req.AllowAutoRedirect = False
                req.UserAgent = sUserAgent
                req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                req.Headers.Add("Accept-Language", "en-US,en;q=0.5")
                ' Let the framework advertise gzip/deflate and decode the body. A hand-written
                ' Accept-Encoding header would leave a compressed body undecoded on disk.
                req.AutomaticDecompression = System.Net.DecompressionMethods.GZip Or System.Net.DecompressionMethods.Deflate
                req.Headers.Add("Keep-Alive", "300")
                req.KeepAlive = True

                If Not String.IsNullOrEmpty(sReferer) Then
                    ' The referer parameter was previously accepted but never sent
                    req.Referer = sReferer
                End If

                If oCookieContainerObject IsNot Nothing Then
                    ' Set cookie info so that we continue to be logged in
                    req.CookieContainer = oCookieContainerObject
                End If

                Using oResponse As System.Net.HttpWebResponse = DirectCast(req.GetResponse(), System.Net.HttpWebResponse)
                    Dim sContentDisposition As String = oResponse.Headers("Content-Disposition")
                    Dim sFilename As String = String.Empty

                    If sContentDisposition IsNot Nothing Then
                        Dim nNameIndex As Integer = sContentDisposition.IndexOf("filename=", StringComparison.OrdinalIgnoreCase)

                        ' A header without "filename=" used to raise an ArgumentOutOfRangeException;
                        ' it is now treated as "nothing to download".
                        If nNameIndex >= 0 Then
                            sFilename = sContentDisposition.Substring(nNameIndex + "filename=".Length).Replace("""", "").Replace(";", "").Trim

                            ' The name comes from the server: keep only the part after the last
                            ' path or drive separator so it cannot point outside sDownloadPath.
                            sFilename = sFilename.Substring(sFilename.LastIndexOfAny(New Char() {"\"c, "/"c, ":"c}) + 1).Trim
                        End If
                    End If

                    If sFilename.Length > 0 Then
                        Dim sFullPath As String = System.IO.Path.Combine(sDownloadPath, sFilename)

                        If System.IO.File.Exists(sFullPath) Then
                            oReturn = oDownloadResult.Skipped   ' We have downloaded this file before so skip it
                        Else
                            ' Save to a temporary name first and rename when complete, so an
                            ' interrupted download never leaves a truncated file that a later
                            ' run would mistake for an already-downloaded wallpaper.
                            sPartialPath = sFullPath & ".part"

                            Using responseStream As System.IO.Stream = oResponse.GetResponseStream()
                                Using fs As New System.IO.FileStream(sPartialPath, System.IO.FileMode.Create, System.IO.FileAccess.Write)
                                    responseStream.CopyTo(fs)
                                End Using
                            End Using

                            System.IO.File.Move(sPartialPath, sFullPath)
                            sPartialPath = Nothing  ' Nothing left to clean up

                            oReturn = oDownloadResult.Success
                        End If
                    End If
                End Using
            Catch exc As System.Net.WebException
                MessageBox.Show("Network Error: " & exc.Message.ToString & " Status Code: " & exc.Status.ToString & " from " & sPage, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error)
                oReturn = oDownloadResult.Failed
            Finally
                If sPartialPath IsNot Nothing Then
                    ' Best-effort removal of an incomplete download; a failure here must not
                    ' hide the exception that interrupted the download.
                    Try
                        System.IO.File.Delete(sPartialPath)
                    Catch exCleanup As System.IO.IOException
                    Catch exCleanup As UnauthorizedAccessException
                    End Try
                End If
            End Try

            Return oReturn
        End Function
    
        ''' <summary>
        ''' Minimizes and then hides the window of the active browser.
        ''' </summary>
        ''' <param name="oBrowser">The manager whose active browser window is hidden.</param>
        ''' <remarks>
        ''' The window handle is polled up to 10 times, 100 ms apart. If it is still zero
        ''' afterwards, an error message box is shown and nothing is hidden.
        ''' </remarks>
        Private Sub HideBrowser(ByRef oBrowser As Manager)

            Dim hBrowserWindow As IntPtr = IntPtr.Zero
            Dim nAttempt As Integer = 0

            ' Poll for the window handle
            Do
                nAttempt += 1
                hBrowserWindow = oBrowser.ActiveBrowser.Window.Handle
                If hBrowserWindow <> IntPtr.Zero Then Exit Do
                Threading.Thread.Sleep(100)
            Loop While nAttempt < 10

            If hBrowserWindow = IntPtr.Zero Then
                ' no window handle?
                MessageBox.Show("Error - Unable to get a window handle")
                Return
            End If

            ' use ShowWindowAsync to change app window state (minimize and hide it).
            ShowWindowAsync(hBrowserWindow, ShowWindowCommands.Minimize)
            ShowWindowAsync(hBrowserWindow, ShowWindowCommands.Hide)
        End Sub
    
        ''' <summary>
        ''' Closes the active browser (when there is one), disposes the manager and
        ''' clears the caller's reference.
        ''' </summary>
        ''' <param name="oBrowser">The manager to shut down; set to <c>Nothing</c> on return.</param>
        Private Sub CleanupBrowser(ByRef oBrowser As Manager)
            If oBrowser IsNot Nothing Then
                If oBrowser.ActiveBrowser IsNot Nothing Then
                    oBrowser.ActiveBrowser.Close()
                End If

                oBrowser.Dispose()
            End If

            oBrowser = Nothing
        End Sub
    End Class
    
    ' Native window helpers used to hide the automated browser window.
    Module Module1
        ' Show-state commands passed as nCmdShow to ShowWindowAsync.
        ' NOTE(review): the values appear to mirror the Win32 SW_* constants
        ' (SW_HIDE = 0 ... SW_FORCEMINIMIZE = 11) - confirm against the ShowWindow documentation.
        ' Maximize and ShowMaximized intentionally share the value 3.
        Public Enum ShowWindowCommands As Integer
            Hide = 0
            Normal = 1
            ShowMinimized = 2
            Maximize = 3
            ShowMaximized = 3
            ShowNoActivate = 4
            Show = 5
            Minimize = 6
            ShowMinNoActive = 7
            ShowNA = 8
            Restore = 9
            ShowDefault = 10
            ForceMinimize = 11
        End Enum
    
        ' P/Invoke declaration for ShowWindowAsync in user32.dll.
        ' hWnd is the handle of the window to change; nCmdShow is marshalled as a 32-bit
        ' integer and the native result is marshalled as a Boolean.
        ' The body is empty because the call is forwarded to the native function.
        <DllImport("user32.dll", SetLastError:=True)> _
        Public Function ShowWindowAsync(hWnd As IntPtr, <MarshalAs(UnmanagedType.I4)> nCmdShow As ShowWindowCommands) As <MarshalAs(UnmanagedType.Bool)> Boolean
        End Function
    End Module