Search code examples
c#.netpaginationweb-crawlerwatin

Not able to fetch links from paginated results using the WatiN DLL


Hi, I am collecting URLs using the WatiN framework. I want to traverse all the result pages, collect the links, and save them in one text file. I don't know how to add the pagination function. Here is my code.

using System.Text;
using System.Threading.Tasks;
using WatiN.Core;
namespace magicbricks
{
    /// <summary>
    /// Console program that opens a single 99acres search-result page in
    /// Internet Explorer via WatiN and prints the URLs of its links.
    /// </summary>
    class Class1
    {
        /// <summary>
        /// Loads the search-result page, writes every link URL that contains
        /// the letter "b" to the console, then waits for a line of input.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        [STAThread]
        static void Main(string[] args)
        {
            const string searchResultsUrl = "http://www.99acres.com/property-in-chennai-ffid?search_type=QS&search_location=HP&lstAcn=HP_R&src=CLUSTER&isvoicesearch=N&keyword_suggest=chennai%20%28all%29%3B&fullSelectedSuggestions=chennai%20%28all%29&strEntityMap=W3sidHlwZSI6ImNpdHkifSx7IjEiOlsiY2hlbm5haSAoYWxsKSIsIkNJVFlfMzIsIFBSRUZFUkVOQ0VfUywgUkVTQ09NX1IiXX1d&texttypedtillsuggestion=chennai&refine_results=Y&Refine_Localities=Refine%20Localities&action=%2Fdo%2Fquicksearch%2Fsearch&suggestion=CITY_32%2C%20PREFERENCE_S%2C%20RESCOM_R";

            IE browser = new IE();
            browser.GoTo(searchResultsUrl);

            foreach (var anchor in browser.Links)
            {
                string address = anchor.Url;

                // Skip links whose URL does not contain the filter text.
                if (!address.Contains("b"))
                {
                    continue;
                }

                Console.WriteLine(address);
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
    }
}

any help will be appreciated.


Solution

  • Here is a working solution. I changed your code a bit.

    using System;
    using WatiN.Core;
    
    namespace magicbricks
    {
    /// <summary>
    /// Crawls the 99acres search results page by page with WatiN and prints
    /// every link URL that contains the letter "b" to the console.
    /// The highest page number is discovered from the links whose "name"
    /// attribute contains "page", and the address of each following page is
    /// rebuilt from the parts of such a link's URL.
    /// </summary>
    static class Class1
    {
        /// <summary>Address of the first search-result page.</summary>
        private const string StartUrl = "http://www.99acres.com/property-in-chennai-ffid?search_type=QS&search_location=HP&lstAcn=HP_R&src=CLUSTER&isvoicesearch=N&keyword_suggest=chennai%20%28all%29%3B&fullSelectedSuggestions=chennai%20%28all%29&strEntityMap=W3sidHlwZSI6ImNpdHkifSx7IjEiOlsiY2hlbm5haSAoYWxsKSIsIkNJVFlfMzIsIFBSRUZFUkVOQ0VfUywgUkVTQ09NX1IiXX1d&texttypedtillsuggestion=chennai&refine_results=Y&Refine_Localities=Refine%20Localities&action=%2Fdo%2Fquicksearch%2Fsearch&suggestion=CITY_32%2C%20PREFERENCE_S%2C%20RESCOM_R";

        /// <summary>Prefix that precedes the page number in a pagination link URL.</summary>
        private const string NextPageUrlPrefix = "http://www.99acres.com/property-in-chennai-ffid-page-";

        // Last link seen whose name contains "nextbutton". It is recorded but not
        // used by the crawl loop, which navigates by URL instead of clicking.
        private static WatiN.Core.Link _nextPageElement;

        // URL text before and after the page number, taken from a pagination link.
        private static string _firstPartOfAddress = "";
        private static string _lastPartOfAddress = "";

        // Highest page number found in any pagination link so far.
        private static int _maxPageCounter = 0;

        /// <summary>
        /// Scans the first result page, then visits pages 2 through the highest
        /// page number discovered, printing the matching links of each page.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Dispose the browser when done so no Internet Explorer instance is left behind.
            using (IE ie = SetUpBrowser())
            {
                EnterFirstWebpage(ie);
                ie.WaitForComplete();
                LookForAllLinks(ie);

                // "<=" so that the last page is visited as well. _maxPageCounter is
                // re-read on every iteration because scanning a later page can
                // reveal links to higher page numbers.
                for (int i = 2; i <= _maxPageCounter; i++)
                {
                    string nextPageAddress = AssembleNextPageWebAddress(i);
                    Console.WriteLine("----------------------------Next Page {0}---------------------------", i);
                    Console.WriteLine(nextPageAddress);
                    EnterNextWebpageUrl(ie, nextPageAddress);
                    LookForAllLinks(ie);
                }

                // Keep the console (and the browser) open until a key is pressed.
                Console.ReadKey();
            }
        }

        /// <summary>Creates the Internet Explorer instance used for the crawl.</summary>
        /// <returns>A new WatiN <c>IE</c> browser object.</returns>
        private static IE SetUpBrowser()
        {
            IE ie = new IE();
            return ie;
        }

        /// <summary>Navigates the browser to the first search-result page.</summary>
        /// <param name="ie">Browser to navigate.</param>
        private static void EnterFirstWebpage(IE ie)
        {
            ie.GoTo(StartUrl);
        }

        /// <summary>Navigates the browser to <paramref name="url"/> and waits for the page to load.</summary>
        /// <param name="ie">Browser to navigate.</param>
        /// <param name="url">Address of the page to open.</param>
        private static void EnterNextWebpageUrl(IE ie, string url)
        {
            ie.GoTo(url);
            ie.WaitForComplete();
        }

        /// <summary>
        /// Prints every link URL on the current page that contains "b" and
        /// updates the pagination state from the links named like page links.
        /// </summary>
        /// <param name="ie">Browser showing the page to scan.</param>
        private static void LookForAllLinks(IE ie)
        {
            foreach (var currLink in ie.Links)
            {
                string url = currLink.Url;

                // Links without a URL, or without the filter text, are skipped.
                if (url == null || !url.Contains("b"))
                {
                    continue;
                }

                Console.WriteLine(url);

                // The name may be missing; check for null instead of catching the exception.
                string linkName = currLink.Name;
                if (linkName != null && linkName.Contains("nextbutton"))
                {
                    _nextPageElement = currLink;
                }

                string nameAttribute = currLink.GetAttributeValue("name");
                if (nameAttribute != null && nameAttribute.Contains("page"))
                {
                    RememberPaginationLink(url);
                }
            }
        }

        /// <summary>
        /// Parses a pagination link URL of the form
        /// <c>prefix + pageNumber + "?" + query</c>. When it parses, stores the
        /// parts around the page number and raises the highest known page number.
        /// URLs that do not have this form are ignored.
        /// </summary>
        /// <param name="url">URL of a link whose name attribute contains "page".</param>
        private static void RememberPaginationLink(string url)
        {
            if (!url.StartsWith(NextPageUrlPrefix, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            string afterPrefix = url.Substring(NextPageUrlPrefix.Length);
            int queryStart = afterPrefix.IndexOf('?');
            if (queryStart < 0)
            {
                return;
            }

            int pageNumber;
            if (!int.TryParse(afterPrefix.Substring(0, queryStart), out pageNumber))
            {
                return;
            }

            // Only store the address parts once the whole URL has parsed, so a
            // malformed link cannot corrupt the addresses built later.
            _firstPartOfAddress = url.Substring(0, NextPageUrlPrefix.Length);
            _lastPartOfAddress = afterPrefix.Substring(queryStart);

            if (pageNumber > _maxPageCounter)
            {
                _maxPageCounter = pageNumber;
            }
        }

        /// <summary>Builds the address of the result page with the given number.</summary>
        /// <param name="pageNumber">Number of the page to address.</param>
        /// <returns>The stored first address part, the page number, and the stored last address part, concatenated.</returns>
        private static string AssembleNextPageWebAddress(int pageNumber)
        {
            return _firstPartOfAddress + pageNumber + _lastPartOfAddress;
        }
    }
    }
    

    Some explanation :

    The variable _maxPageCounter holds the highest page number found among the pagination links, i.e. how many pages to scan for links. It is computed here:

    // Excerpt from LookFoAllLinks: handles a link whose "name" attribute contains "page"
    // (presumably the site's pagination anchors; verify against the page markup).
    // nextPageUrl, tmpUrl and currentpageCounter are locals declared in the full listing.
    if (currLink.GetAttributeValue("name").Contains("page"))
                        {
                            // Keep the first nextPageUrl.Length characters of the URL as the address prefix.
                            _firstPartOfAddress = currLink.Url.Substring(0, nextPageUrl.Length);
                            // Drop that prefix; the remainder is expected to look like "<page number>?<query>".
                            tmpUrl = currLink.Url.Remove(0,nextPageUrl.Length);
                            // Everything from the first '?' onward becomes the address suffix.
                            _lastPartOfAddress = tmpUrl.Substring(tmpUrl.IndexOf("?"));
                            // The text before the '?' is the candidate page number.
                            tmpUrl = tmpUrl.Substring(0,tmpUrl.IndexOf("?"));
                            // The TryParse result is ignored; a failed parse leaves currentpageCounter at 0.
                            int.TryParse(tmpUrl, out currentpageCounter);
                            // Track the largest page number seen so far.
                            if (currentpageCounter > _maxPageCounter)
                            {
                                _maxPageCounter = currentpageCounter;
                                currentpageCounter = 0;
                            }
                        }
    

    After that we simply loop through the pages, building the address of each next page:

    /// <summary>
    /// Builds a page address by placing the page number between the stored
    /// first and last parts of the address.
    /// </summary>
    /// <param name="pageNumber">Number of the page to address.</param>
    /// <returns>The concatenated page address.</returns>
    private static string AssembleNextPageWebAddress(int pageNumber)
    {
        string pageNumberText = pageNumber.ToString();
        return string.Concat(_firstPartOfAddress, pageNumberText, _lastPartOfAddress);
    }
    

    Alternatively, we could locate the "next" button and click it in a loop. I hope this was helpful.