Search code examples
c#screen-scraping

submitting form programmatically


I'm trying to submit a specific form programmatically, but I always get the initial page back. I must be doing something wrong or missing something here. I'm sending the session cookie and some POST data such as the view state (which I parse from the initial request) and the session ID (the value I change in the form to get data from other years). But in the second request I always get data for session 899 instead of the one I request, 875.

Here is the code used; any help is greatly appreciated.

// Example invocation: request the page for session id 875.
retrieveEdmIndexForSession(875);

 /// <summary>
 /// Downloads the EDM list page to obtain its cookies and view state, then
 /// re-posts the page's form with the requested session selected.
 /// </summary>
 /// <param name="sessionId">Value posted as the <c>_MenuCtrl:ddlSession</c> form field.</param>
 /// <returns>The HTML body of the response to the POST request.</returns>
 protected string retrieveEdmIndexForSession(int sessionId) {

    const string PathRemote = @"http://edmi.parliament.uk/EDMi/EDMList.aspx";
    const string AcceptHeader = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    // One container shared by both requests, so the cookies received on the
    // GET are sent back with the POST.
    CookieContainer cookies = new CookieContainer();
    string pageData;

    /*
     * Download the index page so we can get cookies and view state from it.
     */
    HttpWebRequest oRequest = (HttpWebRequest)WebRequest.Create(PathRemote);
    oRequest.Method = "GET";
    oRequest.AllowAutoRedirect = true;
    oRequest.CookieContainer = cookies;
    oRequest.Accept = AcceptHeader;
    oRequest.Referer = PathRemote;

    // Dispose the response and reader so the underlying connection is released.
    using (HttpWebResponse oResponse = (HttpWebResponse)oRequest.GetResponse())
    using (StreamReader sr = new StreamReader(oResponse.GetResponseStream()))
    {
        pageData = sr.ReadToEnd();
    }

    /*
     * Extract the (already URL-encoded) view state from the page.
     */
    string viewState = this.ExtractViewState(pageData);

    /*
     * Submit the form with the parameters we want.
     */
    oRequest = (HttpWebRequest)WebRequest.Create(PathRemote);
    oRequest.Method = "POST";
    oRequest.AllowAutoRedirect = true;
    oRequest.ContentType = "application/x-www-form-urlencoded";
    oRequest.CookieContainer = cookies;
    oRequest.Accept = AcceptHeader;
    oRequest.Referer = PathRemote;

    // NOTE(review): this posts ddlStatus=1 together with a new session value; the
    // server reportedly keeps the default session when both fields change in one
    // postback - verify against the live form.
    string postdata = "__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=" + viewState + "&_MenuCtrl%3AddlSession=" + sessionId + "&_MenuCtrl%3A_GoTo.x=57&_MenuCtrl%3A_GoTo.y=14&ddlStatus=1&ddlSortedBy=1";
    byte[] buffer = Encoding.UTF8.GetBytes(postdata);
    oRequest.ContentLength = buffer.Length;

    /*
     * The post data must be written to the request stream before the
     * response is requested.
     */
    using (Stream sw = oRequest.GetRequestStream())
    {
        sw.Write(buffer, 0, buffer.Length);
    }

    /*
     * Connect, send and read the response.
     */
    using (HttpWebResponse oResponse = (HttpWebResponse)oRequest.GetResponse())
    using (StreamReader sr = new StreamReader(oResponse.GetResponseStream()))
    {
        pageData = sr.ReadToEnd();
    }

    return pageData;
}



/// <summary>
/// Pulls the value of the <c>__VIEWSTATE</c> field out of a page and
/// URL-encodes it so it can be placed directly in a form-encoded POST body.
/// </summary>
/// <param name="str">HTML of the page to search; may be null or empty.</param>
/// <returns>
/// The URL-encoded view state, or an empty string when the input is
/// null/empty or no view state field is found.
/// </returns>
private string ExtractViewState(string str)
{
    // Nothing to search: report "not found" instead of letting Regex throw on null.
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // Captures the text between `__VIEWSTATE" value="` and the next double quote.
    const string pattern = "(?<=__VIEWSTATE\" value=\")(?<val>.*?)(?=\")";

    Match match = Regex.Match(str, pattern);
    if (!match.Success)
    {
        return string.Empty;
    }

    // UrlEncode replaces the obsolete UrlEncodeUnicode, which emits the
    // non-standard %uXXXX form for non-ASCII characters. Output is the same
    // for ASCII input such as a base64 view state.
    return HttpUtility.UrlEncode(match.Groups["val"].Value);
}

/// <summary>
/// Finds the first <c>&amp;SESSION=...'</c> fragment in the given text
/// (case-insensitively) and returns the value between the equals sign and
/// the closing single quote.
/// </summary>
/// <param name="str">Text to search, typically the HTML of a page.</param>
/// <returns>The captured session value, or an empty string when there is no match.</returns>
protected string getSessionId(string str)
{
    Match found = Regex.Match(str.Trim(), @"&SESSION=([^']+)'", RegexOptions.IgnoreCase);

    return found.Success ? found.Groups[1].Value : string.Empty;
}

This is the RAW Request being sent by the .NET script.

POST /EDMi/EDMList.aspx HTTP/1.1 Content-Type: application/x-www-form-urlencoded Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 Referer: http://edmi.parliament.uk/EDMi/EDMList.aspx User-Agent: .NET Framework Client Host: edmi.parliament.uk Cookie: ASP.NET_SessionId=k55fqarvx2oszp2wxhtrol45 Content-Length: 2431 Expect: 100-continue

__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=dDwxMDgyNzIxNDQ2O3Q8O2w8aTwzPjs%2bO2w8dDw7bDxpPDE%2bO2k8Mz47aTw1PjtpPDExPjs%2bO2w8dDw7bDxpPDEzPjtpPDE3Pjs%2bO2w8dDx0PHA8cDxsPERhdGFWYWx1ZUZpZWxkO0RhdGFUZXh0RmllbGQ7PjtsPFNFU1NJT05JRDtJVEVNX1ZBTFVFOz4%2bOz47dDxpPDIwPjtAPDA4LTA5OzA3LTA4OzA2LTA3OzA1LTA2OzA0LTA1OzAzLTA0OzAyLTAzOzAxLTAyOzAwLTAxOzk5LTAwOzk4LTk5Ozk3LTk4Ozk2LTk3Ozk1LTk2Ozk0LTk1OzkzLTk0OzkyLTkzOzkxLTkyOzkwLTkxOzg5LTkwOz47QDw4OTk7ODkxOzg4NTs4NzU7ODczOzY4Mjs2ODE7NjgwOzY3OTs3MDM7NzAyOzcwMTs3MDA7Njk5OzY5ODs2OTc7Njk2OzY5NTs2OTQ7NjkzOz4%2bOz47Oz47dDxwPGw8VGV4dDs%2bO2w8TGlzdCBPZiBFYXJseSBEYXkgTW90aW9uczs%2bPjs7Pjs%2bPjt0PDtsPGk8MT47aTwzPjs%2bO2w8dDx0PDs7bDxpPDA%2bOz4%2bOzs%2bO3Q8dDw7O2w8aTwwPjs%2bPjs7Pjs%2bPjt0PDtsPGk8MT47aTwzPjs%2bO2w8dDw7bDxpPDE%2bO2k8Mz47aTw1PjtpPDc%2bOz47bDx0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztFbmFibGVkO18hU0I7PjtsPDA7UGFnZUZpcnN0RGlzYWJsZWQ7bzxmPjtpPDI%2bOz4%2bOz47Oz47dDxwPHA8bDxDb21tYW5kQXJndW1lbnQ7Q3NzQ2xhc3M7RW5hYmxlZDtfIVNCOz47bDwtMTtQYWdlUHJldkRpc2FibGVkO288Zj47aTwyPjs%2bPjs%2bOzs%2bO3Q8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO18hU0I7PjtsPDE7UGFnZU5leHRFbmFibGVkO2k8Mj47Pj47Pjs7Pjt0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztfIVNCOz47bDw0MjtQYWdlTGFzdEVuYWJsZWQ7aTwyPjs%2bPjs%2bOzs%2bOz4%2bO3Q8O2w8aTwxPjtpPDM%2bO2k8NT47aTw3Pjs%2bO2w8dDxwPHA8bDxUZXh0Oz47bDwyMTA5Oz4%2bOz47Oz47dDxwPHA8bDxUZXh0Oz47bDxFRE1zIGFuZCBBbWVuZG1lbnRzOz4%2bOz47Oz47dDxwPHA8bDxUZXh0Oz47bDwxOz4%2bOz47Oz47dDxwPHA8bDxUZXh0Oz47bDw1MDs%2bPjs%2bOzs%2bOz4%2bOz4%2bO3Q8O2w8aTwxPjtpPDM%2bOz47bDx0PDtsPGk8MT47aTwzPjtpPDU%2bO2k8Nz47PjtsPHQ8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO0VuYWJsZWQ7XyFTQjs%2bO2w8MDtQYWdlRmlyc3REaXNhYmxlZDtvPGY%2bO2k8Mj47Pj47Pjs7Pjt0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztFbmFibGVkO18hU0I7PjtsPC0xO1BhZ2VQcmV2RGlzYWJsZWQ7bzxmPjtpPDI%2bOz4%2bOz47Oz47dDxwPHA8bDxDb21tYW5kQXJndW1lbnQ7Q3NzQ2xhc3M7XyFTQjs%2bO2w8MTtQYWdlTmV4dEVuYWJsZWQ7aTwyPjs%2bPjs%2bOzs%2bO3Q8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO18hU0I7PjtsPDQyO1BhZ2VMYXN0RW5hYmxlZDtpPDI%2bOz4%2bOz47Oz47Pj47dD
xwPHA8bDxWaXNpYmxlOz47bDxvPGY%2bOz4%2bOz47bDxpPDE%2bO2k8Mz47aTw1PjtpPDc%2bOz47bDx0PHA8cDxsPFRleHQ7PjtsPDIxMDk7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPEVETXMgYW5kIEFtZW5kbWVudHM7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPDE7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPDUwOz4%2bOz47Oz47Pj47Pj47Pj47Pj47bDxfTWVudUN0cmw6X0dvVG87Pj5NHcFbPBNzNuwxs7sYLdUE2omkjw%3d%3d&_MenuCtrl%3AddlSession=875&_MenuCtrl%3A_GoTo.x=57&_MenuCtrl%3A_GoTo.y=14&ddlStatus=1&ddlSortedBy=1

This is the RAW request sent by IE:

POST /EDMi/EDMList.aspx HTTP/1.1 Accept: image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/xaml+xml, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */* Referer: http://edmi.parliament.uk/EDMi/EDMList.aspx Accept-Language: en-gb User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; InfoPath.1; .NET CLR 3.0.04506.648; OfficeLiveConnector.1.3; OfficeLivePatch.0.0; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729) Content-Type: application/x-www-form-urlencoded Accept-Encoding: gzip, deflate Host: edmi.parliament.uk Content-Length: 2431 Connection: Keep-Alive Pragma: no-cache Cookie: WT_FPC=id=83.217.99.254-2364242496.30021299:lv=1249572414567:ss=1249572414567; ASP.NET_SessionId=vwxgo4rlex1j5m55l0bivrqo

__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=dDwxMDgyNzIxNDQ2O3Q8O2w8aTwzPjs%2BO2w8dDw7bDxpPDE%2BO2k8Mz47aTw1PjtpPDExPjs%2BO2w8dDw7bDxpPDEzPjtpPDE3Pjs%2BO2w8dDx0PHA8cDxsPERhdGFWYWx1ZUZpZWxkO0RhdGFUZXh0RmllbGQ7PjtsPFNFU1NJT05JRDtJVEVNX1ZBTFVFOz4%2BOz47dDxpPDIwPjtAPDA4LTA5OzA3LTA4OzA2LTA3OzA1LTA2OzA0LTA1OzAzLTA0OzAyLTAzOzAxLTAyOzAwLTAxOzk5LTAwOzk4LTk5Ozk3LTk4Ozk2LTk3Ozk1LTk2Ozk0LTk1OzkzLTk0OzkyLTkzOzkxLTkyOzkwLTkxOzg5LTkwOz47QDw4OTk7ODkxOzg4NTs4NzU7ODczOzY4Mjs2ODE7NjgwOzY3OTs3MDM7NzAyOzcwMTs3MDA7Njk5OzY5ODs2OTc7Njk2OzY5NTs2OTQ7NjkzOz4%2BOz47Oz47dDxwPGw8VGV4dDs%2BO2w8TGlzdCBPZiBFYXJseSBEYXkgTW90aW9uczs%2BPjs7Pjs%2BPjt0PDtsPGk8MT47aTwzPjs%2BO2w8dDx0PDs7bDxpPDA%2BOz4%2BOzs%2BO3Q8dDw7O2w8aTwwPjs%2BPjs7Pjs%2BPjt0PDtsPGk8MT47aTwzPjs%2BO2w8dDw7bDxpPDE%2BO2k8Mz47aTw1PjtpPDc%2BOz47bDx0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztFbmFibGVkO18hU0I7PjtsPDA7UGFnZUZpcnN0RGlzYWJsZWQ7bzxmPjtpPDI%2BOz4%2BOz47Oz47dDxwPHA8bDxDb21tYW5kQXJndW1lbnQ7Q3NzQ2xhc3M7RW5hYmxlZDtfIVNCOz47bDwtMTtQYWdlUHJldkRpc2FibGVkO288Zj47aTwyPjs%2BPjs%2BOzs%2BO3Q8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO18hU0I7PjtsPDE7UGFnZU5leHRFbmFibGVkO2k8Mj47Pj47Pjs7Pjt0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztfIVNCOz47bDw0MjtQYWdlTGFzdEVuYWJsZWQ7aTwyPjs%2BPjs%2BOzs%2BOz4%2BO3Q8O2w8aTwxPjtpPDM%2BO2k8NT47aTw3Pjs%2BO2w8dDxwPHA8bDxUZXh0Oz47bDwyMTA5Oz4%2BOz47Oz47dDxwPHA8bDxUZXh0Oz47bDxFRE1zIGFuZCBBbWVuZG1lbnRzOz4%2BOz47Oz47dDxwPHA8bDxUZXh0Oz47bDwxOz4%2BOz47Oz47dDxwPHA8bDxUZXh0Oz47bDw1MDs%2BPjs%2BOzs%2BOz4%2BOz4%2BO3Q8O2w8aTwxPjtpPDM%2BOz47bDx0PDtsPGk8MT47aTwzPjtpPDU%2BO2k8Nz47PjtsPHQ8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO0VuYWJsZWQ7XyFTQjs%2BO2w8MDtQYWdlRmlyc3REaXNhYmxlZDtvPGY%2BO2k8Mj47Pj47Pjs7Pjt0PHA8cDxsPENvbW1hbmRBcmd1bWVudDtDc3NDbGFzcztFbmFibGVkO18hU0I7PjtsPC0xO1BhZ2VQcmV2RGlzYWJsZWQ7bzxmPjtpPDI%2BOz4%2BOz47Oz47dDxwPHA8bDxDb21tYW5kQXJndW1lbnQ7Q3NzQ2xhc3M7XyFTQjs%2BO2w8MTtQYWdlTmV4dEVuYWJsZWQ7aTwyPjs%2BPjs%2BOzs%2BO3Q8cDxwPGw8Q29tbWFuZEFyZ3VtZW50O0Nzc0NsYXNzO18hU0I7PjtsPDQyO1BhZ2VMYXN0RW5hYmxlZDtpPDI%2BOz4%2BOz47Oz47Pj47dD
xwPHA8bDxWaXNpYmxlOz47bDxvPGY%2BOz4%2BOz47bDxpPDE%2BO2k8Mz47aTw1PjtpPDc%2BOz47bDx0PHA8cDxsPFRleHQ7PjtsPDIxMDk7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPEVETXMgYW5kIEFtZW5kbWVudHM7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPDE7Pj47Pjs7Pjt0PHA8cDxsPFRleHQ7PjtsPDUwOz4%2BOz47Oz47Pj47Pj47Pj47Pj47bDxfTWVudUN0cmw6X0dvVG87Pj5NHcFbPBNzNuwxs7sYLdUE2omkjw%3D%3D&_MenuCtrl%3AddlSession=885&ddlStatus=0&ddlSortedBy=1&_MenuCtrl%3A_GoTo.x=37&_MenuCtrl%3A_GoTo.y=12

The IE header seems to have an extra cookie (WT_FPC=id=83.217.99.254-2364242496.30021299:lv=1249572414567:ss=1249572414567;), which appears to track visitors using cookies via the WebTrends Cookie Plug-In. Both POST requests return HTTP status code 302 and redirect to a GET request that returns status 200.

Any ideas ?


Solution

  • I have cracked it. It seems that the .NET server does not like me changing the status and the session at the same time. It works if I change the "ddlStatus" value from:

    string postdata = "__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=" + viewState + "&_MenuCtrl%3AddlSession=" + sessionId + "&_MenuCtrl%3A_GoTo.x=57&_MenuCtrl%3A_GoTo.y=14&ddlStatus=1&ddlSortedBy=1";

    to this:

    string postdata = "__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=" + viewState + "&_MenuCtrl%3AddlSession=" + sessionId + "&_MenuCtrl%3A_GoTo.x=57&_MenuCtrl%3A_GoTo.y=14&ddlStatus=0&ddlSortedBy=1";

    I will have to change ddlStatus in a second POST request, and so on for each change in the form. .NET seems to be very well educated: it can only take one sweet at a time.

    Thanks for all the help :)