Search code examples
c#.netwindowssocketsscreen-scraping

Sockets receive hangs


I am trying to download, search page of bing, and ask using sockets, i have decided to use sockets, instead of webclient.

The socket.Receive(); hangs after few loops in case of bing, yahoo, google but works for ask. for google loop will receive for 4 - 5 times, then freeze on the call.

I am not able to figure out why?

public string Get(string url)
{
    Uri requestedUri = new Uri(url);
    string fulladdress = requestedUri.Host;
    IPHostEntry entry = Dns.GetHostEntry(fulladdress);
    StringBuilder sb = new StringBuilder();

    try
    {
        using (Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.IP))
        {
            socket.Connect(entry.AddressList[0], 80);

            NetworkStream ns = new NetworkStream(socket);

            string part_request = string.Empty;
            string build_request = string.Empty;
            if (jar.Count != 0)
            {
                part_request = "GET {0} HTTP/1.1\r\nHost: {1} \r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nCookie: {2}\r\nConnection: keep-alive\r\n\r\n";
                build_request = string.Format(part_request, requestedUri.PathAndQuery, requestedUri.Host, GetCookies(requestedUri));
            }
            else
            {
                part_request = "GET {0} HTTP/1.1\r\nHost: {1} \r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nConnection: keep-alive\r\n\r\n";
                build_request = string.Format(part_request, requestedUri.PathAndQuery, requestedUri.Host);
            }

            byte[] data = Encoding.UTF8.GetBytes(build_request);
            socket.Send(data, data.Length, 0);

            byte[] bytesReceived = new byte[102400];
            int bytes = 0;

            do
            {
                bytes = socket.Receive(bytesReceived, bytesReceived.Length, 0);
                sb.Append(Encoding.ASCII.GetString(bytesReceived, 0, bytes));
            }
            while (bytes > 0);

            List<String> CookieHeaders = new List<string>();
            foreach (string header in sb.ToString().Split("\n\r".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                if (header.StartsWith("Set-Cookie"))
                {
                    CookieHeaders.Add(header.Replace("Set-Cookie: ", ""));
                }
            }

            this.AddCookies(CookieHeaders, requestedUri);

            socket.Close();
        }
    }
    catch (Exception ex)
    {
        string errorMessage = ex.Message;
    }

    return sb.ToString();
}

CookieContainer jar = new CookieContainer();

public string GetCookies(Uri _uri)
{
    StringBuilder sb = new StringBuilder();
    CookieCollection collection = jar.GetCookies(_uri);

    if (collection.Count != 0)
    {
        foreach (Cookie item in collection)
        {
            sb.Append(item.Name + "=" + item.Value + ";");
        }
    }
    return sb.ToString();
}

Solution

  • Its because you've reached the end of the content and yet you are still requesting more ...

    do
    {
       bytes = socket.Receive(bytesReceived, bytesReceived.Length, 0);
       sb.Append(Encoding.ASCII.GetString(bytesReceived, 0, bytes));
    }
    while (bytes > 0);
    

    This assumes that as long as the last request returned more than 0 bytes theres more available, when in actual fact when a network stream reaches the end the chances are you'll fill some of your buffer on the last loop. (e.g. bytes > 0 but nothing more to get) ... so the server closes the connection.

    try something like this instead ...

    do
    {
       bytes = socket.Receive(bytesReceived, bytesReceived.Length, 0);
       sb.Append(Encoding.ASCII.GetString(bytesReceived, 0, bytes));
    }
    while (bytes == bytesReceived.Length);
    

    Some servers (ask is probably one of them) obviously don't auto close the connection as you would expect hence the reason it won't always fail.

    :::EDIT:::

    My test sample:

    Load visual studio, create a new console app then paste the following in to the generated program class (in place of all existing code):

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.Net.Sockets;
    
    namespace ConsoleApplication1
    {
        class Program
        {
            static void Main(string[] args)
            {
                string test = Get("http://www.google.co.uk/search?q=test&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-GB:official&client=firefox-a");
                Console.Read();
            }
    
            public static string Get(string url)
            {
                Uri requestedUri = new Uri(url);
                string fulladdress = requestedUri.Host;
                IPHostEntry entry = Dns.GetHostEntry(fulladdress);
                StringBuilder sb = new StringBuilder();
    
                try
                {
                    using (Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.IP))
                    {
                        socket.Connect(entry.AddressList[0], 80);
    
                        NetworkStream ns = new NetworkStream(socket);
    
                        string part_request = string.Empty;
                        string build_request = string.Empty;
                        if (jar.Count != 0)
                        {
                            part_request = "GET {0} HTTP/1.1\r\nHost: {1} \r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nCookie: {2}\r\nConnection: keep-alive\r\n\r\n";
                            build_request = string.Format(part_request, requestedUri.PathAndQuery, requestedUri.Host, GetCookies(requestedUri));
                        }
                        else
                        {
                            part_request = "GET {0} HTTP/1.1\r\nHost: {1} \r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nConnection: keep-alive\r\n\r\n";
                            build_request = string.Format(part_request, requestedUri.PathAndQuery, requestedUri.Host);
                        }
    
                        byte[] data = Encoding.UTF8.GetBytes(build_request);
                        socket.Send(data, data.Length, 0);
    
                        byte[] bytesReceived = new byte[4096];
                        int bytes = 0;
                        string currentBatch = "";
    
                        do
                        {
                            bytes = socket.Receive(bytesReceived);
                            currentBatch = Encoding.ASCII.GetString(bytesReceived, 0, bytes);
                            Console.Write(currentBatch);
                            sb.Append(currentBatch);
                        }
                        while (bytes == bytesReceived.Length);
    
                        List<String> CookieHeaders = new List<string>();
                        foreach (string header in sb.ToString().Split("\n\r".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
                        {
                            if (header.StartsWith("Set-Cookie"))
                            {
                                CookieHeaders.Add(header.Replace("Set-Cookie: ", ""));
                            }
                        }
    
                        //this.AddCookies(CookieHeaders, requestedUri);
    
                        socket.Close();
                    }
                }
                catch (Exception ex)
                {
                    string errorMessage = ex.Message;
                }
    
                return sb.ToString();
            }
    
            static CookieContainer jar = new CookieContainer();
    
            public static string GetCookies(Uri _uri)
            {
                StringBuilder sb = new StringBuilder();
                CookieCollection collection = jar.GetCookies(_uri);
    
                if (collection.Count != 0)
                {
                    foreach (Cookie item in collection)
                    {
                        sb.Append(item.Name + "=" + item.Value + ";");
                    }
                }
                return sb.ToString();
            }
            }
        }
    

    I reduced the buffer to ensure that it was filled more than once ... seems ok from my end This post comes with the typical works on my pc garantee :)