HTTPS Proxy using Tunnelling

I am new to networking. I am trying to build a program that runs on the proxy. However, it currently only works for HTTP-based requests, and I am trying to extend it to handle HTTPS.

I came across the concept of tunneling, where a tunnel is created between the client and the destination server via the proxy server. However, I am still not clear on one point: when the client sends an HTTPS packet to the proxy, how does the proxy server know which IP to forward the packet to?

I have heard about the initial TCP handshake between the client and the proxy, and between the proxy and the destination server, but does that even happen in the first place? The proxy doesn't know anything about the destination server, and because the packets are encrypted, the proxy doesn't have access to the Host header field, unlike with HTTP requests.

Can someone clarify this for me?

BTW, I have come across multiple articles and answers on StackOverflow which talk about HTTP tunneling, but simply use the HTTP CONNECT method on the client side for tunneling. But I only have access to the proxy server, I cannot control what the client sends to me.

Moreover, I am looking to implement this tunnel from scratch in C.

Below is my code implementation of an HTTP proxy, in case that helps:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netdb.h>

/* TCP port the proxy's listening socket is bound to in main(). */
#define PROXY_PORT 8080
/* Substring match: a request containing this text is answered with 403 instead of being forwarded. */
#define BLACKLISTED_URL "example.com"

/*
 * Write all `len` bytes of `buf` to socket `fd`, looping because send() may
 * accept fewer bytes than requested.
 * Returns 0 once everything has been sent, -1 if send() fails.
 */
static int send_all(int fd, const char *buf, size_t len) {
    while (len > 0) {
        ssize_t sent = send(fd, buf, len, 0);
        if (sent <= 0) {
            return -1;
        }
        buf += sent;
        len -= (size_t)sent;
    }
    return 0;
}

/*
 * Send a minimal plain-text HTTP response whose Content-Length is computed
 * from `body`, so the header can never disagree with the payload.
 * `status` is the status code plus reason phrase, e.g. "403 Forbidden".
 * Failures are ignored: the caller is about to close the connection anyway.
 */
static void send_simple_response(int fd, const char *status, const char *body) {
    char header[256];
    size_t body_len = strlen(body);
    int n = snprintf(header, sizeof header,
                     "HTTP/1.1 %s\r\n"
                     "Content-Type: text/plain\r\n"
                     "Content-Length: %zu\r\n"
                     "Connection: close\r\n"
                     "\r\n",
                     status, body_len);
    if (n < 0 || (size_t)n >= sizeof header) {
        return;
    }
    if (send_all(fd, header, (size_t)n) == 0) {
        (void)send_all(fd, body, body_len);
    }
}

/*
 * Extract the target host (and optional ":port") from the "Host:" header of
 * the NUL-terminated `request`.
 *
 * host      - output buffer that receives the host name without the port
 * host_size - size of `host` in bytes
 * port      - receives the port from the header, or 80 when none is given
 *
 * Returns 0 on success, -1 if the header is missing, unterminated, empty,
 * too long for `host`, or carries an invalid port.
 * The header name is matched case-sensitively, as "Host:".
 */
static int parse_host_header(const char *request, char *host, size_t host_size, int *port) {
    /* Anchor on the preceding newline so "X-Forwarded-Host:" or a URL that
     * merely contains "Host:" is not mistaken for the header. */
    const char *field = strstr(request, "\nHost:");
    if (field == NULL) {
        printf("No Host field found in the request\n");
        return -1;
    }
    field += strlen("\nHost:");

    /* Whitespace after the colon is optional, so skip whatever is there. */
    while (*field == ' ' || *field == '\t') {
        field++;
    }

    size_t length = strcspn(field, "\r\n");
    if (field[length] == '\0') {
        printf("No end of line found after the Host field\n");
        return -1;
    }
    while (length > 0 && (field[length - 1] == ' ' || field[length - 1] == '\t')) {
        length--;
    }
    if (length == 0 || length >= host_size) {
        printf("Invalid Host field in the request\n");
        return -1;
    }
    memcpy(host, field, length);
    host[length] = '\0';

    *port = 80;
    char *colon = strrchr(host, ':');
    if (colon != NULL) {
        char *end = NULL;
        long value = 0;
        if (colon[1] >= '0' && colon[1] <= '9') {
            value = strtol(colon + 1, &end, 10);
        }
        if (end == NULL || *end != '\0' || value < 1 || value > 65535) {
            printf("Invalid port in the Host field\n");
            return -1;
        }
        *port = (int)value;
        *colon = '\0'; /* leave only the host name for the resolver */
        if (host[0] == '\0') {
            printf("Invalid Host field in the request\n");
            return -1;
        }
    }
    return 0;
}

/*
 * Resolve `host` (a dotted-quad literal or a DNS name) to an IPv4 address.
 * Returns 0 and fills `*out` on success, -1 if resolution fails.
 */
static int resolve_ipv4(const char *host, struct in_addr *out) {
    struct addrinfo hints;
    memset(&hints, 0, sizeof hints);
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;

    struct addrinfo *result = NULL;
    int rc = getaddrinfo(host, NULL, &hints, &result);
    if (rc != 0 || result == NULL) {
        fprintf(stderr, "Error resolving %s: %s\n", host, gai_strerror(rc));
        return -1;
    }

    /* Copy out instead of dereferencing a casted pointer. */
    struct sockaddr_in resolved;
    memcpy(&resolved, result->ai_addr, sizeof resolved);
    *out = resolved.sin_addr;

    freeaddrinfo(result);
    return 0;
}

/*
 * Serve one plain-HTTP proxy exchange on `client_socket`.
 *
 * Reads a request with a single recv(), rejects it with 403 if it contains
 * BLACKLISTED_URL, otherwise connects to the host named in its Host header
 * (IPv4 only, port 80 unless the header specifies one), forwards the request
 * and relays the server's response until the server closes the connection.
 *
 * The function takes ownership of `client_socket` and closes it on every
 * path, together with the upstream socket if one was opened.
 *
 * Limitations: only the bytes returned by the first recv() are forwarded, so
 * a request larger than the buffer or split across TCP segments is truncated.
 * After the request is sent, data flows only from the server to the client.
 */
void handle_client(int client_socket) {
    char request[4096];
    char host[256];
    int port = 80;
    int target_socket = -1;

    printf("Inside handle_client\n");

    ssize_t bytes_received = recv(client_socket, request, sizeof(request) - 1, 0);
    if (bytes_received <= 0) {
        if (bytes_received < 0) {
            perror("Error receiving request from the client");
        }
        goto done;
    }
    /* recv() does not terminate the data; the string functions below need it. */
    request[bytes_received] = '\0';

    printf("Received request from client: %s\n", request);

    /* Check the blacklist first so a blocked host is never resolved or contacted. */
    if (strstr(request, BLACKLISTED_URL) != NULL) {
        printf("URL blocked: %s\n", BLACKLISTED_URL);
        send_simple_response(client_socket, "403 Forbidden", "Access Denied: URL blocked\r\n");
        goto done;
    }

    if (parse_host_header(request, host, sizeof host, &port) != 0) {
        send_simple_response(client_socket, "400 Bad Request", "Missing or invalid Host header\r\n");
        goto done;
    }
    printf("Extracted URL: %s\n", host);

    struct sockaddr_in target_addr;
    memset(&target_addr, 0, sizeof target_addr);
    target_addr.sin_family = AF_INET;
    target_addr.sin_port = htons((uint16_t)port);
    if (resolve_ipv4(host, &target_addr.sin_addr) != 0) {
        send_simple_response(client_socket, "502 Bad Gateway", "Could not resolve target host\r\n");
        goto done;
    }
    printf("IP address: %s & %s \n", inet_ntoa(target_addr.sin_addr), host);

    target_socket = socket(AF_INET, SOCK_STREAM, 0);
    if (target_socket < 0) {
        perror("Error creating the target socket");
        goto done;
    }

    printf("Forwarding request to the target server...\n");
    if (connect(target_socket, (struct sockaddr *)&target_addr, sizeof(target_addr)) < 0) {
        perror("Error connecting to the target server");
        send_simple_response(client_socket, "502 Bad Gateway", "Could not connect to target host\r\n");
        goto done;
    }
    printf("Connected to the target server\n");

    if (send_all(target_socket, request, (size_t)bytes_received) < 0) {
        perror("Error forwarding request to the target server");
        goto done;
    }
    printf("Request forwarded to the target server\n");

    /* Relay the response until the server closes its side or an error occurs. */
    char response_buffer[4096];
    ssize_t chunk;
    while ((chunk = recv(target_socket, response_buffer, sizeof(response_buffer), 0)) > 0) {
        if (send_all(client_socket, response_buffer, (size_t)chunk) < 0) {
            perror("Error sending response to the client");
            goto done;
        }
    }
    if (chunk < 0) {
        perror("Error receiving response from the target server");
        goto done;
    }
    printf("Response forwarded to the client\n");

done:
    /* Single exit point so neither descriptor can leak. */
    if (target_socket >= 0) {
        close(target_socket);
    }
    close(client_socket);
}

/*
 * Entry point: listen on 127.0.0.1:PROXY_PORT and serve clients one at a
 * time, handing each accepted connection to handle_client().
 * Returns 1 if the listening socket cannot be set up; otherwise loops forever.
 */
int main(void) {
    int proxy_socket = socket(AF_INET, SOCK_STREAM, 0);
    if (proxy_socket < 0) {
        perror("Error creating the proxy socket");
        return 1;
    }

    /* Allow an immediate restart while old connections linger in TIME_WAIT.
     * Failure here is not fatal; bind() below will report any real problem. */
    int reuse = 1;
    if (setsockopt(proxy_socket, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse) < 0) {
        perror("Error setting SO_REUSEADDR on the proxy socket");
    }

    struct sockaddr_in proxy_addr;
    memset(&proxy_addr, 0, sizeof proxy_addr); /* clear padding / unused fields */
    proxy_addr.sin_family = AF_INET;
    proxy_addr.sin_port = htons(PROXY_PORT);
    proxy_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); /* 127.0.0.1 */

    if (bind(proxy_socket, (struct sockaddr *)&proxy_addr, sizeof(proxy_addr)) < 0) {
        perror("Error binding to the proxy port");
        close(proxy_socket);
        return 1;
    }

    if (listen(proxy_socket, 10) < 0) {
        perror("Error listening on the proxy socket");
        close(proxy_socket);
        return 1;
    }

    printf("Proxy server listening on port %d...\n", PROXY_PORT);

    while (1) {
        struct sockaddr_in client_addr;
        socklen_t client_addr_len = sizeof(client_addr);
        int client_socket = accept(proxy_socket, (struct sockaddr *)&client_addr, &client_addr_len);

        /* client_addr is only filled in when accept() succeeds, so check first. */
        if (client_socket < 0) {
            perror("Error accepting client connection");
            continue;
        }

        printf("Connection accepted from %s:%d\n", inet_ntoa(client_addr.sin_addr), ntohs(client_addr.sin_port));
        printf("Handling client request...\n");
        handle_client(client_socket);
        printf("Done handling client request\n");
    }

    /* Not reached: the accept loop above never exits. */
    close(proxy_socket);
    return 0;
}

Solution

> However, I am still not clear that when the client sends the HTTPS packet to the proxy, how does the proxy server get to know which IP to forward the packet to?

The client has to tell the proxy up front which target IP:port to create a tunnel to, and if that tunnel is successful then the client and target will exchange HTTPS messages (or any other data) through that tunnel.

> I did listen about the initial TCP handshake between the client and the proxy and the proxy with the destination server, but does that even happen in the first place?

Yes. In this case, the client first creates a TCP connection with the proxy, then sends a tunnel request to the proxy specifying the target, then the proxy creates a TCP connection with the target, and if successful then the proxy passes all subsequent raw bytes back and forth between the two TCP connections until one party disconnects, then the proxy disconnects the other party.

> The proxy doesn't know anything about the destination server

Yes, it does, because the client tells the proxy up front what the target is. That is how most proxy protocols work (e.g., HTTP CONNECT or SOCKS).

> because the packets are encrypted, the proxy doesn't have access to the Host header field, unlike with HTTP requests.

For non-encrypted HTTP proxying, the proxy itself acts as an HTTP server, which the client then sends HTTP messages directly to as if it were the target server. The client must specify the target server in each HTTP message, either in the HTTP request line as an absolute URL, or in the HTTP Host header. The proxy can then read that target and forward the HTTP message to the specified server. It doesn't need a persistent TCP tunnel in this case, since HTTP is stateless from one message to the next.

That is not the case for proxying encrypted HTTPS, though. The proxy can't parse HTTPS messages at all, as it doesn't have the encryption details. All it can do is create a tunnel between the client and the target and then pass along raw bytes between them through the tunnel. The client and target negotiate the encryption details between themselves through the tunnel, before any encrypted data messages are then exchanged between them through the tunnel. As far as the proxy is concerned, the tunnel is just carrying opaque data.

> BTW, I have come across multiple articles and answers on StackOverflow which talk about HTTP tunneling, but simply use the HTTP CONNECT method on the client side for tunneling. But I only have access to the proxy server, I cannot control what the client sends to me.

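The proxy decides which proxying protocol(s) it implements - be that HTTP, CONNECT, SOCKS, etc. The client must then follow that protocol in order to pass application data back and forth through it. In practice you do not need to control the client: a client that has been configured to use an HTTP proxy (a browser, curl, etc.) will itself send `CONNECT host:port HTTP/1.1` for any `https://` URL, so the proxy only needs to recognize that request, connect to the named host and port, reply with a `200` response, and then relay raw bytes in both directions.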