I'm having some difficulty troubleshooting code I wrote in C to perform logistic regression. While it seems to work on smaller, semi-randomized datasets, it stops working (i.e. it stops assigning proper probabilities of belonging to class 1) at around the point where I pass 43,500 observations (determined by tweaking the number of observations created). When creating the 150 features used in the code, I do generate the first two as a function of the number of observations, so I'm not sure if maybe that's the issue, though I am using double precision. Maybe there's an overflow somewhere in the code?
The code below should be self-contained; it generates m = 50,000 observations with n = 150 features. Setting m below 43,500 should return "Percent class 1: 0.250000"; setting it to 44,000 or above will return "Percent class 1: 0.000000", regardless of what max_iter (the number of times we sample m observations) is set to.
The first feature is set to 1.0 divided by the total number of observations if the observation is class 0 (the first 75% of observations), or to the observation's index divided by the total number of observations otherwise.
The second feature is just the index divided by the total number of observations.
All other features are random.
The logistic regression is intended to use stochastic gradient descent: randomly selecting an observation index, computing the gradient of the loss for that observation's predicted y under the current weights, and updating the weights with the gradient and a learning rate (eta).
Using the same initialization with Python and NumPy, I still get the proper results, even above 50,000 observations.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>

// Compute z = w * x + b
double dlc( int n, double *X, double *coef, double intercept )
{
    double y_pred = intercept;
    for (int i = 0; i < n; i++)
    {
        y_pred += X[i] * coef[i];
    }
    return y_pred;
}

// Compute y_hat = 1 / (1 + e^(-z))
double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept )
{
    double y_pred;
    y_pred = dlc(n, X, coef, intercept);
    y_pred = 1.0 / (1.0 + exp(-y_pred));
    return y_pred;
}

// Stochastic gradient descent
void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed )
{
    double *gradient_coef, *X_i;
    double y_i, y_pred, resid;
    int idx;
    double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0;

    X_i = (double *) malloc (n * sizeof(double));
    gradient_coef = (double *) malloc (n * sizeof(double));

    for ( int i = 0; i < n; i++ )
    {
        coef[i] = 0.0;
        gradient_coef[i] = 0.0;
    }
    *intercept = 0.0;

    srand(random_seed);
    for ( int epoch = 0; epoch < max_iter; epoch++ )
    {
        for ( int run = 0; run < m; run++ )
        {
            // Randomly sample an observation
            idx = rand() % m;
            for ( int i = 0; i < n; i++ )
            {
                X_i[i] = X[n*idx+i];
            }
            y_i = y[idx];
            // Compute y_hat
            y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept );
            resid = -(y_i - y_pred);
            // Compute gradients and adjust weights
            for (int i = 0; i < n; i++)
            {
                gradient_coef[i] = X_i[i] * resid;
                coef[i] -= eta * gradient_coef[i];
            }
            if ( fit_intercept == 1 )
            {
                *intercept -= eta * resid;
            }
        }
    }
}

int main(void)
{
    double *X, *y, *coef, *y_pred;
    double intercept;
    double eta = 0.05;
    double alpha = 1.0, beta = 1.0;
    long m = 50000;
    long n = 150;
    int max_iter = 20;
    long class_0 = (long)(3.0 / 4.0 * (double)m);
    double pct_class_1 = 0.0;
    clock_t test_start;
    clock_t test_end;
    double test_time;

    printf("Constructing variables...\n");
    X = (double *) malloc (m * n * sizeof(double));
    y = (double *) malloc (m * sizeof(double));
    y_pred = (double *) malloc (m * sizeof(double));
    coef = (double *) malloc (n * sizeof(double));

    // Initialize classes
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            y[i] = 0.0;
        }
        else
        {
            y[i] = 1.0;
        }
    }

    // Initialize observation features
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            X[n*i] = 1.0 / (double)m;
        }
        else
        {
            X[n*i] = (double)i / (double)m;
        }
        X[n*i + 1] = (double)i / (double)m;
        for (int j = 2; j < n; j++)
        {
            X[n*i + j] = (double)(rand() % 100) / 100.0;
        }
    }

    // Fit weights
    printf("Running SGD...\n");
    test_start = clock();
    sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 );
    test_end = clock();
    test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
    printf("Time taken: %f\n", test_time);

    // Compute y_hat and share of observations predicted as class 1
    printf("Making predictions...\n");
    for ( int i = 0; i < m; i++ )
    {
        y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept );
    }

    printf("Printing results...\n");
    for ( int i = 0; i < m; i++ )
    {
        //printf("%f\n", y_pred[i]);
        if (y_pred[i] > 0.5)
        {
            pct_class_1 += 1.0;
        }
        // Troubleshooting print
        if (i < 10 || i > m - 10)
        {
            printf("%g\n", y_pred[i]);
        }
    }
    printf("Percent class 1: %f", pct_class_1 / (double)m);

    return 0;
}
For reference, here is my (presumably) equivalent Python code, which returns the correct percentage of identified classes even above 50,000 observations:
import numpy as np
import time

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegressor:
    def __init__(self, eta, init_runs, fit_intercept=True):
        self.eta = eta
        self.init_runs = init_runs
        self.fit_intercept = fit_intercept

    def fit(self, x, y):
        m, n = x.shape
        self.coef = np.zeros((n, 1))
        self.intercept = np.zeros((1, 1))
        for epoch in range(self.init_runs):
            for run in range(m):
                idx = np.random.randint(0, m)
                x_i = x[idx:idx+1, :]
                y_i = y[idx]
                y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept)
                gradient_w = -(x_i.T * (y_i - y_pred_i))
                self.coef -= self.eta * gradient_w
                if self.fit_intercept:
                    gradient_b = -(y_i - y_pred_i)
                    self.intercept -= self.eta * gradient_b

    def predict_proba(self, x):
        m, n = x.shape
        y_pred = np.ones((m, 2))
        y_pred[:, 1:2] = sigmoid(x.dot(self.coef) + self.intercept)
        y_pred[:, 0:1] -= y_pred[:, 1:2]
        return y_pred

    def predict(self, x):
        return np.round(sigmoid(x.dot(self.coef) + self.intercept))

m = 50000
n = 150
class1 = int(3.0 / 4.0 * m)

X = np.random.rand(m, n)
y = np.zeros((m, 1))
for obs in range(m):
    if obs < class1:
        continue
    else:
        y[obs, 0] = 1
for obs in range(m):
    if obs < class1:
        X[obs, 0] = 1.0 / float(m)
    else:
        X[obs, 0] = float(obs) / float(m)
    X[obs, 1] = float(obs) / float(m)

logit = LogisticRegressor(0.05, 20)
start_time = time.time()
logit.fit(X, y)
end_time = time.time()
print(round(end_time - start_time, 2))

y_pred = logit.predict(X)
print("Percent:", y_pred.sum() / len(y_pred))
The issue is here:

    // Randomly sample an observation
    idx = rand() % m;

... in light of the fact that the OP's RAND_MAX is 32767. This is exacerbated by the fact that all of the class 1 observations are at the end.

All samples will be drawn from the first 32768 observations, and when the total number of observations is greater than that, the proportion of class 1 observations among those that can be sampled is less than 0.25. At 43691 total observations, there are no class 1 observations among those that can be sampled.
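You can see this directly with a quick diagnostic (my own sketch, not part of the original program; it assumes a platform where RAND_MAX is 32767): it draws indices the same way sgd() does and records the largest one ever produced.

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        long m = 50000;
        long max_idx = 0;
        srand(42);
        // Draw many indices exactly as sgd() does and track the maximum.
        for (long i = 0; i < 1000000; i++)
        {
            long idx = rand() % m;
            if (idx > max_idx)
            {
                max_idx = idx;
            }
        }
        // Where RAND_MAX == 32767, the largest index never gets near m - 1;
        // it tops out at 32767, which is entirely inside the class 0 block.
        printf("RAND_MAX = %d, largest index drawn = %ld\n", RAND_MAX, max_idx);
        return 0;
    }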
As a secondary issue, rand() % m does not yield a wholly uniform distribution if m does not evenly divide RAND_MAX + 1 (e.g. with RAND_MAX = 32767 and m = 3, the remainders 0 and 1 each arise from 10923 of the 32768 possible draws, while 2 arises from only 10922), though the effect of this issue will be much more subtle.
Bottom line: you need a better random number generator. At minimum, you could combine the bits from two calls to rand() to yield an integer with sufficient range, but you might also want to consider a third-party generator. There are several available.
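For illustration, here is a minimal sketch of the two-call approach (my own example, not from the original post; rand30 and uniform_index are hypothetical helper names). It relies only on the C standard's guarantee that RAND_MAX is at least 32767, so 15 bits per call are always available, and it uses rejection sampling to avoid the modulo bias noted above:

    #include <stdlib.h>

    // Combine the low 15 bits of two rand() calls into a 30-bit value.
    static long rand30(void)
    {
        long hi = rand() & 0x7FFF;   // 15 bits
        long lo = rand() & 0x7FFF;   // 15 bits
        return (hi << 15) | lo;      // result in 0 .. 2^30 - 1
    }

    // Uniform index in [0, m), assuming 0 < m <= 2^30. Rejecting draws
    // beyond the largest multiple of m removes the modulo bias.
    static long uniform_index(long m)
    {
        long limit = (1L << 30) - ((1L << 30) % m);
        long r;
        do {
            r = rand30();
        } while (r >= limit);
        return r % m;
    }

With something like this in place, the sampling line in sgd() would become idx = uniform_index(m); instead of idx = rand() % m;, and every observation (including the class 1 block at the end) becomes reachable.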