Search code examples
rlinear-regressiongsl

Multiple linear regression in GSL is correct?


I was attempting to perform multiple linear regression using GSL. The estimator for beta is beta=(X'X)^(-1)X'y. Three approaches were used -

  1. direct, i.e. calculate the right-hand side of this formula using required operations;
  2. solve the X'Xb=X'y, i.e. solve the system Ax=b, where A=X'X, b=X'y, and x=b;
  3. use function gsl_multifit_linear. The test data are from here.

All three approaches gave same wrong result:

beta1=0.0365505, beta2=0.0435827, alpha=0.645627.

The same data file produces correct output in R:

beta1=0.086409, beta2=0.087602, alpha=-4.1.

Results for another dataset (WHO life expectancy data from here) also differed between GSL and R.

Most likely I have a mistake somewhere. But it would be good if at least somebody can confirm that GSL multiple linear regression works correctly.

Edit1. The language is C++. The code for option1, i.e. beta=(X'X)^(-1)X'y

void mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
        //the function accepts vector of predictant values and matrix of predictands
        //convert input vectors into gsl_vector
        int n=predictandVector.size();
        int m=predictorMatrix[0].size();
        gsl_vector*y=gsl_vector_alloc(n);
        gsl_matrix*x=gsl_matrix_alloc(n,m);
        for(int i=0;i<n;i++) {
                gsl_vector_set(y,i,predictandVector[i]);
                for(int j=0;j<m;j++) {
                        gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
                }
        }
        //multiply X' by X
        gsl_matrix*xxTranspose=gsl_matrix_alloc(m,m);
        gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0,x,x,0.0,xxTranspose);
        /////////////////////////////////
        //perform LU decomposition of xxTranspose to find it's inverse
        gsl_permutation*p=gsl_permutation_alloc(m);
        int s;
        gsl_linalg_LU_decomp(xxTranspose,p,&s);
        //////////////////////////////
        //compute inverse of xxTranspose matrix
        gsl_matrix*xxTransposeInverse=gsl_matrix_alloc(m,m);
        gsl_linalg_LU_invert(xxTranspose,p,xxTransposeInverse);
        //////////////////////////////
        //multiply inverse of xxTranspose matrix by xTranspose
        gsl_matrix*xxTransposeInverseXtranspose=gsl_matrix_alloc(m,n);
         gsl_blas_dgemm(CblasNoTrans,CblasTrans,1.0,xxTransposeInverse,x,0.0,xxTransposeInverseXtranspose);
        //////////////////////////////////////
        //multiply matrix (X'X)^(-1)X' and vector y; this must give values of beta
        gsl_vector*b=gsl_vector_alloc(m);
        gsl_blas_dgemv(CblasNoTrans,1.0,xxTransposeInverseXtranspose,y,0.0,b);
        //write beta values to file
        FILE*f;
        f=fopen("molsResult.dat","w");
        gsl_vector_fprintf(f,b,"%g");
        ////////////////////
}

The code for option2, i.e. solve the system X'Xbeta=X'y

oid mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
        //convert input vectors into gsl_vector
        int n=predictandVector.size();
        int m=predictorMatrix[0].size();
        gsl_vector*y=gsl_vector_alloc(n);
        gsl_matrix*x=gsl_matrix_alloc(n,m);
        for(int i=0;i<n;i++) {
                gsl_vector_set(y,i,predictandVector[i]);
                for(int j=0;j<m;j++) {
                        gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
                }
        }
        /////////////////////
        //compute X'X
        gsl_matrix*xxTranspose=gsl_matrix_alloc(m,m);
        gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0,x,x,0.0,xxTranspose);
        /////////////////////////////////
        //compute X'y
        gsl_vector*XtransposeY=gsl_vector_alloc(m);
        gsl_blas_dgemv(CblasTrans,1.0,x,y,0.0,XtransposeY);
        //////////////////////////////////////
        //solve the linear system X'Xb=X'y to find coefficients beta
        gsl_vector*b=gsl_vector_alloc(m);
        int k=n;
        if(m<n) {
                k=m;
        }
        gsl_vector*tau=gsl_vector_alloc(k);
        gsl_linalg_QR_decomp(xxTranspose,tau);
        gsl_linalg_QR_solve(xxTranspose,tau,XtransposeY,b);
        //write beta values into file
        FILE*f;
        f=fopen("molsResult.dat","w");
        gsl_vector_fprintf(f,b,"%g");
        ////////////////////////////////
}

The code for option3, i.e. using gsl_multifit_linear.

void mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
        //convert input vectors into gsl_vector
        int n=predictandVector.size();
        int m=predictorMatrix[0].size();
        gsl_vector*y=gsl_vector_alloc(n);
        gsl_matrix*x=gsl_matrix_alloc(n,m);
        for(int i=0;i<n;i++) {
                gsl_vector_set(y,i,predictandVector[i]);
                for(int j=0;j<m;j++) {
                        gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
                }
        }
        ///////////////////////////
        //Use gsl_multifit_linear
        gsl_multifit_linear_workspace*w=gsl_multifit_linear_alloc(n,2);
        double chisq;
        gsl_matrix*cov=gsl_matrix_alloc(m,m);
        gsl_vector*b=gsl_vector_alloc(m);
        gsl_multifit_linear(x,y,b,cov,&chisq,w);
        //////////////////////
        //write beta values into file
        FILE*f;
        f=fopen("molsResult.dat","w");
        gsl_vector_fprintf(f,b,"%g");
        //////////////////////////
}

Edit2. Also to absolutely assure that input data is read in correctly the following code as added to the option3 code:

FILE*xyWrite;
        xyWrite=fopen("molsInputData.dat","w");
        for(int i=0;i<n;i++) {
                fputs(to_string(gsl_vector_get(y,i)).c_str(),xyWrite);
                fputs(" ",xyWrite);
                fputs(to_string(gsl_matrix_get(x,i,0)).c_str(),xyWrite);
                fputs(" ",xyWrite);
                fputs(to_string(gsl_matrix_get(x,i,1)).c_str(),xyWrite);
                fputs("\n",xyWrite);
        }

This code writes gsl_vector(s) x and y (used as input) into file. Then this file was loaded into R and multiple linear regression performed. The results were correct. This proves that input data is read in correctly.


Solution

  • It's a little bit too much of a nuisance for me to write a C driver for gsl_multifit_linear at the moment; as far as I can tell you must be running it from C or C++ as the gsl package for R doesn't appear to include a wrapper for this function ...

    However, doing the linear algebra in both of the ways that you suggested (in R) gives the same answer as lm():

    m1 <- lm(Job_Perf~Mech_Apt+Consc, data=dd)
    y <- dd$Job_Perf
    X <- model.matrix(~Mech_Apt+Consc, data=dd)
    cbind(lm=coef(m1),
          solve1=c(solve(crossprod(X)) %*% t(X) %*% y),
          solve2=c(solve(crossprod(X), t(X)%*%y)))
    ##                      lm      solve1      solve2
    ## (Intercept) -4.10358123 -4.10358123 -4.10358123
    ## Mech_Apt     0.08640901  0.08640901  0.08640901
    ## Consc        0.08760164  0.08760164  0.08760164
    

    The most common mistake I've seen when implementing regression by hand is to forget that R automatically includes an intercept column in the model matrix, but as you list an alpha estimate I guess you're not making that mistake ... ?

    My best guess would be that you're setting up X incorrectly in some way: it should look like this

       (Intercept) Mech_Apt Consc
              1       40    25
              1       45    20
              1       38    30
    ...
    

    Data:

    dd <- read.table(header=TRUE, text="
    Job_Perf Mech_Apt Consc
    1   40  25
    2   45  20
    1   38  30
    3   50  30
    2   48  28
    3   55  30
    3   53  34
    4   55  36
    4   58  32
    3   40  34
    5   55  38
    3   48  28
    3   45  30
    2   55  36
    4   60  34
    5   60  38
    5   60  42
    5   65  38
    4   50  34  
    3   58  38
    ")