Search code examples
ccurve-fittingleast-squares

linear fit on log log axis plot does not fit the data


When I try to fit a linear curve to my data points the curve does not match my data. The line is below my data points and the slope is too deep, too. It could be due to starting at x=10 and its corresponding y value, but I am not sure. least square method should be correct, as I tried multiple ways of calculating slope and intercept, but all look methods produce the same output.

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define PATH_GNUPLOT "your/path/gnuplot"

void ERROR(char* error_string);

int main() {

    // first 10 data points are irrelevant
    double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};

    int n = 100;

    /* calc fit */
    double sumx, sumy, sumx2, sumxy, a, b = 0.0;

    // => log(y) = a * log(x) + b
    // data starts at 10 for x value
    for (int i = 10; i < n; i++) {

        double x = log(i);
        double y = log(data[i]);

        sumx += x;
        sumx2 += x * x;
        sumy += y;
        sumxy += y * x;
    }

    a = (n * sumxy  -  sumx * sumy) / (n * sumx2 - sumx*sumx);
    b = (sumy * sumx2  -  sumx * sumxy) / (n * sumx2 - sumx*sumx);
    /* end calc fit */

    

    /* save data and plot */
    FILE *fp = fopen("data.dat", "w");
    
    if (fp == NULL) {
        ERROR("failed to open data.dat");
    }
    
    for (int i = 10; i < n; i++)
        fprintf(fp, "%d %.16lf\n", i, data[i]);
    
    fclose(fp);

    FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
    
    if (GNUpipe == NULL) {
        ERROR("failed to open gnuplot");
    }
    
    fprintf(GNUpipe, "set term png\n");
    fprintf(GNUpipe, "set logscale xy\n");

    // log(y) = a * log(x) + b
    // => y = x**a + 10**b              
    fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%lf\n", a, b);
    
    fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);

    fprintf(GNUpipe, "Fit(x) title 'fit'\n");
    
    pclose(GNUpipe);

}

void ERROR(char* error_string) {
    
    fprintf(stderr, error_string);
    
    exit(EXIT_FAILURE);
}

enter image description here

EDIT:

Code so far with all changes implemented:

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define PATH_GNUPLOT "your/path/gnuplot"

void ERROR(char* error_string);

int main() {

    // first 10 data points are irrelevant
    double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};

    int n = 100;

    /* calc fit */
    double sumx = 0.0; 
    double sumy = 0.0; 
    double sumx2 = 0.0;
    double sumxy = 0.0;
    double a = 0.0;
    double b = 0.0;

    // => log(y) = a * log(x) + b
    // data starts at 10 for x value
    for (int i = 10; i < n; i++) {

        double x = log(i);
        double y = log(data[i]);

        sumx += x;
        sumx2 += x * x;
        sumy += y;
        sumxy += y * x;
    }

    int N = n - 10;
    double denom = N*sumx2 - sumx * sumx;
    b = (sumy * sumx2 - sumx * sumxy)/ denom;
    a = (N*sumxy - sumx * sumy)/ denom;

    printf("%g %g\n",a , b);
    /* end calc fit */

    

    /* save data and plot */
    FILE *fp = fopen("data.dat", "w");
    
    if (fp == NULL) {
        ERROR("failed to open data.dat");
    }
    
    for (int i = 10; i < n; i++)
        fprintf(fp, "%d %.16le\n", i, data[i]);
    
    fclose(fp);

    FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
    
    if (GNUpipe == NULL) {
        ERROR("failed to open gnuplot");
    }
    
    fprintf(GNUpipe, "set term png\n");
    fprintf(GNUpipe, "set logscale xy\n");

    // log(y) = a * log(x) + b
    // => y = x**a * 10**b              
    fprintf(GNUpipe, "Fit(x) = x**%.16lf * 10**%.16lf\n", a, b);
    
    fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);

    fprintf(GNUpipe, "Fit(x) title 'fit'\n");
    
    pclose(GNUpipe);

}

void ERROR(char* error_string) {
    
    fprintf(stderr, error_string);
    
    exit(EXIT_FAILURE);
}

"data.dat": link stdout: -0.323977 -0.66909 for printf("%g %g\n",a , b);


Solution

  • At least these problems:

    Need to zero initialize

    // double sumx, sumy, sumx2, sumxy, a, b = 0.0; 
    double sumx = 0.0;
    double sumy = 0.0;
    ...
    

    Save time

    Enable more warnings: "warning: excess elements in array initializer"

    Too many initializers

    // double data[100] = {  101 initializers };
    

    Avoid losing precision

    // fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%.*le\n", a, b);
    fprintf(GNUpipe, "Fit(x) = x**%.16le * 10**%.16le\n", a, b);
    
    // fprintf(fp, "%d %.16lf\n", i, data[i]);
    fprintf(fp, "%d %.16le\n", i, data[i]);
    

    Wrong n

    Code uses n. Since, for (int i = 10; i < n; i++), should be n-10 in a, b formulas.

    Maybe wrong formula?

    Looks wrong to me, need deeper review.

    // ??
    a = (n * sumxy  -  sumx * sumy) / (n * sumx2 - sumx*sumx);
    b = (sumy * sumx2  -  sumx * sumxy) / (n * sumx2 - sumx*sumx);
    

    I expect: ref.

    N = n - 10;
    double denom = N*sumx2 - sumx * sumx;
    a = (sumy * sumx2 - sumx * sumxy)/ denom;
    b = (N*sumxy - sumx * sumy)/ denom;
    

    Ah, I see we have a, b reversed.


    [Append]

    Wrong base

    Code is using log() (log base-e). Likely wants log10() if "Fit(x) = x**%lf * 10**%lf\n" is important.