c++math optimization implementation number-theory

A Program to Find Absolute Euler Pseudoprimes

I am now trying to make a program to find the Absolute Euler Pseudoprimes ('AEPSP' in short, not Euler-Jacobi Pseudoprimes), with the definition that n is an AEPSP if

a^(n-1)/2 ≡ ±1 (mod n)

for all positive integers a satisfying that the GCD of a and n is 1.

I used a C++ code to generate AEPSPs, which is based on a code to generate Carmichael numbers:

#include <iostream>
#include <cmath>
#include <algorithm>
#include <numeric>

using namespace std;

unsigned int bm(unsigned int a, unsigned int n, unsigned int p){
    unsigned long long b;
    switch (n) {
        case 0:
            return 1;
        case 1:
            return a % p;
        default:
            b = bm(a,n/2,p);
            b = (b*b) % p;
            if (n % 2 == 1) b = (b*a) % p;
            return b;
        }
} 

int numc(unsigned int n){
    int a, s;
    int found = 0;
    if (n % 2 == 0) return 0;
    s = sqrt(n);
    a = 2;
    while (a < n) {
        if (a > s && !found) {
            return 0;
        }
        if (gcd(a, n) > 1) {
            found = 1;
        }
        else {
            if (bm(a, (n-1)/2, n) != 1) {
                return 0;
            }
        }
        a++;
    }
    return 1;
}

int main(void) {
    unsigned int n;
    for (n = 3; n < 1e9; n += 2){
    if (numc(n)) printf("%u\n",n);
    }
    return 0; 
}

Yet, the program is very slow. It generates AEPSPs up to 1.5e6 in 20 minutes. Does anyone have any ideas on optimizing the program?

Any help is most appreciated. :)

Solution

I've come up with a different algorithm, based on sieving for primes upfront while simultaneously marking off non-squarefree numbers. I've applied a few optimizations to pack the information into memory a bit tighter, to help with cache-friendliness as well. Here is the code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define PRIME_BIT  (1UL << 31)
#define SQUARE_BIT (1UL << 30)
#define FACTOR_MASK (SQUARE_BIT - 1)

void sieve(uint64_t size, uint32_t *buffer) {
    for (uint64_t i = 3; i * i < size; i += 2) {
        if (buffer[i/2] & PRIME_BIT) {
            for (uint64_t j = i * i; j < size; j += 2 * i) {
                buffer[j/2] &= SQUARE_BIT;
                buffer[j/2] |= i;
            }
            for (uint64_t j = i * i; j < size; j += 2 * i * i) {
                buffer[j/2] |= SQUARE_BIT;
            }
        }
    }
}

int main(int argc, char **argv) {
    if (argc < 2) {
        printf("Usage: prog LIMIT\n");
        return 1;
    }

    uint64_t size = atoi(argv[1]);

    uint32_t *buffer = malloc(size * sizeof(uint32_t) / 2);
    memset(buffer, 0x80, size * sizeof(uint32_t) / 2);

    sieve(size, buffer);

    for (uint64_t i = 5; i < size; i += 4) {
        if (buffer[i/2] & PRIME_BIT)
            continue;
        if (buffer[i/2] & SQUARE_BIT)
            continue;

        uint64_t num = i;
        uint64_t factor;

        while (num > 1) {
            if (buffer[num/2] & PRIME_BIT)
                factor = num;
            else
                factor = buffer[num/2] & FACTOR_MASK;
            
            if ((i / 2) % (factor - 1) != 0) {
                break;
            }
            num /= factor;
        }

        if (num == 1)
            printf("Found pseudo-prime: %ld\n", i);
    }
}

This produces the pseudo-primes up to 1.5e6 in about 8ms on my machine, and for 2e9 it takes 1.8sec.

The time complexity of the solution is O(n log n) - the sieve is O(n log n), and for each number we either do constant time checks or do a divisibility test for each of its factors, which there are at most log n. So, the main loop is also O(n log n), resulting in O(n log n) overall.