Are there any real benefits to compiling a 32-bit version of my DLL with AVX or higher?

I have a legacy Windows DLL (written in C++) for which I need to maintain a 32-bit version alongside the 64-bit version. I'm updating the heavy math code with SIMD using Agner Fog's Vector Class Library, and I'm seeing little or no speed improvement for the 32-bit version when compiling with AVX as compared to SSE4.2. I'm aware that 32-bit code always has only 8 vector registers available, but I'm not clear (after much searching) exactly what this means when compiling with AVX, AVX2 or AVX512. Are there compiler options (Microsoft or Clang) that will give me a worthwhile speed improvement over SSE4.2 (for simple loops of floating-point operations), or should I just save myself some trouble and compile the 32-bit version with SSE4.2?

Solution

I'm answering this question myself even though the question should arguably just be deleted ... maybe it will help someone, sometime.

By the time I had tuned my SIMD code (aligning the memory made a big difference) and experimented with the MSVC compiler options, my 32-bit build started behaving exactly as expected when comparing no SIMD to SSE4.2, AVX and AVX512. Benchmarking the sample code below showed speed improvement ratios of 48%, 22% and 10% for SSE4.2, AVX and AVX512, respectively, for the 32-bit build.

Oddly, the 64-bit compile runs much faster for no simd but slightly SLOWER than the 32-bit for all three simd options (good subject for a new question).

I compiled the code without the /Qpar switch, and with /Qvec-report:2 /Qpar-report:2, to verify to the extent possible that no auto-vectorization or auto-parallelization was going on.

/// Micro-benchmark kernel: fills two geometric series (UP, DN) and multiplies
/// them element-wise into TR, repeating the multiply pass `iters` times with
/// either the vector-class types or a plain scalar loop.
///
/// @param idebug_branch  2 selects the scalar loop; any other value selects
///                       the SIMD loop. The value is also folded into the
///                       return code.
/// @param iters          Number of times the multiply pass is repeated.
/// @param asize          Requested element count. It is rounded up to a
///                       multiple of SIMD_SIZE_SPN so the SIMD loop never
///                       needs a scalar tail.
/// @return 10000 * idebug_branch + (rounded-up element count) on success,
///         -703 if an over-allocated buffer could not be aligned, or -3 if
///         asize is negative.
///
/// Allocation failures still propagate as exceptions from operator new[];
/// buffers that were already allocated are released before the exception
/// leaves this function.
///
/// NOTE(review): TR is never read after the loops, so an optimizer may be
/// entitled to discard the multiply passes -- confirm the timings measure
/// what is intended.
int Simd_debug(int idebug_branch, int iters, int asize)
{
    // Owns the storage behind one working array and frees it on every exit
    // path, so an early return or a throwing allocation cannot leak.
    struct Buffer
    {
        char* raw = nullptr;      // over-allocated storage (aligned mode)
        double* plain = nullptr;  // direct allocation (unaligned mode)
        double* data = nullptr;   // pointer the loops actually use

        Buffer() = default;
        Buffer(const Buffer&) = delete;
        Buffer& operator=(const Buffer&) = delete;

        ~Buffer()
        {
            delete[] raw;
            delete[] plain;
        }

        // Allocates `count` doubles plus `alignment` spare bytes, then points
        // `data` at the first address inside that is a multiple of
        // `alignment` (which must be a power of two).
        void allocate_aligned(size_t count, size_t alignment)
        {
            raw = new char[count * sizeof(double) + alignment];
            const uintptr_t start = reinterpret_cast<uintptr_t>(raw);
            const uintptr_t mask = static_cast<uintptr_t>(alignment) - 1;
            const uintptr_t aligned = (start + mask) & ~mask;
            data = reinterpret_cast<double*>(raw + (aligned - start));
        }

        // Allocates `count` doubles with whatever alignment new[] provides.
        void allocate_plain(size_t count)
        {
            plain = new double[count];
            data = plain;
        }

        bool is_aligned(size_t alignment) const
        {
            return (reinterpret_cast<uintptr_t>(data) & (alignment - 1)) == 0;
        }
    };

    const int vectorsize = SIMD_SIZE_SPN;   //8, 4, 2 = AVX512, AVX, SSE size
    #if SIMD_SIZE_SPN == 8
        Vec8d vec_up, vec_dn, vec_tree;
    #elif SIMD_SIZE_SPN == 4
        Vec4d vec_up, vec_dn, vec_tree;
    #else
        Vec2d vec_up, vec_dn, vec_tree;
    #endif

    const bool go_align_mem = true;
    const bool go_simd = (idebug_branch != 2);

    // A negative count would become a negative (or wrapped) allocation size.
    // -3 was the function's initial "nothing done" result value.
    if (asize < 0)
    {
        return -3;
    }

    const size_t alignby = sizeof(double) * static_cast<size_t>(vectorsize);

    // Round up to a whole number of vectors. vectorsize is a power of two, so
    // masking with -vectorsize clears the low bits.
    const int arraysize = (asize + vectorsize - 1) & (-vectorsize);
    const size_t count = static_cast<size_t>(arraysize);

    Buffer tr_buf;
    Buffer up_buf;
    Buffer dn_buf;

    if (go_simd && go_align_mem)
    {
        tr_buf.allocate_aligned(count, alignby);
        up_buf.allocate_aligned(count, alignby);
        dn_buf.allocate_aligned(count, alignby);

        //debug check alignment
        if (!tr_buf.is_aligned(alignby) || !up_buf.is_aligned(alignby) || !dn_buf.is_aligned(alignby))
        {
            return -703;
        }
    }
    else
    {
        tr_buf.allocate_plain(count);
        up_buf.allocate_plain(count);
        dn_buf.allocate_plain(count);
    }

    double* const TR = tr_buf.data;
    double* const UP = up_buf.data;
    double* const DN = dn_buf.data;

    const double u = 1.01;
    const double d = 0.99;

    // Seed the inputs: UP[k] = u^(k+1), DN[k] = d^(k+1). Skipped entirely for
    // an empty array, where even element 0 does not exist.
    if (arraysize > 0)
    {
        UP[0] = u;
        DN[0] = d;
        for (int k = 1; k < arraysize; k++)
        {
            UP[k] = u * UP[k - 1];
            DN[k] = d * DN[k - 1];
        }
    }

    for (int j = 0; j < iters; j++)
    {
        if (go_simd)
        {
            // arraysize is already a multiple of vectorsize, so the vector
            // loop covers every element without a remainder pass.
            for (int k = 0; k < arraysize; k += vectorsize)
            {
                vec_up.load(UP + k);
                vec_dn.load(DN + k);
                vec_tree = vec_up * vec_dn;
                vec_tree.store(TR + k);
            }
        }
        else
        {
            #pragma loop(no_vector) //don't need this, according to /Qvec-report:2 ...
            for (int k = 0; k < arraysize; k++)
            {
                TR[k] = UP[k] * DN[k];
            }
        }//if (go_simd)
    }

    return 10000 * idebug_branch + arraysize;
}