Search code examples
c#parallel-processingtaskparallel.foreachpartitioner

Why is a parallel-processing much slower for a first call in C#?


I am trying to process numbers as fast as possible with C# app. I use a Thread.Sleep() to simulate a processing and random numbers. I use 3 different techniques.

This is test code that I used:

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;

namespace Test
{
    internal class Program
    {
        private static void Main()
        {
            var data = new int[500000];
            var random = new Random();

            for (int i = 0; i < 500000; i++)
            {
                data[i] = random.Next();
            }

            var partialTimes = new Dictionary<int, double>();
            var iterations = 5;

            for (int i = 1; i < iterations + 1; i++)
            {
                Console.Write($"ProcessData3 {i}\t");
                StartProcessing(data, partialTimes, ProcessData3);
                GC.Collect();
            }

            Console.WriteLine();
            Console.WriteLine("Press Enter to Exit");
            Console.ReadLine();
        }

        private static void StartProcessing(int[] data, Dictionary<int, double> partialTimes, Action<int[], Dictionary<int, double>> processData)
        {
            var stopwatch = Stopwatch.StartNew();

            try
            {
                processData?.Invoke(data, partialTimes);
                stopwatch.Stop();

                Console.WriteLine($"{stopwatch.Elapsed.ToString(@"mm\:ss\:fffffff")} total = {partialTimes.Sum(s => s.Value)} max = {partialTimes.Values.Max()}");
            }
            finally
            {
                partialTimes.Clear();
            }
        }

        private static void ProcessData1(int[] data, Dictionary<int, double> partialTimes)
        {
            Parallel.ForEach(data, number =>
            {
                var partialStopwatch = Stopwatch.StartNew();

                Thread.Sleep(1);

                partialStopwatch.Stop();

                lock (partialTimes)
                {
                    partialTimes[number] = partialStopwatch.Elapsed.TotalMilliseconds;
                }
            });
        }

        private static void ProcessData3(int[] data, Dictionary<int, double> partialTimes)
        {
            // Partition the entire source array.
            var rangePartitioner = Partitioner.Create(0, data.Length);

            // Loop over the partitions in parallel.
            Parallel.ForEach(rangePartitioner, (range, loopState) =>
            {
                // Loop over each range element without a delegate invocation.
                for (int i = range.Item1; i < range.Item2; i++)
                {
                    var number = data[i];
                    var partialStopwatch = Stopwatch.StartNew();

                    Thread.Sleep(1);

                    partialStopwatch.Stop();

                    lock (partialTimes)
                    {
                        partialTimes[number] = partialStopwatch.Elapsed.TotalMilliseconds;
                    }
                }
            });
        }

        private static void ProcessData2(int[] data, Dictionary<int, double> partialTimes)
        {
            var tasks = new Task[data.Count()];
            for (int i = 0; i < data.Count(); i++)
            {
                var number = data[i];

                tasks[i] = Task.Factory.StartNew(() =>
                {
                    var partialStopwatch = Stopwatch.StartNew();

                    Thread.Sleep(1);

                    partialStopwatch.Stop();

                    lock (partialTimes)
                    {
                        partialTimes[number] = partialStopwatch.Elapsed.TotalMilliseconds;
                    }
                });
            }

            Task.WaitAll(tasks);
        }
    }
}

For each technique I restart the program. And I get these results,
with having a Thread.Sleep( 1 ):

ProcessData1 1  00:56:1796688 total = 801335,282599955 max = 16,8783
ProcessData1 2  00:23:5390014 total = 816167,642100022 max = 14,5913
ProcessData1 3  00:14:7090566 total = 827589,675899998 max = 13,2617
ProcessData1 4  00:10:8929177 total = 829296,528300007 max = 15,0175
ProcessData1 5  00:10:6333310 total = 839282,123200008 max = 29,2738

ProcessData2 1  00:37:8084153 total = 824507,174200022 max = 112,071
ProcessData2 2  00:16:3762096 total = 849272,47810001  max = 77,1514
ProcessData2 3  00:12:9177717 total = 854012,353100029 max = 67,5684
ProcessData2 4  00:10:4798701 total = 857396,642899983 max = 92,9408
ProcessData2 5  00:09:2206146 total = 870966,655499989 max = 51,8945

ProcessData3 1  01:13:6814541 total = 803581,718699918 max = 25,6815
ProcessData3 2  01:07:9809277 total = 814069,532899922 max = 26,0671
ProcessData3 3  01:07:9857984 total = 814148,329399928 max = 21,3116
ProcessData3 4  01:07:4812183 total = 808042,695499966 max = 16,8601
ProcessData3 5  01:07:2954614 total = 805895,325499903 max = 23,8517

Where
total is total a time spent inside each Parallel.ForEach() function together and
max is a maximum time of each function.

Why is the first loop so slow? How is it possible that other attempts are processed so quickly? How to achieve a faster parallel processing on the first attempt?


EDIT:

So I tried it also with having a Thread.Sleep( 10 )
Results are:

ProcessData1 1  02:50:2845698 total = 5109831,95429994 max = 12,0612
ProcessData1 2  00:56:3361645 total = 5125884,05919954 max = 12,7666
ProcessData1 3  00:53:4911541 total = 5131105,15209993 max = 12,7486
ProcessData1 4  00:49:5665628 total = 5144654,75829992 max = 13,2678
ProcessData1 5  00:46:0218194 total = 5152955,19509996 max = 13,702

ProcessData2 1  01:21:7207557 total = 5121889,31579983 max = 73,8152
ProcessData2 2  00:39:6660074 total = 5175557,68889969 max = 59,369
ProcessData2 3  00:31:9036416 total = 5193819,89889973 max = 56,2895
ProcessData2 4  00:27:4616803 total = 5207168,56969977 max = 65,5495
ProcessData2 5  00:24:4270755 total = 5222567,9044998  max = 65,368

ProcessData3 1  02:44:9985645 total = 5110117,19019997 max = 11,7172
ProcessData3 2  02:25:6533128 total = 5237779,27010012 max = 26,3171
ProcessData3 3  02:22:2771259 total = 5116123,45259975 max = 12,0581
ProcessData3 4  02:22:1678911 total = 5112574,93779995 max = 11,5334
ProcessData3 5  02:21:9418178 total = 5104980,07120004 max = 11,5583

So first loop still takes much more seconds than others..


Solution

  • The behavior you're seeing is entirely explained by the fact that the ThreadPool class delays creating new threads until some small amount of time has passed (on the order of 1 second…it's changed over the years).

    It can be informative to add instrumentation to one's program. In your example, a very useful tool is to count the number of concurrent threads as managed by the thread pool, determine the "high water mark" (i.e. the maximum number of threads it eventually settles on), and then use that number to override the thread pool's behavior.

    When I did that, I discovered that on the first run of the first method, you get up to about 25 threads. But since the default for the thread pool is to only create a number of threads equal to the number of cores on your computer (eight, in my case), creating the additional threads can take a fair amount of time. And of course, during that time, you get significantly less throughput than you would otherwise (so you incur a larger delay than just the 20 seconds or so getting up to that number of threads causes).

    On the subsequent runs of that test, the max number of threads gradually rises (since each new run is starting with more threads in the thread pool already, from the previous run) gets as high as around 53.

    If you know in advance how many threads the thread pool is going to require in order to perform your work efficiently, you can use the SetMinThreads() method to increase the number of threads it will create immediately on demand before switching to the throttled thread-creation algorithm. For example, having that 53 thread high water mark in hand, you can set the number of minimum threads to that number (or a nice round one, like 50).

    When I do that, all five runs of your first test, which previously took between 25 seconds to 1 minute (with the longer runs being earlier, of course), take around 19 seconds to complete.

    I'd like to emphasize that you should use SetMinThreads() very carefully. The thread pool is, in general, very good about managing work-loads. The scenario you present above is obviously just for the sake of example and not realistic, but it does have the problem that you're not really doing that much work in each Parallel.ForEach() iteration in the first place. It doesn't seem like a good fit for concurrency, since so much of the time spent will be on overhead. Using SetMinThreads() in any similar scenario just papers over a more insidious underlying issue.

    You'll find that if you tailor your workloads to better match available resources, and to minimize transitions between tasks and threads, you can get good throughput without overriding the default thread pool numbers.


    Some other notes on this particular test…

    Note that if you change the program to run all three tests in the same session (five runs each), the "first run is longer" happens only for the first test. For future reference, you should always approach this sort of "first time is slower" question with an eye to testing different combinations and ordering, to verify whether it's a particular implementation that suffers from the effect, or if you see the effect for the first test, regardless of which implementation is run first. There are a number of implementation and platform details, including JIT, thread pool, disk cache that can affect the initial run of any algorithm, and you'll want to make sure that you quickly narrow down your search to knowing whether you're dealing with one of those or some genuine issue in your own algorithm.

    By the way, not that it really matters for your question, but I find it odd your choice to use the random number in the data array as the key for your timings dictionary. This IMHO renders those timing values useless, due to collisions in the random numbers. You won't count every time (when there's a collision, only the last instance of that number will get stored) which means that the "total" time displayed is less than the true total time spent, and even the max values won't necessarily be correct (if the true max value gets overwritten by a later value using the same key, you'll miss it).


    Here's my modified version of your first test, which shows both the diagnostic code I added, and (commented out) the statements to set the thread pool counts to produce faster, more consistent behavior:

    private static int _threadCount1;
    private static int _maxThreadCount1;
    
    private static void ProcessData1(int[] data, Dictionary<int, double> partialTimes)
    {
        const int minOverride = 50;
        int minMain, minIOCP, maxMain, maxIOCP;
    
        ThreadPool.GetMinThreads(out minMain, out minIOCP);
        ThreadPool.GetMaxThreads(out maxMain, out maxIOCP);
    
        WriteLine($"cores: {Environment.ProcessorCount}");
        WriteLine($"threads: {minMain} min, {maxMain} max");
    
        // Uncomment two lines below to see uniform behavior across test runs:
    
        //ThreadPool.SetMinThreads(minOverride, minIOCP);
        //ThreadPool.SetMaxThreads(minOverride, maxIOCP);
    
        _threadCount1 = _maxThreadCount1 = 0;
    
        Parallel.ForEach(data, number =>
        {
            int threadCount = Interlocked.Increment(ref _threadCount1);
    
            var partialStopwatch = Stopwatch.StartNew();
    
            Thread.Sleep(1);
    
            partialStopwatch.Stop();
    
            lock (partialTimes)
            {
                partialTimes[number] = partialStopwatch.Elapsed.TotalMilliseconds;
                if (_maxThreadCount1 < threadCount)
                {
                    _maxThreadCount1 = threadCount;
                }
            }
    
            Interlocked.Decrement(ref _threadCount1);
        });
    
        ThreadPool.SetMinThreads(minMain, minIOCP);
        ThreadPool.SetMaxThreads(maxMain, maxIOCP);
        WriteLine($"max thread count: {_maxThreadCount1}");
    }