Search code examples
c# .net linq linqpad drop-duplicates

How to remove duplicates using time difference with linq


I have an IEnumerable of an item class defined like this:

/// <summary>
/// One checkup entry, described by a subject, a doctor and a timestamp.
/// </summary>
class Checkup
{
    /// <summary>GUID of the subject this entry belongs to.</summary>
    public Guid SubjectGuid { get; set; }

    /// <summary>GUID of the doctor this entry belongs to.</summary>
    public Guid DoctorGuid { get; set; }

    /// <summary>Date and time of the entry.</summary>
    public DateTime Date { get; set; }
}

For example I have the following data set:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:04:46.644
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:12:27.369
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36.564
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:24:46.935
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:39:27.853
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27.853
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00.000
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00.203
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33.906

I want to remove duplicates: two items are duplicates if their SubjectGuid and DoctorGuid are the same and the difference between their dates is less than or equal to twenty minutes. If there are more than two duplicates, then the element with the most recent date within 20 minutes should remain:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:04:46.644
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36.564
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:24:46.935
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27.853
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00.000
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00.203
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33.906

Here is the code (LINQPad) with which I tried to get the desired result, but it does not work as I would like:

void Main()
{
    // Builds one sample row from its textual parts. The date text is parsed
    // with DateTime.Parse, i.e. using the current culture.
    Checkup Row(string subject, string doctor, string date) => new Checkup
    {
        SubjectGuid = Guid.Parse(subject),
        DoctorGuid = Guid.Parse(doctor),
        Date = DateTime.Parse(date)
    };

    // The sample data consists of three (subject, doctor) pairs with three rows each.
    const string subjectA = "2b9dd19f-9ce4-4a0a-832c-e941f5dc0234";
    const string doctorA = "6cfbdc40-d30d-4b49-9f02-ca116bcffee0";
    const string subjectB = "bf4e7572-2328-4b19-9a9e-a67aa2f24fba";
    const string doctorB = "b8dedde9-a397-4469-9c1b-22af4194f35f";
    const string subjectC = "efea799a-c755-4f4f-ad80-8ac63fd35d07";
    const string doctorC = "b8e88718-55f0-4700-9900-09e8d440345b";

    var checkups = new List<Checkup>
    {
        Row(subjectA, doctorA, "03.03.2024 11:04:46.644"),
        Row(subjectA, doctorA, "03.03.2024 11:12:27.369"),
        Row(subjectA, doctorA, "03.03.2024 11:30:36.564"),
        Row(subjectB, doctorB, "03.03.2024 16:24:46.935"),
        Row(subjectB, doctorB, "03.03.2024 16:39:27.853"),
        Row(subjectB, doctorB, "03.03.2024 16:59:27.853"),
        Row(subjectC, doctorC, "03.03.2024 19:00:00.000"),
        Row(subjectC, doctorC, "03.03.2024 19:21:00.203"),
        Row(subjectC, doctorC, "03.03.2024 19:50:33.906"),
    };
    checkups.Dump();

    // Width of one grouping bucket, expressed in ticks.
    long bucketTicks = TimeSpan.FromMinutes(20).Ticks;

    // Groups rows by subject, doctor and the fixed 20-minute bucket their date
    // falls into, then keeps the latest row of every group. The buckets come from
    // an integer division of the absolute tick count, so two rows that are less
    // than 20 minutes apart can still end up in different buckets.
    var checkupsNoDuplicates = checkups
        .GroupBy(c => (c.SubjectGuid, c.DoctorGuid, Bucket: c.Date.Ticks / bucketTicks))
        .Select(bucket => bucket.OrderByDescending(c => c.Date).FirstOrDefault());
    checkupsNoDuplicates.Dump();
}

/// <summary>
/// Sample record used by the query: a subject, a doctor and the moment of the checkup.
/// </summary>
class Checkup
{
    /// <summary>GUID of the subject; part of the duplicate-detection key.</summary>
    public Guid SubjectGuid { get; set; }

    /// <summary>GUID of the doctor; part of the duplicate-detection key.</summary>
    public Guid DoctorGuid { get; set; }

    /// <summary>Timestamp of the checkup, compared between rows of the same key.</summary>
    public DateTime Date { get; set; }
}

This code produces this result:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:12:27
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:39:27
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33

Also, dividing ticks will not always work. For example, the ticks division in this code:

void Main()
{
    // The sample text is day-first ("03.01.2024" = 3 January 2024). Parsing it with
    // an explicit format and the invariant culture makes the demo print the same
    // values on every machine instead of depending on the current culture.
    const string inputFormat = "dd.MM.yyyy HH:mm:ss.fff";

    // Month-first output with a 24-hour clock ("HH"); "hh" would print a 12-hour
    // value without an AM/PM designator.
    const string outputFormat = "MM.dd.yyyy HH:mm:ss.fff";

    var culture = System.Globalization.CultureInfo.InvariantCulture;

    // Width of one bucket, in ticks.
    long bucketTicks = TimeSpan.FromMinutes(20).Ticks;

    // The two timestamps are less than 20 minutes apart, yet the integer division
    // assigns them to different buckets.
    foreach (string text in new[] { "03.01.2024 01:27:44.907", "03.01.2024 01:41:55.088" })
    {
        DateTime value = DateTime.ParseExact(text, inputFormat, culture);
        Console.WriteLine(value.ToString(outputFormat, culture));
        Console.WriteLine(value.Ticks / bucketTicks);
    }
}

produces this result:

01.03.2024 01:27:44.907
53199868
01.03.2024 01:41:55.088
53199869

Solution

  • You can leverage the Aggregate method, which performs a reduction.

    Here we go:

    // Two checkups of the same (SubjectGuid, DoctorGuid) pair are treated as
    // duplicates when they are at most this far apart.
    TimeSpan duplicateWindow = TimeSpan.FromMinutes(20);

    List<Checkup> result = checkups
        .OrderBy(checkup => checkup.Date)   // oldest first
        .Aggregate(
            new List<Checkup>(),
            (kept, current) =>
            {
                // Most recently kept checkup of the same pair, if there is one.
                Checkup? previous = kept.LastOrDefault(k =>
                    k.SubjectGuid == current.SubjectGuid && k.DoctorGuid == current.DoctorGuid);

                // The item is dropped only when a kept checkup of the same pair
                // exists and lies within the duplicate window.
                bool isDuplicate = previous is not null
                    && current.Date - previous.Date <= duplicateWindow;

                if (!isDuplicate)
                {
                    kept.Add(current);
                }

                return kept;
            });
    

    First, the items are sorted by date in ascending order; this step can be skipped if the input data is guaranteed to be sorted already.

    Then, there is the aggregation block.

    The Aggregate function takes as first argument the initial value of the expected result, which is an empty List<Checkup> in our case.

    The core function is its second argument, which operates on the "accumulation" and the current item. While iterating, the accumulator represents the result of the current step. The function then computes the post-step result, which becomes the accumulator for the next iteration.

    When the iteration is over, the actual result is what the accumulator contains.

    What does the core function do in our case?

    1. It finds the last entry of the same (SubjectGuid, DoctorGuid) pair.

    2. If there is none, it adds the current item to the accumulator.

    3. If there is one, it checks whether enough time (more than 20 minutes) has elapsed since that entry, and adds the item only if so.