Search code examples
excelvbaexcel-formula

How can I automatically and efficiently group by column and count distinct values on another column within that group when data has thousands of rows?


Here is some data generated using R:

library(tidyverse)
set.seed(0)

tibble(A = round(rnorm(10, 20, 2)),
       B = round(rnorm(10, 100, 2))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%  # count distinct A per B
  arrange(B, A) %>% 
  write_csv("small_example.csv")

tibble(A = round(rnorm(5000, 10000, 1000)),
       B = round(rnorm(5000, 50000, 1000))) %>% 
  group_by(B) %>% 
  mutate(C = n_distinct(A)) %>%
  arrange(B, A) %>% 
  write_csv("big_example.csv")

small_example.csv should look something like this:

small_example.csv

I am attempting to recreate column C in column D, using VBA in Excel:

Sub UpdateFormulas()
  Dim last_row As Integer
  last_row = ActiveSheet.UsedRange.Rows.Count
  For i = 2 To last_row
    ActiveSheet.Cells(i, 4).Formula = "=SUMPRODUCT(($B$2:$B$" & last_row & "=B" & _
    i & ")/COUNTIFS($A$2:$A$" & last_row & ", $A$2:$A$" & last_row & ", $B$2:$B$" _
    & last_row & ", $B$2:$B$" & last_row & "))"
  Next i
End Sub

This VBA procedure does exactly what it is supposed to do and it works regardless of the length of the data. However, the formula I'm using is way too slow and seems to be an O(N^2) operation. How can I efficiently update these values on demand? I'm not opposed to simpler methods, but the method must be fairly automatic.


Solution

    • Using Dictionary improves efficiency
    • 1000 rows sample data takes less than 1 sec
    Option Explicit
    
    Sub Demo()
        Dim i As Long, lastRow As Long
        Dim arrData, rngData As Range
        Dim oDicB, oDicAB
        Dim sKeyB As String, sKeyAB As String
        Set oDicB = CreateObject("scripting.dictionary")
        Set oDicAB = CreateObject("scripting.dictionary")
        lastRow = Cells(Rows.Count, "A").End(xlUp).Row
        Set rngData = Range("A2:C" & lastRow)
        arrData = rngData.Value
        For i = LBound(arrData) To UBound(arrData)
            arrData(i, 1) = CStr(arrData(i, 1))
            arrData(i, 2) = CStr(arrData(i, 2))
            sKeyB = arrData(i, 2)
            sKeyAB = arrData(i, 1) & "|" & arrData(i, 2)
            If Not oDicAB.exists(sKeyAB) Then
                oDicAB(sKeyAB) = ""
                If Not oDicB.exists(sKeyB) Then
                    oDicB(sKeyB) = 1
                Else
                    oDicB(sKeyB) = oDicB(sKeyB) + 1
                End If
            End If
        Next i
        For i = LBound(arrData) To UBound(arrData)
            sKeyB = arrData(i, 2)
            If oDicB.exists(sKeyB) Then
                arrData(i, 3) = oDicB(sKeyB)
            End If
        Next
        rngData.Value = arrData
    End Sub