Search code examples
scalaperformanceperformance-testing

Trying to understand Scala Array


I am trying to evaluate which data structure would be best represent sparse vectors in Scala. These sparse vectors contain a list of indexes, and one value for each index. I implemented a small benchmark, which seems to indicate that an Array[(Long, Double)] seems to take a lot less space than 2 parallel arrays. Is that correct? Am I doing that benchmark correctly? (I wouldn't be surprised if I did something wrong somewhere)

import java.lang.management.ManagementFactory
import java.text.NumberFormat

object TestSize {

  val N = 100000000
  val formatter: NumberFormat = java.text.NumberFormat.getIntegerInstance

  def twoParallelArrays(): Unit = {

    val Z1 = Array.ofDim[Long](N)
    val Z2 = Array.ofDim[Double](N)
    Z1(N-1) = 1
    Z2(N-1) = 1.0D
    println(Z2(N-1) - Z1(N-1))
    val z1 = ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
    val z2 = ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage.getUsed
    println(s"${formatter.format(z1)} ${formatter.format(z2)}")
  }

  def arrayOfTuples(): Unit = {

    val Z = Array.ofDim[(Long, Double)](N)
    Z(N-1) = (1, 1.0D)
    println(Z(N-1)._2 - Z(N-1)._1)
    val z1 = ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
    val z2 = ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage.getUsed
    println(s"${formatter.format(z1)} ${formatter.format(z2)}")
  }

  def main(args: Array[String]): Unit = {

    // Comment one or the other to look at the results
    //arrayOfTuples()
    twoParallelArrays()
  }
}

Solution

  • No, not correct.

    Array.ofDim[(Long, Double)](N)
    

    creates a large array filled with null, and does not allocate any space for Long, Double, or the actual Tuple2-instance, that's why you don't see anything in heap-memory usage.

    The two-array version allocates all the space it needs for all Long and Double immediately, and you see it in heap space usage.

    Just replace ofDim by an appropriate fill to see the real numbers.

    On array of size N = 1000000:

    arrayOfTuples:     45,693,312 19,190,296
    twoParallelArrays: 25,925,792 19,315,256
    

    The arrayOfTuples-solution clearly takes more space.

    You might wonder why the factor is roughly 1.8 instead of at least 2.5. This is because Tuple2 is @specialized for a few primitive datatypes, especially for Long and Double, therefore these two 8-byte primitives can be stored in Tuple2 without boxing. Therefore, the total overhead is only 8 bytes for a 64-bit reference from array to Tuple2, and some overhead in each Tuple2 instance. But still, it's more than storing Long and Double directly in arrays.

    By the way: that's exactly the reason why Apache Spark stores the data using all those Encoders: so that you don't have to worry about the overhead caused by your tuples and case-classes.


    Full code:

    import java.lang.management.ManagementFactory
    import java.text.NumberFormat
    
    object TestSize {
    
      val N = 1000000
      val formatter: NumberFormat = java.text.NumberFormat.getIntegerInstance
    
      def twoParallelArrays(): Unit = {
    
        val Z1 = Array.fill[Long](N)(42L)
        val Z2 = Array.fill[Double](N)(42.0)
        println(Z1)
        println(Z2)
        Z1(N-1) = 1
        Z2(N-1) = 1.0D
        println(Z2(N-1) - Z1(N-1))
        val z1 = ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
        val z2 = ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage.getUsed
        Z1((new scala.util.Random).nextInt(N)) = 1234L
        Z2((new scala.util.Random).nextInt(N)) = 345.0d
        println(Z2(N-1) - Z1(N-1))
        println(s"${formatter.format(z1)} ${formatter.format(z2)}")
      }
    
      def arrayOfTuples(): Unit = {
    
        val Z = Array.fill[(Long, Double)](N)((42L, 42.0d))
        Z(N-1) = (1, 1.0D)
        println(Z(N-1)._2 - Z(N-1)._1)
        val z1 = ManagementFactory.getMemoryMXBean.getHeapMemoryUsage.getUsed
        val z2 = ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage.getUsed
        Z((new scala.util.Random).nextInt(N)) = (1234L, 543.0d)
        println(Z(N-1)._2 - Z(N-1)._1)
        println(s"${formatter.format(z1)} ${formatter.format(z2)}")
      }
    
      def main(args: Array[String]): Unit = {
    
        // Comment one or the other to look at the results
        arrayOfTuples()
        // twoParallelArrays()
      }
    }