Tags: apache-spark-sql, scale, bigdecimal

spark sql: select rows where a column of DecimalType has a larger scale than a number


I have a dataframe that has a column of datatype DecimalType(38,10). Not all values have 10 decimal digits. I want to select the rows whose scale is bigger than 4 (after removing the trailing zeros).

Is there a way to do that?

In pseudocode, something like ds.select(col1, col2).where(col3.hasScale > 4)


Solution

  • Something like this could do it:

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.functions.{col, udf}
    import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
    
    val maxScale = 10
    
    val decimalType = DataTypes.createDecimalType(38, maxScale)
    
    // Sample data with varying numbers of decimal digits.
    val data = Seq(
      Row(BigDecimal.decimal(3.302302)),
      Row(BigDecimal.decimal(3.4434)),
      Row(BigDecimal.decimal(4.32)),
      Row(BigDecimal.decimal(4.230240505)),
      Row(BigDecimal.decimal(7.302)),
      Row(BigDecimal.decimal(4.34444))
    )
    
    val schema = List(
      StructField("number", decimalType, true)
    )
    
    val df = spark.createDataFrame(
      spark.sparkContext.parallelize(data),
      StructType(schema)
    )
    
    df.show()
    
    // For each candidate scale s (0 to maxScale), shift the value left by
    // maxScale digits and test whether it is divisible by 10^(maxScale - s);
    // the first s that divides evenly is the effective scale, with trailing
    // zeros ignored. Rounding to Long before the modulo avoids floating-point
    // noise from the Double multiplication.
    val decimalScale = udf((n: Double) => {
      val shifted = math.round(n * math.pow(10, maxScale))
      Stream.range(0, maxScale + 1).map { s =>
        val modulus = math.pow(10, maxScale - s).toLong
        (s, shifted % modulus)
      }.find(_._2 == 0).get._1
    })
    
    // Keep only the rows whose effective scale is greater than 4.
    df.filter(decimalScale(col("number")) > 4).show()
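  • A shorter variant (just a sketch, not part of the original answer): since the column is already a DecimalType, the UDF can take a java.math.BigDecimal and use stripTrailingZeros/scale to compute the effective scale directly, avoiding the Double round trip:

    import org.apache.spark.sql.functions.{col, udf}
    
    // scale() after stripTrailingZeros() gives the number of remaining decimal
    // digits; it can be negative for values like 4E+2, hence the max with 0.
    // Option handles possible nulls in the nullable column.
    val bigDecimalScale = udf((n: java.math.BigDecimal) =>
      Option(n).map(v => math.max(v.stripTrailingZeros.scale, 0))
    )
    
    df.filter(bigDecimalScale(col("number")) > 4).show()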