I have a dataframe with a column of datatype DecimalType(38,10). Not all values have 10 decimal digits. I want to select the rows whose scale is bigger than 4 (after removing the trailing zeros).
Is there a way to do that?
In pseudo code, something like ds.select(col1, col2).where(col3.hasScale > 4)
Something like this could do it:
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
val maxScale = 10
val decimalType = DataTypes.createDecimalType(38, maxScale)

// Sample data with between 2 and 9 decimal digits
val data = Seq(
  Row(BigDecimal.decimal(3.302302)),
  Row(BigDecimal.decimal(3.4434)),
  Row(BigDecimal.decimal(4.32)),
  Row(BigDecimal.decimal(4.230240505)),
  Row(BigDecimal.decimal(7.302)),
  Row(BigDecimal.decimal(4.34444))
)

val schema = List(
  StructField("number", decimalType, true)
)

val df = spark.createDataFrame(
  spark.sparkContext.parallelize(data),
  StructType(schema)
)

df.show()
// Effective scale of n: the smallest s such that n, shifted left by maxScale digits,
// has no remainder modulo 10^(maxScale - s), i.e. no non-zero digit past position s.
// The UDF receives the decimal as a Double, so values using many of the 10 available
// digits can be affected by floating-point rounding; getOrElse guards against that.
val decimalScale = udf((n: Double) => {
  val multiplier = scala.math.pow(10, maxScale)
  (0 to maxScale).find { s =>
    val modulus = scala.math.pow(10, maxScale - s)
    (n * multiplier) % modulus == 0
  }.getOrElse(maxScale)
})
df.filter(decimalScale(col("number")) > 4).show()
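For the sample data above, the rows that should survive the filter are 3.302302, 4.230240505 and 4.34444.

If you want to avoid going through Double entirely, a minimal alternative sketch is to let the UDF receive the value as java.math.BigDecimal (which is how Spark hands DecimalType values to a Scala UDF) and read the scale after stripping trailing zeros. The name decimalScaleExact is just illustrative:

// Alternative: read the scale from the decimal itself, no floating point involved.
// stripTrailingZeros().scale() can be negative for values like 100, hence the max with 0.
val decimalScaleExact = udf((n: java.math.BigDecimal) =>
  if (n == null) None
  else Some(math.max(n.stripTrailingZeros().scale(), 0))
)

df.filter(decimalScaleExact(col("number")) > 4).show()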