Search code examples
apache-spark, schema, parquet

Replacing invalid characters in spark nested attribute names


There are a few posts here about handling invalid characters in top-level attribute names, but none that cover attributes nested several levels deep.

I encountered this error with my multi-nested schema:

org.apache.spark.sql.AnalysisException: Attribute name "Foo Bar" contains invalid character(s) among " ,;{}()\n\t=". Please use alias to rename it.;

Solution

  • Here is my solution in Scala:

    /**
     * Regex character class matching any single character listed as invalid in
     * Spark's attribute-name error: space, comma, semicolon, braces, parentheses,
     * newline, tab and the equals sign.
     */
    private val INVALID_ATTRIBUTE_CHARS: String = "[ ,;{}()\n\t=]"
    
    /**
     * Builds a new schema from `structType` by passing every top-level field
     * through `cleanStructFld`.
     *
     * @param structType the schema whose fields should be processed
     * @return a new `StructType` made of the processed fields, in the same order
     */
    def replaceBadAttriName(structType: StructType): StructType = {
      val cleanedFields = structType.fields.map(field => cleanStructFld(field))
      StructType(cleanedFields)
    }
    
    /**
     * Returns a copy of `fld` whose name, and the names of all fields nested
     * inside its data type, have every character matched by
     * `INVALID_ATTRIBUTE_CHARS` replaced with an underscore.
     *
     * The field's own name is always sanitized, including when the field is itself
     * a struct. Nested structs are reached through struct fields, array element
     * types, and map key/value types. Nullability and metadata are preserved.
     *
     * NOTE: two distinct names can become identical after replacement
     * (e.g. "a b" and "a_b"); this method does not de-duplicate them.
     *
     * @param fld the field to sanitize
     * @return a field with the same shape as `fld` but with sanitized names
     */
    private def cleanStructFld(fld: StructField): StructField = {
      // Local import keeps this block self-contained; these types live alongside
      // StructType/StructField in Spark's SQL types package.
      import org.apache.spark.sql.types.{ArrayType, DataType, MapType}

      // Rebuilds a data type, sanitizing the field names of every struct reachable
      // through it; types that cannot contain fields are returned unchanged.
      def cleanDataType(dataType: DataType): DataType = dataType match {
        case struct: StructType =>
          StructType(struct.fields.map(cleanStructFld))
        case ArrayType(elementType, containsNull) =>
          ArrayType(cleanDataType(elementType), containsNull)
        case MapType(keyType, valueType, valueContainsNull) =>
          MapType(cleanDataType(keyType), cleanDataType(valueType), valueContainsNull)
        case other =>
          other
      }

      val newName = fld.name.replaceAll(INVALID_ATTRIBUTE_CHARS, "_")
      fld.copy(name = newName, dataType = cleanDataType(fld.dataType))
    }