Search code examples
apache-spark · hive · bigdata · orc

What's the easiest way to get a table DDL from an ORC file?


With spark I can do for example:

spark.read.orc("/path/to/file").printSchema

But I would like to get something like the output of `SHOW CREATE TABLE` in Hive. Is that possible?


Solution

  • This should handle most cases (tailor it to your specific case if needed):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{ArrayType, BooleanType, DoubleType, IntegerType, LongType, StringType, StructField, StructType}
    
    /** Reads an ORC file's schema with Spark and prints an equivalent
      * Hive `CREATE EXTERNAL TABLE` DDL statement to stdout.
      *
      * Fixes over the naive approach: column lines are joined with
      * `mkString(",\n")` so the last column does NOT carry a trailing
      * comma (which would make the DDL invalid in Hive).
      */
    object Main {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").getOrCreate()
        val schema = spark.read.orc("/path/to/orc/orc_file.orc").schema
        // Render each column first, then join — avoids a trailing comma
        // after the final column.
        val columns = schema.fields.map { field =>
          s"  ${field.name.toLowerCase} ${getType(field.dataType)}"
        }
        println("CREATE EXTERNAL TABLE name (")
        println(columns.mkString(",\n"))
        println(")")
      }

      /** Maps a Spark SQL type node to its Hive DDL type string.
        *
        * Accepts `Any` (kept for backward compatibility) because it is
        * called recursively with both `DataType` and `StructField` nodes.
        * Matches `StructType` explicitly instead of the type-erased
        * `Seq[StructField]` pattern (which compiles with an unchecked
        * warning and only worked because Spark 2.x `StructType` happens
        * to extend `Seq[StructField]`).
        *
        * @throws IllegalArgumentException for types not handled here
        *         (e.g. decimal, map, timestamp) — extend as needed.
        */
      def getType(typ: Any): String = typ match {
        case StringType  => "string"
        case IntegerType => "int"
        case DoubleType  => "double"
        case LongType    => "bigint"
        case BooleanType => "boolean"
        case ArrayType(elementType, _) =>
          s"array<${getType(elementType)}>"
        case StructField(name, dataType, _, _) =>
          s"${name.toLowerCase}:${getType(dataType)}"
        case StructType(fields) =>
          s"struct<${fields.map(getType).mkString(",")}>"
        case other =>
          // Fail loudly with context instead of an opaque MatchError.
          throw new IllegalArgumentException(s"Unsupported type: $other")
      }
    }