Search code examples
jsonscalaapache-sparkuser-defined-functions

SPARK: Parse JSON and get the Parent Column name when child column name is set to true


Given a below data to pass:

{
    
    "properties": {
        "student_id": {
            "type": "string",
            "unique_id": true
        },
        "status": {
            "type": "boolean"
        },
        "name": {
            "type": "string"
        },
        "phone_number": {
            "type": "string",
            "ignore": true
        },
        "e_mail": {
            "type": "string",
            "ignore": true
        },
        "address": {
            "type": "string"
        }
    },
    "subjects": [
        "science",
        "english"
    ]
}

I am specifically looking for child columns "unique_id" and "ignore" and want to get the below output:

unique_id     | ignore                 |
----------------------------------------
[student_id]  | [phone_number, e_mail] |

There parent column names are always changing. The child columns like "unique_id" and "ignore" can be present for any of the parent columns. I would like to know the parent column name for which the "unique_id" and "ignore" is true.


Solution

  • I use json4s to parse, and something like this works.

    imports & raw JSON

    import org.json4s.DefaultFormats
    import org.json4s.{ JValue, JObject, JString, JBool, JField }
    import org.json4s.jackson.JsonMethods.parse
    
    val jsonString = "{\"properties\":{\"student_id\":{\"type\":\"string\",\"unique_id\":true},\"status\":{\"type\":\"boolean\"},\"name\":{\"type\":\"string\"},\"phone_number\":{\"type\":\"string\",\"ignore\":true},\"e_mail\":{\"type\":\"string\",\"ignore\":true},\"address\":{\"type\":\"string\"}},\"subjects\":[\"science\",\"english\"]}"
    

    function to extract properties where key == true

    /** @param key where (key, true) exists for given properties, extract the property names
      * @param json raw String of JSON data
      */
    def extractProperties(key: String, json: String): List[String] = {
      // parse String => JValue
      implicit val formats = DefaultFormats
      val jval: JValue = parse(json)
    
      // extract "properties" from JSON
      val properties: List[JField] = jval match {
        case JObject(obj) => obj.filter(_._1 == "properties").head._2 match {
          case JObject(obj2) => obj2
          case _ => throw new IllegalArgumentException
        }
        case _ => throw new IllegalArgumentException
      }
    
     // extract property names where key == true
     properties
       .filter(property => property._2.asInstanceOf[JObject].obj contains (key,JBool(true)))
       .map(_._1)
    }
    

    test it out

    val unique: List[String] = extractProperties("unique_id", jsonString)
    // unique: List[String] = List(student_id)
    
    val ignored: List[String] = extractProperties("ignore", jsonString)
    // ignored: List[String] = List(phone_number, e_mail)