Search code examples
jsoncsvazureazure-data-lakeu-sql

How do I flatten JSON to CSV using USQL


I can get some data out using Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple, but I'm having trouble flattening the whole file.

Here's the file format I'm working with:

{
 "SourceUrl":"http://www.unittest.org/test.html",
 "Title":"Unit Test File",
 "Guest":"Unit Test Guest",
 "PublishDate":"2017-05-15T00:00:00",
 "TranscriptionSections":[  
    {  
     "SectionStartTime":"00:00:03",
     "Sentences":[  
        {  
           "Text":"Intro."
        },
        {  
           "Text":"Sentence one"
        },
        {  
           "Text":"Sentence two"
        }
     ]
  },
  {  
     "SectionStartTime":"00:04:46",
     "Sentences":[  
        {  
           "Text":"Sentence three"
        },
        {  
           "Text":"Sentence four"
        }
     ]
  }
 ],
 "Categories":null
}

What I'm trying to get is a row per text (5 of them) including it's 'SectionStartTime' and all top level properties ('PublishDate', 'Guest'...).

So far I can get a row per 'SectionStartTime' using this:

USE econosphere;

REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];

DECLARE @in string="adl://abc.azuredatalakestore.net/data/20170515UnitTest.json";

DECLARE @out 
string="adl://abc.azuredatalakestore.net/processed/20170515UnitTest.csv";

@ep = EXTRACT
Title string,
SourceUrl string,
Guest string,
PublishDate DateTime,
TranscriptionSections string
FROM @in
USING new Microsoft.Analytics.Samples.Formats.Json.JsonExtractor();

@epAndTransctripts =
    SELECT Title,
        SourceUrl,
        Guest,
        PublishDate,
        Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(TranscriptionSections).Values AS TranscriptionSections_arr
    FROM @ep;

@all =
    SELECT
        Title,
        SourceUrl,
        Guest,
        PublishDate,
        Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(sects)["SectionStartTime"] AS TranscriptionSectionTimes

    FROM @epAndTransctripts
    CROSS APPLY
        EXPLODE(TranscriptionSections_arr) AS t(sects);


OUTPUT @all
TO @out 
USING Outputters.Csv();

Solution

  • Here is the solution that worked for me:

    DECLARE @input string = "/input/data.json";
    
    REFERENCE ASSEMBLY JSONBlog.[Newtonsoft.Json];
    REFERENCE ASSEMBLY JSONBlog.[Microsoft.Analytics.Samples.Formats];
    
    USING Microsoft.Analytics.Samples.Formats.Json;
    
    @data =
    EXTRACT SourceUrl string,
            Title string,
            Guest string,
            PublishDate DateTime,
            TranscriptionSections string,
            Categories string
    FROM @input
    USING new JsonExtractor();
    
    @data =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           JsonFunctions.JsonTuple(transcription_section) AS ts_map
    FROM @data
     CROSS APPLY
         EXPLODE(JsonFunctions.JsonTuple(TranscriptionSections).Values) AS T(transcription_section);
    
    @data =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           ts_map["SectionStartTime"]AS SectionStartTime,
           JsonFunctions.JsonTuple(text_item) ["Text"]AS text
    FROM @data
          CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(ts_map["Sentences"]).Values) AS S(text_item);
    
    OUTPUT @data
    TO "/output/jsondata.csv"
    USING Outputters.Csv(outputHeader : true);