Search code examples
jsonhivehive-serde

I am only getting one row when using JSON SERDE to read a json file


JSON Data:

[{
  "liked": "true",
  "user_id": "101",
  "video_end_type": "3",
  "minutes_played": "3",
  "video_id": "101",
  "geo_cd": "AP",
  "channel_id": "11",
  "creator_id": "101",
  "timestamp": "07/05/2019 01:36:35",
  "disliked": "true"
},
{
"liked": true, 
"user_id": 102, 
"video_end_type": null,
 "minutes_played": 4,
 "video_id": 102, 
"geo_cd": "AP",
 "channel_id": 12,
 "creator_id": 102,
 "timestamp": "15/04/2019 17:04:00", 
"disliked": true
}
]

Output:

hive> select * from stream;
OK
NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL
true    101     3       3       101     AP      11      101     NULL    true
NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL

The output I get is only the first line.

Code:

add jar json-serde-1.3.8-jar-with-dependencies.jar;

create external table stream(
    liked string, user_id int, video_end_type int, minutes_played int, video_id int, geo_cd string, channel_id int, creator_id int, time timestamp, disliked 
string)
    ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
    LOCATION '/location';

Note: I can't remove the brackets because then the data is not in correct JSON format.


Solution

  • Something similar would work for you:

    JSON DATA:

    {"liked": "true",  "user_id": "101",  "video_end_type": "3",  "minutes_played": "3",  "video_id": "101",  "geo_cd": "AP",  "channel_id": "11",  "creator_id": "101",  "timestamp": "07/05/2019 01:36:35",  "disliked": "true"}
    {"liked": true, "user_id": 102, "video_end_type": null, "minutes_played": 4, "video_id": 102, "geo_cd": "AP", "channel_id": 12, "creator_id": 102, "timestamp": "15/04/2019 17:04:00", "disliked": true}
    

    And your create query should be as below:

    create external table stream(
    liked string,
    user_id string,
    video_end_type string,
    minutes_played string,
    video_id string,
    geo_cd string,
    channel_id string,
    creator_id string,
    `timestamp` string,
    disliked string
    )
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
    STORED AS TEXTFILE
    LOCATION '/location/';
    

    Note: If your JSON data is consistent, you can use appropriate datatypes instead of string.