Search code examples
mysqlregexmysql-json

how to merge all MySQL JSON column values with JSON_MERGE_PATCH?


Am trying to turn all scalar values in any given JSON into an empty string.

Given i have a JSON column called arbitraryjsonvalues in mysql database table called ITEMS and the arbitraryjsonvalues column have the following values.

arbitraryjsonvalues
===================
{"foo": {"big": {"cat": 16, "dog": 90}, "boo": ["babe1", "babe2", "babe3"], "small": ["paradise", "hell"]}, "goo": "heavens gate", "yeah": "rice pot on fire"}

{"foo": {"big": {"cone": 90, "cylinder": 16}, "lover": ["lover1", "lover2", "lover3"], "small": ["banner", "logo"]}, "yeah": "pizza on the table"}

{"foo": {"big": {"ape": 16, "humans": 90}, "kalo": ["kalo1", "kalo2", "kalo3"], "small": ["paradise", "hell"]}, "goo": "heavens gate", "yeah": "rice pot on fire", "freak": "bed"}

{"fcs": ["arsenal", "chelsea", "man utd", "leicester", "inter milan"], "foo": {"big": {"ape": 16, "humans": 90}, "kalo": ["kalo1", "kalo2", "kalo3"], "small": ["paradise", "hell"]}, "goo": "heavens gate", "yeah": "rice pot on fire", "freak": "bed"}

{"a": "dsd"}

{"foo": {"ll": true, "boo": ["", "", {"cc": {"dd": ["", true, "", 43], "gg": true}, "jj": "fu"}, "", 90, false, true]}, "lls": 90, "mmn": 9, "bbbd": "ad_a", "lulu": "adfdasf", "_alago": {"a": 4}}

{"phone": "+234809488485"}

{"foo": {"big": {"cat": 16, "dog": 90}, "boo": ["", "", ""], "small": ["http://koks.com", ""]}, "goo": "+2345554444 677888", "yeah": "rice pot on fire"}

{"ll": true, "boo": ["http://kij.com", "___89jjjjdhfhhf8jjd__", {"cc": {"dd": ["", true, "", 43], "gg": true}, "jj": "fu"}, "", 90, false, true]}

{"ll": true, "boo": ["http://kij.com", "___+++++89jjjjdhfhhf8jjd__", {"cc": {"dd": ["", true, "", 43], "gg": true}, "jj": "fu"}, "", 90, false, true]}

{"ll": true, "boo": ["http://kij.com", "___+++++", {"cc": {"dd": ["", true, "", 43], "gg": true}, "jj": "fu"}, "", 90, false, true]}

{"gg": ["a", {"ll": "pink"}]}

{"gg": ["a", {"ll": ["pink", "orange"]}]}

I have created the following stored procedure to enable me merge all json values in a JSON column.

MERGE_ALL_JSON Procedure

DELIMITER $$
CREATE DEFINER=`root`@`%` PROCEDURE `MERGE_ALL_JSON`(IN `$LIMIT` INT)
BEGIN

DECLARE `$LIST` LONGTEXT;
DECLARE `$REMAINING` INT;
DECLARE `$PAGE` INT;
DECLARE `$TOTAL_JSON_OBJECT_COUNT` INT;
DECLARE `$OFFSET` INT;
DECLARE `$NEXT` TEXT;
DECLARE `$NEXT_LENGTH` INT;
DECLARE `$VALUE` TEXT;
DECLARE `$COUNTER` INT;
DECLARE `$MERGED_JSON` JSON;

SET `$MERGED_JSON` = '{}';

SET `$OFFSET` = 1;

CALL 
GET_TOTAL_JSON_OBJECT_COUNT(`$TOTAL_JSON_OBJECT_COUNT`);

CALL CALCULATE_PAGE_COUNT(
`$LIMIT`,`$TOTAL_JSON_OBJECT_COUNT`,`$PAGE`
);

WHILE `$OFFSET` <= `$PAGE`
DO
  CALL GET_JSON_LIST(`$LIMIT`, `$OFFSET`, `$LIST`);

  SET `$COUNTER` = 0;

  SELECT `$LIST`;

  iterator:
  LOOP
    IF CHAR_LENGTH(TRIM(`$LIST`)) = 0 OR `$LIST` IS NULL 
    THEN
      LEAVE iterator;
    END IF;

    SET `$NEXT` = SUBSTRING_INDEX(`$LIST`,"__|__',",1);

    SET `$NEXT_LENGTH` = CHAR_LENGTH(`$NEXT`);

    SET `$COUNTER` = `$COUNTER` + 1;

    SET `$REMAINING` = (`$TOTAL_JSON_OBJECT_COUNT` - ((`$OFFSET` - 1)*`$LIMIT`));

    IF `$OFFSET` = `$PAGE` AND `$COUNTER` = `$REMAINING` THEN
      SET `$NEXT` = SUBSTRING_INDEX(`$NEXT`, "__|__'", 1);
    END IF;

    SET `$VALUE` = CONCAT(TRIM(`$NEXT`), "'");
    SET `$VALUE` = substring(`$VALUE`, 2, length(`$VALUE`) - 2);

    SET `$MERGED_JSON` = JSON_MERGE_PATCH(
      `$MERGED_JSON`,`$VALUE`
    );

    SET `$LIST` = INSERT(`$LIST`,1,`$NEXT_LENGTH` + CHAR_LENGTH("__|__',"),'');

  END LOOP;

  SET `$OFFSET` = `$OFFSET` + 1;

END WHILE;

SELECT `$MERGED_JSON`;

END$$
DELIMITER ;

GET_JSON_LIST Procedure

DELIMITER $$
CREATE DEFINER=`root`@`%` PROCEDURE `GET_JSON_LIST`(IN `$LIMIT` INT, IN `$OFFSET` INT, OUT `$LIST` LONGTEXT)
BEGIN
DECLARE `$NEWOFFSET` INT;
SET `$NEWOFFSET` = (`$OFFSET`-1)*`$LIMIT`;

SET @t = @@group_concat_max_len;
SET @@group_concat_max_len = 4294967295899999;
SET `$LIST` = (SELECT 
  GROUP_CONCAT(
    (SELECT DISTINCT 
    CONCAT(
      "'",
      arbitraryjsonvalues,
      "__|__'"
    ))
  )
  FROM (
    SELECT DISTINCT arbitraryjsonvalues 
    FROM ITEMS 
    WHERE arbitraryjsonvalues != JSON_OBJECT() AND 
JSON_TYPE(arbitraryjsonvalues) = "OBJECT"
LIMIT `$NEWOFFSET`, `$LIMIT`
) as jsonvalues);
SET @@group_concat_max_len = @t;

END$$
DELIMITER ;

GET_TOTAL_JSON_OBJECT_COUNT Procedure

DELIMITER $$
CREATE DEFINER=`root`@`%` PROCEDURE `GET_TOTAL_JSON_OBJECT_COUNT`(OUT `$TOTAL_JSON_OBJECT_COUNT` INT)
BEGIN
SELECT COUNT(*) FROM (
  SELECT DISTINCT arbitraryjsonvalues 
  FROM ITEMS
  WHERE JSON_TYPE(arbitraryjsonvalues) = "OBJECT" AND 
  arbitraryjsonvalues != JSON_OBJECT()
) as distinctcount INTO `$TOTAL_JSON_OBJECT_COUNT`;

END$$
DELIMITER ;

finally, CALCULATE_PAGE_COUNT Procedure

DELIMITER $$
CREATE DEFINER=`root`@`%` PROCEDURE `CALCULATE_PAGE_COUNT`(IN `$LIMIT` INT, IN `$TOTAL` INT, OUT `$PAGE` INT)
BEGIN
DECLARE `$REMAINDER` INT;
SET `$PAGE` = `$TOTAL` / `$LIMIT`;
SET `$REMAINDER` = `$TOTAL` - (`$PAGE`*`$LIMIT`);
IF `$REMAINDER` > 0 THEN
  SET `$PAGE` = `$PAGE` + 1;
END IF;

END$$
DELIMITER ;

However, I discovered that two JSON values with the same structure but different scalar values are distinct to one another, which means an attempt to merge all values in JSON column may fetch too many values ranging to thousands or more for processing; meanwhile, each of this JSON values will possibly not vary that much in structure, so I believe it will be good to turn all scalar values into an empty string, which is the problem I am currently trying to solve.

Afterward, I will be looking to turn series of empty strings in an array into a single empty string, for example

["", "", "", {....}, "", ""] will be equal to ["", {}]

["", "", "", {"a": ["", ""]}, "", ""] will be equal to ["", {"a": [""]}]

So if I can solve this two problems, which is to have a regex to turn all the scalar values into an empty string and a regex to turn series of empty string as described above, I will be able to figure out where I can use the regex in MYSQL statement of the above procedures to fetch distinct json values.

I believe by doing so, i will be able to fetch not too many distinct values. and my goal is to merge any given JSON column in a split of seconds. I don't care to have the scalar values, all i wanted is for my backend to automatically determine the structure and send it to the frontend to display a tree or collapsing tree where each node can have a text field to allow for a more context-specific search.

So whenever a node is searched, the frontend is aware of the path and the backend knows where to get the data...

Trying to implement some sort of reusable backend for searching JSON column of any given mysql database table column with JSON type.

enter image description here

I am using the following regex below to perform this action but it matches not only the scalar values but also the key. wish to get one that will match only the scalar values and not the keys.

true|false|\d+|("(?:(?!")((?:\\"|[^"])*))*")

Given that I have this merged JSON value for testing

{
  "a": "dsd", 
  "ll": [true, true, true], 
  "boo": [
    "http://kij.com", 
    "___89jjjjdhfhhf8jjd__", 
    {
      "cc": {
        "dd": ["", true, "", 43], 
        "gg": true
      }, 
      "jj": "f'u"
    }, 
    "", 
    90, 
    false, 
    true, 
    "http://kij.com", 
    "___+++++89jjjjdhfhhf8jjd__", 
    {
      "cc": {
        "dd": ["", true, "", 43], 
        "gg": true
      }, 
      "jj": "fu"
    }, 
    "", 
    90, 
    false, 
    true, 
    "http://kij.com", 
    "___+++++", 
    {
      "cc": {
        "dd": ["", true, "@8jkk=", 43], 
        "gg": true
      }, 
      "jj": "fu@"
    }, 
    "", 
    90, 
    false, 
    true
  ], 
  "fcs": ["arsenal", "chelsea", "man utd", "leicester", "inter milan"], 
  "foo": {
    "ll": true, 
    "big": {
      "ape": [16, 16], 
      "cat": [16, 16], 
      "dog": [90, 90], 
      "cone": 90, 
      "humans": [90, 90], 
      "cylinder": 16
    }, 
    "boo": ["babe1", "babe2", "babe3", "", "", {
      "cc": {
        "dd": ["", true, "", 43], 
        "gg": true
      }, 
      "jj": "fu"
    }, "", 90, false, true, "", "", ""], 
    "kalo": ["kalo1", "kalo2", "kalo3", "kalo1", "kalo2", "kalo3"], 
    "lover": ["lover1", "lover2", "lover3"], 
    "small": ["paradise", "hell", "banner", "logo", "paradise", "hell", "paradise", "hell", "http://koks.com", ""]
  }, 
  "goo": ["heavens gate", "heavens gate", "heavens gate", "+2345554444 677888"], 
  "lls": 90, 
  "mmn": 9, 
  "bbbd": "ad_a", 
  "lulu": "adfdasf", 
  "yeah": ["rice pot on fire", "pizza on the table", "rice pot on fire", "rice pot on fire", "rice pot on fire"], 
  "freak": ["bed", "bed"], 
  "phone": "+2347777777", 
  "_alago": {"a": "$4m-jkk+=$900"}
}

Here is a link for you to test it

Test Regex

Please i need someone to help me solve this problem


Solution

  • after digging this is what I've come up with..

    GET_JSON_LIST procedure

    DELIMITER $$
    CREATE DEFINER=`root`@`%` PROCEDURE `GET_JSON_LIST`(IN `$LIMIT` INT, IN `$OFFSET` INT, OUT `$LIST` LONGTEXT)
    BEGIN
    DECLARE `$NEWOFFSET` INT;
    SET `$NEWOFFSET` = (`$OFFSET`-1)*`$LIMIT`;
    
    SET @t = @@group_concat_max_len;
    SET @@group_concat_max_len = 4294967295899999;
    SET @LIST = '{}';
    SET @TEMP=(
      SELECT GROUP_CONCAT(
        @LIST:=JSON_MERGE_PRESERVE(
          @LIST, 
          jsonvalues
        )
      )
      FROM (
        SELECT DISTINCT REGEXP_REPLACE(
          REGEXP_REPLACE(
            REGEXP_REPLACE(
              REGEXP_REPLACE(
                REGEXP_REPLACE(
                  arbitraryjsonvalues,
                  '"(\\w+)":',
                  '$1:'
                ),
                '(true|false|\\d+|("(?:(?!")((?:\\\\"|[^"])*))*"))',
                '""'
              ),
              '(\\w*):',
              '"$1":'
            ),
            '(?:""\\s*,\\s*){2,}',
            '"",'
          ),
          '(?:"",\\s*"")+]',
          '""]'
        ) as jsonvalues
        FROM ITEMS 
        WHERE JSON_TYPE(arbitraryjsonvalues) = "OBJECT" AND arbitraryjsonvalues != JSON_OBJECT() 
        LIMIT `$NEWOFFSET`, `$LIMIT`
      ) AS arbitval2
    );
    SET @@group_concat_max_len = @t;
    
    SET `$LIST` = REGEXP_REPLACE(
      REGEXP_REPLACE(
        REGEXP_REPLACE(
          REGEXP_REPLACE(
            REGEXP_REPLACE(
              @LIST,
              '"(\\w+)":',
              '$1:'
            ),
            '(true|false|\\d+|("(?:(?!")((?:\\\\"|[^"])*))*"))',
            '""'
          ),
          '(\\w*):',
          '"$1":'
        ),
        '(?:""\\s*,\\s*){2,}',
        '"",'
      ),
      '(?:"",\\s*"")+]',
      '""]'
    );
    
    END$$
    DELIMITER ;
    

    GET_TOTAL_JSON_OBJECT_COUNT Procedure

    DELIMITER $$
    CREATE DEFINER=`root`@`%` PROCEDURE `GET_TOTAL_JSON_OBJECT_COUNT`(OUT `$TOTAL_JSON_OBJECT_COUNT` INT)
    BEGIN
    SELECT COUNT(*) 
    FROM (
        SELECT DISTINCT REGEXP_REPLACE(
          REGEXP_REPLACE(
            REGEXP_REPLACE(
              REGEXP_REPLACE(
                REGEXP_REPLACE(
                  arbitraryjsonvalues,
                  '"(\\w+)":',
                  '$1:'
                ),
                '(true|false|\\d+|("(?:(?!")((?:\\\\"|[^"])*))*"))',
                '""'
              ),
              '(\\w*):',
              '"$1":'
            ),
            '(?:""\\s*,\\s*){2,}',
            '"",'
          ),
          '(?:"",\\s*"")+]',
          '""]'
        ) AS total 
        FROM ITEMS
        WHERE arbitraryjsonvalues != JSON_OBJECT() AND arbitraryjsonvalues != JSON_ARRAY()
    ) as distinctcount INTO `$TOTAL_JSON_OBJECT_COUNT`;
    
    END$$
    DELIMITER ;
    

    MERGE_ALL_JSON Procedure

    DELIMITER $$
    CREATE DEFINER=`root`@`%` PROCEDURE `MERGE_ALL_JSON`(IN `$LIMIT` INT)
    BEGIN
    
    DECLARE `$LIST` LONGTEXT;
    DECLARE `$PAGE` INT;
    DECLARE `$TOTAL_JSON_OBJECT_COUNT` INT;
    DECLARE `$OFFSET` INT;
    DECLARE `$NEXT` TEXT;
    DECLARE `$NEXT_LENGTH` INT;
    DECLARE `$VALUE` TEXT;
    DECLARE `$MERGED_JSON` JSON;
    
    SET `$MERGED_JSON` = '{}';
    
    SET `$OFFSET` = 1;
    
    CALL 
    GET_TOTAL_JSON_OBJECT_COUNT(`$TOTAL_JSON_OBJECT_COUNT`);
    
    CALL CALCULATE_PAGE_COUNT(`$LIMIT`,`$TOTAL_JSON_OBJECT_COUNT`,`$PAGE`);
    
    WHILE `$OFFSET`<=`$PAGE`
    DO
      CALL GET_JSON_LIST(`$LIMIT`, `$OFFSET`, `$LIST`);
      SET `$MERGED_JSON` = JSON_MERGE_PRESERVE(`$MERGED_JSON`,`$LIST`);
      SET `$OFFSET`=(`$OFFSET`+1);
    END WHILE;
    
    SELECT `$MERGED_JSON`;
    
    END$$
    DELIMITER ;
    

    CALCULATE_PAGE_COUNT Procedure

    DELIMITER $$
    CREATE DEFINER=`root`@`%` PROCEDURE `CALCULATE_PAGE_COUNT`(IN `$LIMIT` INT, IN `$TOTAL` INT, OUT `$PAGE` INT)
    BEGIN
    DECLARE `$REMAINDER` INT;
    SET `$PAGE` = `$TOTAL` / `$LIMIT`;
    SET `$REMAINDER` = `$TOTAL` - (`$PAGE`*`$LIMIT`);
    IF `$REMAINDER` > 0 THEN
      SET `$PAGE` = `$PAGE` + 1;
    END IF;
    
    END$$
    DELIMITER ;
    

    Running MERGE_ALL_JSON Procedure produces the result below, and this solves my problem somehow, but I am looking for a way to merge all objects inside an array into one with REGEX_REPLACE and JSON_MERGE_PRESERVE.

    {
      "a": "", 
      "gg": ["", {"ll": ""}, "", {"ll": [""]}],
      "ll": "", "boo": ["", {"cc": {"dd": [""], "gg": ""}, "jj": ""}, ""], 
      "fcs": [""], 
      "foo": [
        {
          "ll": "", 
          "big": {
            "ape": [""], 
            "cat": [""], 
            "dog": [""], 
            "cone": "", 
            "humans": [""], 
            "cylinder": ""
          }, 
          "boo": ["", {"cc": {"dd": [""], "gg": ""}, "jj": ""}, ""],
          "kalo": [""], 
          "lover": [""], 
          "small": [""]
        }, 
        "", 
        {"bag": ""}
      ], 
      "goo": [""], 
      "lls": "", 
      "mmn": "", 
      "bbbd": "", 
      "lulu": "", 
      "yeah": ["", {"jj": ""}, "", {"jj": ""}, ""],
      "freak": [""], 
      "light": "", 
      "phone": "", 
      "_alago": {"a": ""}
    }
    

    Let me drop this answer here, it might help someone.