Search code examples
oracleparsingclob

Oracle parsing data between tags from a clob


I have a situation where an application is storing chunks of text in a CLOB.

Each chunk of text is surrounded by a tag [SYSDATE] (see below for test CASE).

I'm looking for a query that can extract the data between the matching tags. For example, how can I get the text between [11-22-2021 14:16:19] tags for all matches. In this case 'ZZZZZZZZZZZZZZZZZZZZ'

Secondly, is there a better way to handle this situation? Perhaps maybe using XML? All input, ideas and working examples would be greatly appreciated.


ALTER SESSION SET NLS_DATE_FORMAT = 'MMDDYYYY HH24:MI:SS';

CREATE table t(
seq_num integer  GENERATED BY DEFAULT AS IDENTITY (START WITH 1) NOT NULL,
c CLOB DEFAULT ' ',
create_date DATE DEFAULT SYSDATE
);
/


insert into t (c) values (' ')
/


CREATE OR REPLACE PROCEDURE lob_append(
  p_clob IN OUT CLOB,
  p_text IN     VARCHAR2
)
AS
  l_text varchar2(32760);
  l_date_string VARCHAR2(50);
BEGIN

select '[' || TO_CHAR (SYSDATE, 'MM-DD-YYYY HH24:MI:SS') || ']'
    into l_date_string  from dual;

 
-- newline each time code is appended for clarity.
  l_text :=chr(10) || l_date_string || chr(10)
            || p_text || chr(10)
            || l_date_string||chr(10);

  dbms_lob.writeappend(p_clob, length(l_text), l_text );
END;
/
 

DECLARE
  l_clob CLOB := empty_clob();
lTime date;

BEGIN
lTime := sysdate;

  SELECT c INTO l_clob FROM t WHERE seq_num = 1 FOR UPDATE;

 lob_append(l_clob, rpad('Z',20,'Z'));

loop
        exit when sysdate = lTime + interval '5' second;
     end loop;

l_clob  := empty_clob();
SELECT c INTO l_clob FROM t WHERE seq_num = 1 FOR UPDATE;


lob_append(l_clob, rpad('Y',10,'Y'));


END;
/




SEQ_NUM    C    CREATE_DATE
1     
[11-22-2021 14:16:19]
ZZZZZZZZZZZZZZZZZZZZ
[11-22-2021 14:16:19]

[11-22-2021 14:16:24]
YYYYYYYYYY
[11-22-2021 14:16:24]
11222021 14:15:54



Solution

  • Your sample CLOB contains newlines; I assumed that is correct, and handled the newlines explicitly. (The solution below assumes each "token" does not contain newlines itself - if it does, then you can modify the regexp_substr call slightly, using the option to allow the dot metacharacter to match newlines.)

    select t.seq_num, l.ord, l.token
    from   t cross join lateral
           (
             select level as ord,
                    regexp_substr(c, '(\[\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}])'
                                      || chr(10) || '(.*?)' || chr(10) || '\1'
                                 , 1, level, null, 2) 
                    as token
             from   dual
             connect by level <= 
                     regexp_count(c, '(\[\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}])'
                                      || chr(10) || '(.*?)' || chr(10) || '\1')
           ) l
    order  by seq_num, ord   --  if needed
    ;
    
    
       SEQ_NUM        ORD TOKEN                    
    ---------- ---------- -------------------------
             1          1 ZZZZZZZZZZZZZZZZZZZZ     
             1          2 YYYYYYYYYY