Search code examples
sql, oracle-database, replace, pattern-matching, regexp-replace

How to identify the words exactly to replace based on multiple conditions using REGEXP_REPLACE in Oracle SQL?


The table consists of the columns id, word and sentence. I am trying to replace the words in the sentences with a link (consisting of the word and its id) if the words exist in the word column. The code below replaces them just fine, but I need help figuring out how to identify the exact word to replace when the ids are the same but the text is different.

For example, id = 2 has two rows, with the words testing and test.

Right now, it replaces the first sentence as below. Both testing and test are replaced with http://localhost/2/<u>testing</u>
automtestingation http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> is popular kind of http://localhost/2/<u>testing</u>

I am expecting it to be
automtestingation http://localhost/2/<u>testing</u> http://localhost/2/<u>test</u> is popular kind of http://localhost/2/<u>testing</u>
-- Sample table: every row supplies a word to turn into a link and a sentence to rewrite.
CREATE TABLE temp (
    id       NUMBER          -- number embedded in the generated link
  , word     VARCHAR2(1000)  -- word or phrase to search for in the sentences
  , sentence VARCHAR2(2000)  -- text in which matching words get replaced
);

-- Sample data. Note that id 2 is shared by two different words ('testing' and 'test').
-- The column list is explicit so the statement does not depend on the table's column
-- order, and the statement is terminated so it can be run as part of a script.
INSERT INTO temp (id, word, sentence)
SELECT 1, 'automation testing', 'automtestingation testing test is popular kind of testing' FROM DUAL UNION ALL
SELECT 2, 'testing', 'manual testing' FROM DUAL UNION ALL
SELECT 2, 'test', 'test' FROM DUAL UNION ALL
SELECT 3, 'manual testing', 'this is an old method of testing' FROM DUAL;

-- Rewrites every sentence in temp so that each whole-word occurrence of a word from
-- temp.word becomes 'http://localhost/<id>/<u><word></u>'. The work is done in two
-- phases: phase 1 swaps words for placeholders, phase 2 swaps placeholders for links.
--
-- words: one row per row of temp, with the search/replace strings for both phases.
--   search1  : regex for the word (regex metacharacters backslash-escaped) delimited by
--              start/end of string or a non-word character
--   replace1 : placeholder '{<id>}' with the two captured delimiters put back around it
--   search2  : the literal placeholder '{<id>}'
--   replace2 : the final link text
-- The placeholder is built from id, so two words sharing an id (such as 'testing' and
-- 'test' under id 2 in the sample data) get the same placeholder and therefore end up
-- with the same link text.
with words(id, word, word_length, search1, replace1, search2, replace2) as (
  select id, word, length(word),
  '(^|\W)' || REGEXP_REPLACE(word, '([][)(}{|^$\.*+?])', '\\\1') || '($|\W)',
  '\1{'|| id ||'}\2',
  '{'|| id ||'}',
  'http://localhost/' || id || '/<u>' || word || '</u>'
  FROM temp
)
-- joined_data: pairs each sentence with every word found in it; rn ranks the words of a
-- sentence from longest to shortest.
-- NOTE(review): the instr() test upper-cases both sides, regexp_like() is called without
-- a match parameter, and the later regexp_replace() calls pass 'i' - confirm the intended
-- case sensitivity.
-- NOTE(review): rn is ordered by word_length only, so the order of equal-length words is
-- not fixed by this query.
, joined_data as (
  select w.search1, w.replace1, w.search2, w.replace2,
    s.rowid s_rid, s.sentence,
    row_number() over(partition by s.rowid order by word_length desc) rn
  from words w
  join temp s
  on instr(UPPER(s.sentence), UPPER(w.word)) > 0
  and regexp_like(s.sentence, w.search1)
)
-- unpivoted_data: splits each joined row into two steps (phase 1 and phase 2) and
-- renumbers them per sentence so that all phase 1 steps precede all phase 2 steps.
-- is_last is 1 on the final step of each sentence.
, unpivoted_data as (
  select S_RID, SENTENCE, PHASE, SEARCH_STRING, REPLACE_STRING,
    row_number() over(partition by s_rid order by phase, rn) rn,
    case when row_number() over(partition by s_rid order by phase, rn)
      = count(*) over(partition by s_rid)
      then 1
      else 0
    end is_last
  from joined_data
  unpivot(
    (search_string, replace_string) 
    for phase in ( (search1, replace1) as 1, (search2, replace2) as 2 ))
)
-- replaced_data: recursive CTE that applies the steps of each sentence one after another.
-- The anchor member applies step 1 (a phase 1 step, since steps are ordered by phase
-- first) with a case-insensitive regex replace.
, replaced_data(S_RID, RN, is_last, SENTENCE) as (
  select S_RID, RN, is_last,
    regexp_replace(SENTENCE, search_string, replace_string,1,0,'i')
  from unpivoted_data
  where rn = 1
  union all
  -- Recursive member: apply step rn + 1 to the result of step rn. Phase 1 steps use a
  -- case-insensitive regex replace; phase 2 steps use a plain literal replace.
  select n.S_RID, n.RN, n.is_last,
    case when n.phase = 1
      then regexp_replace(o.SENTENCE, n.search_string, n.replace_string,1,0,'i')
      else replace(o.SENTENCE, n.search_string, n.replace_string)
    end
  from unpivoted_data n
  join replaced_data o
    on o.s_rid = n.s_rid and n.rn = o.rn + 1  
)
-- Keep only the fully processed version of each sentence.
select s_rid, sentence from replaced_data
where is_last = 1
order by s_rid;

Solution

  • This is the same as the previous answer, with a couple of small changes: each matched word is first replaced with a unique identifier (already generated using the ROW_NUMBER analytic function), and that identifier is then replaced with the URI, which is built from the id and the word.

    Merge:

    -- Updates temp.sentence in place: every whole-word occurrence of a word from temp.word
    -- is replaced by 'http://localhost/<id>/<u><word></u>'. Words are first swapped for
    -- placeholders '${<rn>}' and the placeholders are then swapped for the links.
    MERGE INTO temp dst
    USING (
      -- ordered_words: numbers the words by ascending length (ties broken by word
      -- descending) and escapes regex metacharacters in each word with a backslash.
      WITH ordered_words ( rn, id, word, regex_safe_word ) AS (
        SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
               id,
               word,
               REGEXP_REPLACE( word, '([][)(}{|^$\.*+?])', '\\\1' )
        FROM   temp
      ),
      -- sentences_with_ids: recursive CTE. Each sentence starts with rn = word count + 1;
      -- every step applies the word numbered s.rn - 1 and decrements rn, so words are
      -- applied from the highest rn (longest word) down to rn = 1.
      sentences_with_ids ( rid, sentence, rn ) AS (
        SELECT ROWID,
               sentence,
               ( SELECT COUNT(*) + 1 FROM ordered_words )
        FROM   temp
      UNION ALL
        -- The same REGEXP_REPLACE is applied twice. NOTE(review): presumably because a
        -- delimiter consumed by one match cannot also begin the next, so back-to-back
        -- occurrences separated by a single delimiter need a second pass - confirm.
        SELECT s.rid,
               REGEXP_REPLACE(
                 REGEXP_REPLACE(
                   s.sentence,
                   '(^|\W)' || w.regex_safe_word || '($|\W)',
                   '\1${'|| w.rn ||'}\2'                       -- Changed here: placeholder keyed on w.rn, unique per row of ordered_words
                  ),
                 '(^|\W)' || w.regex_safe_word || '($|\W)',
                 '\1${' || w.rn || '}\2'                       -- Changed here: same placeholder for the second pass
               ),
               s.rn - 1
        FROM   sentences_with_ids s
               INNER JOIN ordered_words w
               ON ( s.rn - 1 = w.rn ) 
      ),
      -- sentences_with_words: second recursive CTE. Starts from the sentences on which
      -- every word has been applied (rn = 1), resets the counter, and replaces each
      -- placeholder with the link built from that word's id and text.
      sentences_with_words ( rid, sentence, rn ) AS (
        SELECT rid,
               sentence,
               ( SELECT COUNT(*) + 1 FROM ordered_words )
        FROM   sentences_with_ids
        WHERE  rn = 1
      UNION ALL
        SELECT s.rid,
               REPLACE(
                 s.sentence,
                 '${' || w.rn || '}',                       -- Changed here: look up the rn-based placeholder
                 'http://localhost/' || w.id || '/<u>' || w.word || '</u>'
               ),
               s.rn - 1
        FROM   sentences_with_words s
               INNER JOIN ordered_words w
               ON ( s.rn - 1 = w.rn ) 
      )
      -- rn = 1 marks the rows on which every placeholder step has been applied.
      SELECT rid, sentence
      FROM   sentences_with_words
      WHERE  rn = 1
    ) src
    -- Match each rewritten sentence back to its source row by ROWID.
    ON ( dst.ROWID = src.RID )
    WHEN MATCHED THEN
      UPDATE
      SET    sentence = src.sentence;
    

    Output:

    -- Show the table contents; columns are listed explicitly instead of using *.
    SELECT id, word, sentence FROM temp;
    
    ID | WORD                    | SENTENCE                                                                                                                                                                                                  
    -: | :---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
     1 | automation testing      | automtestingation http://localhost/2/<u>testing</u> http://localhost/2/<u>test</u> is popular kind of http://localhost/2/<u>testing</u>                                                                   
     2 | testing                 | http://localhost/3/<u>manual testing</u>                                                                                                                                                                  
     2 | test                    | http://localhost/2/<u>test</u>                                                                                                                                                                            
     3 | manual testing          | this is an old method of http://localhost/2/<u>testing</u>                                                                                                                                                
     4 | punctuation             | http://localhost/1/<u>automation testing</u>,http://localhost/3/<u>manual testing</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>testing</u>-http://localhost/2/<u>testing</u>
     5 | B-number analysis       | http://localhost/6/<u>B-number analysis table</u>                                                                                                                                                         
     6 | B-number analysis table | http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>                                                                                                                             
     7 | Not Matched             | http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u>                                                                                                     
     8 | ^[($                    | http://localhost/2/<u>testing</u> characters http://localhost/8/<u>^[($</u> that need escaping in a regular expression                                                                                    
    

    db<>fiddle here


    For your code, use the same technique in the first sub-query factoring clause:

    -- Fragment: a replacement for the first sub-query factoring clause only, not a
    -- complete statement. The placeholder in replace1/search2 is built from a
    -- ROW_NUMBER() value (longest word first, ties broken by word ascending) instead of
    -- id, while the link text in replace2 still uses id and word.
    -- NOTE(review): the two ROW_NUMBER() calls must number the rows identically for the
    -- placeholders to line up; rows with the same word are not ordered by this
    -- ORDER BY - confirm such duplicates cannot occur or add a tiebreaker.
    with words(id, word, word_length, search1, replace1, search2, replace2) as (
      select id, word, length(word),
      '(^|\W)' || REGEXP_REPLACE(word, '([][)(}{|^$\.*+?])', '\\\1') || '($|\W)',
      '\1{'|| ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) DESC, word ASC ) ||'}\2',
      '{'|| ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) DESC, word ASC ) ||'}',
      'http://localhost/' || id || '/<u>' || word || '</u>'
      FROM temp
    )