Search code examples
marklogic, marklogic-9

Issue with NEAR query in Marklogic


I have a strange issue with cts:near-query.

(: Sample document shared by the query snippets in this post: a <titles>
   wrapper holding six <title> elements. Only the last one (type="item")
   contains the text "Phase II study".
   This is the opening `let` of a FLWOR expression; each query snippet
   that follows supplies its own `let $q… := …` and `return` clause. :)
let $xml :=
  <titles count="6">
    <title type="source">ASIA-PACIFIC JOURNAL OF CLINICAL ONCOLOGY</title>
    <title type="source_abbrev">ASIA-PAC J CLIN ONCO</title>
    <title type="abbrev_iso">Asia-Pac. J. Clin. Oncol.</title>
    <title type="abbrev_11">ASIA-PAC J</title>
    <title type="abbrev_29">ASIA-PAC J CLIN ONCOL</title>
    <title type="item">Phase II study of cetuximab with irinotecan for KRAS wild-type colorectal cancer in Japanese patients</title>
   </titles>

Initially I ran this query:

(: $q1: a word-query over the "phase N" phrases (arabic and roman numerals),
   restricted to <title> elements via cts:element-query. The options make
   the match case-insensitive and enable wildcard handling. :)
let $q1 := 
      cts:element-query((xs:QName("title")),
          cts:word-query(("phase 0","phase 1","phase 2","phase 3","phase 4","phase I","phase ii","phase iii","phase iv"),
          ("case-insensitive", "wildcarded"))
        )
return
  (: Wrap every text span of $xml that matches $q1 in a <b> element. :)
  cts:highlight($xml,$q1, <b>{$cts:text}</b>)

I got the result, which is correct: [screenshot]

Next I ran this query, and the result was also correct:

(: $q2: a word-query over the terms "trial*", "study" and "studies*",
   restricted to <title> elements, with the same case-insensitive and
   wildcarded options as the phase query. :)
let $q2 := 
      cts:element-query((xs:QName("title")),
          cts:word-query(("trial*", "study", "studies*"),
          ("case-insensitive", "wildcarded"))
        )

return
  (: Wrap every text span of $xml that matches $q2 in a <b> element. :)
  cts:highlight($xml,$q2, <b>{$cts:text}</b>)

[screenshot] Then I ran the following query with NEAR/0 (a near-query distance of 0), and I did not get any highlighted result:

(: $q3 (distance 0): combines the phase query and the trial/study query in a
   cts:near-query. The second argument (0) is the maximum word distance
   allowed between the two matches, and 'ordered' requires the phase match
   to come before the trial/study match.
   Per the post, this variant produced no highlighted match. :)
let $q3 :=
    cts:near-query((
              cts:element-query((xs:QName("title")),
                cts:word-query(("phase 0","phase 1","phase 2","phase 3","phase 4","phase I","phase ii","phase iii","phase iv"),
                  ("case-insensitive", "wildcarded")))
          ,
             cts:element-query((xs:QName("title")),
                cts:word-query(("trial*", "study", "studies*"),
                  ("case-insensitive", "wildcarded")))
         ),
         0,
         ('ordered'))

return
  (: Wrap every text span of $xml that matches $q3 in a <b> element. :)
  cts:highlight($xml,$q3, <b>{$cts:text}</b>)

enter image description here

But then I ran the query with NEAR/1 and I got the result. Why is that? Phrase 1 is immediately followed by phrase 2, so the NEAR distance should be 0, right?

(: $q3 (distance 1): identical to the distance-0 variant except that the
   maximum word distance passed to cts:near-query is 1. 'ordered' still
   requires the phase match to come before the trial/study match.
   Per the post, this variant did produce a highlighted match. :)
let $q3 :=
    cts:near-query((
              cts:element-query((xs:QName("title")),
                cts:word-query(("phase 0","phase 1","phase 2","phase 3","phase 4","phase I","phase ii","phase iii","phase iv"),
                  ("case-insensitive", "wildcarded")))
          ,
             cts:element-query((xs:QName("title")),
                cts:word-query(("trial*", "study", "studies*"),
                  ("case-insensitive", "wildcarded")))
         ),
         1,
         ('ordered'))

return
  (: Wrap every text span of $xml that matches $q3 in a <b> element. :)
  cts:highlight($xml,$q3, <b>{$cts:text}</b>)

enter image description here


Solution

  • I believe MarkLogic measures word distance starting from the anchor word at position 0, with the next token at a distance of 1, and so on. A near-query distance of 0 therefore matches only when the two matches are the same text or overlap. In order to query neighboring words you need to use a near-query distance of 1. The queries in your examples are performing correctly.

    To borrow from the MarkLogic cts:near-query documentation:

    xquery version "1.0-ml";
    (: Three semicolon-separated statements, each testing an ordered
       cts:near-query against the same sentence with a different pair of
       words and a different maximum distance. The result comments are the
       outcomes stated by the answer's author. :)
    let $x := <p>Now is the winter of our discontent</p>
    return
    cts:contains($x, cts:near-query(
                        ("now", "the"),
                        2, "ordered"));
    
    (: => returns true: "the" is 2 words from "now", within the distance of 2 :)
    
    let $x := <p>Now is the winter of our discontent</p>
    return
    cts:contains($x, cts:near-query(
                        ("now", "is"),
                        1, "ordered"));
    (: => returns true: "is" is 1 word from "now", within the distance of 1 :)
    
    let $x := <p>Now is the winter of our discontent</p>
    return
    cts:contains($x, cts:near-query(
                        ("now", "is"),
                        0, "ordered"));
    
    (: => returns false: "is" is 1 word from "now", which exceeds the distance of 0 :)