Search code examples
Tags: regex, google-apps-script, google-docs, re2

Find and change a Cyrillic word with word boundaries in Google Apps Script


The problem is that \b doesn't work with Russian and Ukrainian letters.

Here I try to find all matches of the word 'февраля' in the text, change them to tempword, then make each one a link and change it back to 'февраля'.

/**
 * Links occurrences of `word` in the document body to `siteurl`.
 *
 * Each hit is first swapped for a placeholder token so the same search does
 * not find it again, the placeholder is linked, and at the end every
 * placeholder is turned back into `word`.
 *
 * NOTE: the pattern relies on \b, which is the part that misbehaves for
 * Cyrillic words.
 *
 * @param {string} word - Word to search for (inserted into the pattern as-is).
 * @param {string} siteurl - Link target applied to each hit.
 */
function addLinks(word, siteurl) {
  var placeholder = 'ASDFDSGDDKDSL2';
  var pattern = "\\b" + word + "\\b";
  var body = DocumentApp.openById('doc\'s ID').getBody();

  var hit = body.findText(pattern);
  console.log(hit);
  for (; hit; hit = body.findText(pattern)) {
    var from = hit.getStartOffset();
    var to = from + placeholder.length - 1; // inclusive end of the placeholder
    var textElement = hit.getElement().asText();
    textElement.replaceText(pattern, placeholder);
    textElement.setLinkUrl(from, to, siteurl);
  }

  // Restore the original word everywhere the placeholder was written.
  body.replaceText(placeholder, word);
}

// Cyrillic word: this is the failing case, because \b does not treat Cyrillic letters as word characters.
addLinks('февраля', 'example.com');

It works as it should if I change the Russian word 'февраля' to the English word 'february':

// ASCII word: the same call works as intended.
addLinks('february', 'example.com');

I need a regular expression because if I simply search for 'февраля', the script also matches other words such as 'февралям', 'февралями', etc. So the question is how to make this work. The error "Exception: Invalid regular expression pattern" occurs with this code:

// Rejected as an invalid pattern: it uses lookbehind (?<=...) and lookahead (?=...),
// which the RE2 engine (presumably what findText() uses here) does not support.
var searchText = "(?<=[\\s,.:;\"']|^)"+word+"(?=[\\s,.:;\"']|$)";

or this:

// Also rejected: the lookahead (?=...) is presumably unsupported by the regex engine.
// Note too that "\s" inside a JS string literal is just "s"; it would have to be written "\\s".
var searchText = "(^|\s)"+word+"(?=\s|$)";

and some other.


Solution

  • Here is my solution:

    // Entry point: runs addLinks for the word 'февраля' with the link target 'example.com'.
    function main() {
      var target = 'февраля';
      var link = 'example.com';
      addLinks(target, link);
    }
    
    /**
     * Links every whole-word occurrence of `word` in the active document to `url`.
     *
     * An occurrence counts as a whole word when it is neither directly preceded
     * nor directly followed by a letter (of any script, so Russian, Ukrainian
     * and Latin alike) or a combining mark. 'февраля' is therefore linked while
     * 'февралям' and 'февралями' are left untouched.
     *
     * The search runs as a JavaScript regular expression over each paragraph's
     * text rather than through findText(), because the boundary check needs
     * lookarounds: they do not consume the neighbouring character, so
     * occurrences separated by a single character ('февраля февраля') are all
     * found, as are occurrences at the very start or end of a paragraph.
     *
     * @param {string} word - Literal word to link; regex metacharacters in it
     *     are escaped and matched as plain text.
     * @param {string} url - Link target passed to setLinkUrl().
     */
    function addLinks(word, url) {
      // An empty word would "match" at every boundary and yield invalid ranges.
      if (!word) return;

      const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const notLetter = '[\\p{L}\\p{M}]'; // letters and combining marks of any script
      const regex = new RegExp(
        '(?<!' + notLetter + ')' + escapedWord + '(?!' + notLetter + ')',
        'gu'
      );

      const doc = DocumentApp.getActiveDocument();
      for (const pgf of doc.getParagraphs()) {
        // Read and write through the same Text view so the offsets agree.
        const text = pgf.editAsText();
        const content = text.getText();

        regex.lastIndex = 0; // the /g regex is stateful; restart for each paragraph
        let match;
        while ((match = regex.exec(content)) !== null) {
          const start = match.index;
          const end = start + match[0].length - 1; // setLinkUrl takes an inclusive end
          text.setLinkUrl(start, end, url);
        }
      }
    }
    

    Test output:

    enter image description here

    It correctly handles the word placed at the start or at the end of a line (or both), and it does not produce the strange error message.