Search code examples
regex, google-apps-script, google-docs

How to split 1 long paragraph to 2 shorter paragraphs? Google Document


I want paragraphs to be up to 3 sentences only.

For that, my strategy is to loop over all paragraphs, find the 3rd sentence ending in each (see note), and then add a "\r" character after it.

This is the code I have:

// Walk the paragraphs, starting from index 1, and break up any paragraph
// that holds more than 3 sentence endings. The "..." lines are elided code.
for (var i = 1; i < paragraphs.length; i++) {
  ...
  sentEnds = paragraphs[i].getText().match(/[a-zA-Z0-9_\u0590-\u05fe][.?!](\s|$)|[.?!][.?!](\s|$)/g);
  // This array is used to count sentences in Hebrew/English/digits that end with 1 or more of either ".", "?" or "!"
  ...
  if ((sentEnds != null) && (sentEnds.length > 3)) {
    // Each anchor is 10 arbitrary characters, one terminator (".", "?" or "!")
    // and the single whitespace character that follows it.
    lineBreakAnchor = paragraphs[i].getText().match(/.{10}[.?!](\s)/g);
    // Append "\r" to the third anchor (index 2).
    // NOTE(review): replaceText presumably treats its first argument as a
    // regex pattern, so the ".", "?" or "!" inside the anchor would not be
    // matched literally - confirm against the Apps Script reference.
    paragraphs[i].replaceText(lineBreakAnchor[2],lineBreakAnchor[2] + "\r");
  }
}

This works fine the first time. But if I run the code again, the text after the inserted "\r" char is not recognized as a new paragraph. Hence, more "\r" characters (new lines) are inserted each time the script runs.

How can I make the script "understand" that "\r" means new, separate paragraph?

OR

Is there another character/approach that will do the trick?

Thank you.

  • Note: I use the last 10 characters of the sentence assuming the match will be unique enough to make only 1 replacement.

Solution

  • Without modifying your own regex expression you can achieve this.

    enter image description here

    Try this approach to split the paragraphs:

    • Grab the whole content of the document and create an array of sentences.
    • Insert paragraphs with up to 3 sentences after original paragraphs.
    • Remove original paragraphs from hell.
    /**
     * Collects every sentence of the active document, in document order, and
     * hands them to inThrees() to be regrouped into shorter paragraphs.
     *
     * Takes no arguments and returns nothing; it reads the paragraphs of the
     * active document's body and passes (doc, paragraphs, sentences) on.
     */
    function sentenceMe() {
      var doc = DocumentApp.getActiveDocument();
      var paragraphs = doc.getBody().getParagraphs();
      var sentences = [];
      // Split paragraphs into sentences
      for (var i = 0; i < paragraphs.length; i++) {
        var parSentences = splitIntoSentences_(paragraphs[i].getText());
        for (var j = 0; j < parSentences.length; j++) {
          sentences.push(parSentences[j]);
        }
      }

      inThrees(doc, paragraphs, sentences);
    }

    /**
     * Splits a text into sentences (private helper of sentenceMe).
     *
     * A sentence ends at a Hebrew/English letter, digit or underscore followed
     * by ".", "?" or "!", or at two consecutive terminators, when that is
     * followed by whitespace or the end of the text.
     *
     * @param {string} text Text of one paragraph.
     * @return {string[]} Trimmed, non-empty sentences in order of appearance.
     *     Text after the last terminator (or a text without any terminator)
     *     is returned as a final entry so that no content is dropped.
     */
    function splitIntoSentences_(text) {
      // A fresh literal per call keeps the stateful lastIndex of the /g regex local.
      var sentenceEnd = /[a-zA-Z0-9_\u0590-\u05fe][.?!](\s|$)|[.?!][.?!](\s|$)/g;
      var result = [];
      var start = 0;
      var match;
      // Every match is at least two characters long, so exec() always advances.
      while ((match = sentenceEnd.exec(text)) !== null) {
        // Cut at the position of the match itself; searching for the matched
        // string with indexOf() can hit an earlier look-alike such as the
        // "e." inside "file.txt".
        var end = match.index + match[0].length;
        var sentence = text.substring(start, end).trim();
        if (sentence) {
          result.push(sentence);
        }
        start = end;
      }
      // Keep whatever follows the last terminator.
      var rest = text.substring(start).trim();
      if (rest) {
        result.push(rest);
      }
      return result;
    }
    
    /**
     * Writes the sentences back to the document as paragraphs of up to three
     * sentences each, then removes the original paragraphs.
     *
     * New paragraphs are inserted at child indexes paragraphs.length,
     * paragraphs.length + 1, ... of the document body. At least one paragraph
     * is always inserted, even when `sentences` is empty.
     *
     * @param {Object} doc Document whose body is rewritten (must offer getBody()).
     * @param {Array} paragraphs Original paragraphs; each is removed from the body.
     * @param {string[]} sentences Sentences to regroup. The array is read only
     *     and is left untouched for the caller.
     */
    function inThrees(doc, paragraphs, sentences) {
      var body = doc.getBody();
      // define offset: new paragraphs go after the original ones
      var offset = paragraphs.length;
      var groupSize = 3;
      // Create paragraphs with up to 3 sentences
      var inserted = 0;
      var start = 0;
      do {
        // slice() instead of splice() so the caller's array is not emptied.
        var parText = sentences.slice(start, start + groupSize).join(' ');
        body.insertParagraph(offset + inserted, parText.concat('\n'));
        inserted++;
        start += groupSize;
      } while (start < sentences.length);

      // Remove paragraphs from hell
      for (var i = 0; i < offset; i++) {
        body.removeChild(paragraphs[i]);
      }
    }
    

    In case you are wondering about the custom menu, here it is:

    /**
     * Adds a 'Custom Menu' to the document UI with a single entry that is
     * bound to the function named 'sentenceMe'.
     */
    function onOpen() {
      var menu = DocumentApp.getUi().createMenu('Custom Menu');
      // Continue on the value addItem() returns, exactly as the chained form does.
      var menuWithItem = menu.addItem("3's the magic number", 'sentenceMe');
      menuWithItem.addToUi();
    }
    

    References: