Search code examples
Tags: javascript, node.js, reactjs, higher-order-functions

Extract Clickable words from String and Include Punctuation Marks


I have a sentence and also an array of clickable words from the sentence. The array does not include the punctuation marks.

Here is the sentence:

Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!”

Here is the structure of the clickable words. It's an array in which each entry holds the indexes where a word begins and ends within the sentence. The array does not contain the sentence's punctuation marks.

The punctuation marks are not clickable.

"tokens": [
            {
              "position": [
                0,
                4
              ],
              "value": "into"
            },
            {
              "position": [
                5,
                8
              ],
              "value": "the"
            },
            {
              "position": [
                9,
                14
              ],
              "value": "trunk"
            },
            {
              "position": [
                15,
                17
              ],
              "value": "we"
            },
            {
              "position": [
                18,
                21
              ],
              "value": "put"
            },
            {
              "position": [
                22,
                25
              ],
              "value": "two"
            },
            {
              "position": [
                26,
                31
              ],
              "value": "poles"
            },
            {
              "position": [
                32,
                35
              ],
              "value": "and"
            },
            {
              "position": [
                36,
                39
              ],
              "value": "the"
            },
            {
              "position": [
                40,
                43
              ],
              "value": "can"
            },
            {
              "position": [
                44,
                46
              ],
              "value": "of"
            },
            {
              "position": [
                47,
                52
              ],
              "value": "worms"
            },
            {
              "position": [
                53,
                56
              ],
              "value": "and"
            },
            {
              "position": [
                57,
                58
              ],
              "value": "a"
            },
            {
              "position": [
                59,
                63
              ],
              "value": "sack"
            },
            {
              "position": [
                64,
                66
              ],
              "value": "of"
            },
            {
              "position": [
                67,
                77
              ],
              "value": "sandwiches"
            },
            {
              "position": [
                78,
                81
              ],
              "value": "and"
            },
            {
              "position": [
                82,
                83
              ],
              "value": "a"
            },
            {
              "position": [
                84,
                91
              ],
              "value": "thermos"
            },
            {
              "position": [
                92,
                94
              ],
              "value": "of"
            },
            {
              "position": [
                95,
                100
              ],
              "value": "water"
            },
            {
              "position": [
                103,
                108
              ],
              "value": "we're"
            },
            {
              "position": [
                109,
                114
              ],
              "value": "going"
            },
            {
              "position": [
                115,
                117
              ],
              "value": "on"
            },
            {
              "position": [
                118,
                119
              ],
              "value": "a"
            },
            {
              "position": [
                120,
                127
              ],
              "value": "journey"
            },
            {
              "position": [
                130,
                132
              ],
              "value": "my"
            },
            {
              "position": [
                133,
                139
              ],
              "value": "father"
            },
            {
              "position": [
                140,
                144
              ],
              "value": "said"
            },
            {
              "position": [
                147,
                149
              ],
              "value": "to"
            },
            {
              "position": [
                150,
                151
              ],
              "value": "a"
            },
            {
              "position": [
                152,
                158
              ],
              "value": "secret"
            },
            {
              "position": [
                159,
                164
              ],
              "value": "place"
            },
            {
              "position": [
                166,
                171
              ],
              "value": "we'll"
            },
            {
              "position": [
                172,
                177
              ],
              "value": "catch"
            },
            {
              "position": [
                178,
                181
              ],
              "value": "the"
            },
            {
              "position": [
                182,
                185
              ],
              "value": "air"
            },
            {
              "position": [
                187,
                192
              ],
              "value": "we'll"
            },
            {
              "position": [
                193,
                198
              ],
              "value": "catch"
            },
            {
              "position": [
                199,
                202
              ],
              "value": "the"
            },
            {
              "position": [
                203,
                209
              ],
              "value": "breeze"
            }
          ]
        },

Here is my code that gets the clickable words

 // Builds an array of display strings, one per token: the token's text taken from
 // `content`, followed by the characters that are read right after it.
 const getWordsFromTokens = tokens.reduce((words, token)=>{
   let start = token.position[0]; // offset in `content` where the token's text begins
   let end = token.position[1]; // offset just past the token's text (exclusive end passed to substring)

   let differenceBetweenLastPositionAndFirst = end+(end-start); 
   
    /* The trailing text is read from `end` up to `end + (end - start)`, i.e. as many
        characters after the token as the token itself is long.
    */
   let punctuationMarks = content.substring(end, (differenceBetweenLastPositionAndFirst)); 
   
   console.log(punctuationMarks);

   words.push( content.substring(start, end)+punctuationMarks); // append the trailing characters read above to the word
   return words; // hand the accumulator to the next reduce iteration until all tokens are processed
  },[]);

Here is how I'm rendering the text:

return (
    <div>
      <p> {
        getWordsFromTokens.map((word, index)=>{
         return <a href={'/word/' + word} > {word}</a>
        })
      }
      </p>
    </div>
  )

Here is my problem: when I render the text, it does not look exactly like the original text. What could I be doing wrong?

Here is what the final result looks like:

Into the the tr trunk we p we p put tw two po poles and and th the ca can of of w worms and and a a sack of of s sandwiches and a the and a a thermos of wat of w water. “We We’re goin going on a on a a journey,” my f my f father said. said. “T To a a secret place place. We’ We’ll catc catch the the ai air! W We’ll catc catch the the br breeze!”


Solution

  • What about a solution like this? I use a cursor to track the position inside the sentence.

    // Clickable words of the sentence. `position` is [start, end): the offset in
    // `content` where the word begins and the offset just past its last character.
    // `value` is the lower-cased word with a plain ASCII apostrophe.
    const tokens = [
      { position: [0, 4], value: 'into' },
      { position: [5, 8], value: 'the' },
      { position: [9, 14], value: 'trunk' },
      { position: [15, 17], value: 'we' },
      { position: [18, 21], value: 'put' },
      { position: [22, 25], value: 'two' },
      { position: [26, 31], value: 'poles' },
      { position: [32, 35], value: 'and' },
      { position: [36, 39], value: 'the' },
      { position: [40, 43], value: 'can' },
      { position: [44, 46], value: 'of' },
      { position: [47, 52], value: 'worms' },
      { position: [53, 56], value: 'and' },
      { position: [57, 58], value: 'a' },
      { position: [59, 63], value: 'sack' },
      { position: [64, 66], value: 'of' },
      { position: [67, 77], value: 'sandwiches' },
      { position: [78, 81], value: 'and' },
      { position: [82, 83], value: 'a' },
      { position: [84, 91], value: 'thermos' },
      { position: [92, 94], value: 'of' },
      { position: [95, 100], value: 'water' },
      { position: [103, 108], value: "we're" },
      { position: [109, 114], value: 'going' },
      { position: [115, 117], value: 'on' },
      { position: [118, 119], value: 'a' },
      { position: [120, 127], value: 'journey' },
      { position: [130, 132], value: 'my' },
      { position: [133, 139], value: 'father' },
      { position: [140, 144], value: 'said' },
      { position: [147, 149], value: 'to' },
      { position: [150, 151], value: 'a' },
      { position: [152, 158], value: 'secret' },
      { position: [159, 164], value: 'place' },
      { position: [166, 171], value: "we'll" },
      { position: [172, 177], value: 'catch' },
      { position: [178, 181], value: 'the' },
      { position: [182, 185], value: 'air' },
      { position: [187, 192], value: "we'll" },
      { position: [193, 198], value: 'catch' },
      { position: [199, 202], value: 'the' },
      { position: [203, 209], value: 'breeze' },
    ];

    // The full sentence the token offsets refer to (it ends with a closing curly quote).
    const content = 'Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!”';

    // Offset in `content` up to which text has already been emitted. It is advanced by the
    // reduce below and read again afterwards to pick up any text after the last token.
    let cursorPosition = 0;

    // Splits `content` into an alternating list of { type: 'non-word' } and { type: 'word' }
    // pieces. Concatenating every `value` plus `content.substring(cursorPosition)` yields
    // `content` again, so nothing is duplicated or lost.
    const getWordsFromTokens = tokens.reduce((words, token) => {
      const [tokenStart, tokenEnd] = token.position;

      // Spaces and punctuation between the previous word and this one
      // (an empty string when the word starts exactly at the cursor).
      const notWordBeforeThisWord = content.substring(cursorPosition, tokenStart);

      // The word exactly as it is written in the sentence (original casing and apostrophe).
      const tokenValue = content.substring(tokenStart, tokenEnd);

      // Keep the gap and the word as two separate entries so that only the word is made clickable.
      words.push({
        type: 'non-word',
        value: notWordBeforeThisWord
      }, {
        type: 'word',
        value: tokenValue
      });

      cursorPosition = tokenEnd; // the next gap starts right after this word

      return words; // accumulator for the next iteration
    }, []);
    
    // Render the pieces into <p id="new-sentence">: each word becomes a link to
    // /word/<word>, everything else is inserted as plain text.
    // Nodes are built with textContent/append instead of `innerHTML +=`, so the paragraph is
    // not re-parsed on every iteration and characters such as `<` or `&` in the sentence are
    // shown literally rather than being interpreted as markup.
    const sentenceElement = document.getElementById('new-sentence');
    const sentenceFragment = document.createDocumentFragment();

    getWordsFromTokens.forEach((item) => {
      if (item.type === 'word') {
        const link = document.createElement('a');
        // Encode the word so it is always a single, valid URL path segment.
        link.href = `/word/${encodeURIComponent(item.value)}`;
        link.textContent = item.value;
        sentenceFragment.append(link);
      } else if (item.value !== '') {
        sentenceFragment.append(item.value); // strings are appended as text nodes
      }
    });

    // Append all characters (if any) that come after the last token.
    const endOfSentence = content.substring(cursorPosition);
    if (endOfSentence !== '') {
      sentenceFragment.append(endOfSentence);
    }

    // Single DOM update; existing children of the paragraph are kept, as before.
    sentenceElement.append(sentenceFragment);
    <p id='new-sentence'></p>