Search code examples
javascript npm cluster-analysis

Group Similar Strings from an array in nodejs


I am working on an FAQ system that contains a large number of question–answer pairs. I want to group the questions that are similar. I have been using the npm package set-clustering.

The package provides good matches on the basis of token matching. The problem is that I need to provide the number of groups to be created.

What I want is for the grouping to be automatic: the algorithm should decide by itself how many groups to create (unsupervised learning).

Tell me if any other package or platform can help me.

The questions look like this:

Tell me about the pricing of your product ?

Can I talk to your agent ?

Hi

Hi Friend

Hi Good Morning

How much will it cost me ?

Current Result: (When I give '3' as the number of groups)

( Hi , Hi Friend )

( Tell me about the pricing of your product ?, What is cost of the product ? )

( Can I talk to your agent ?, Hi Good Morning )

I want them to be grouped like this (without providing '3' as input):

( Hi , Hi Friend ,Hi Good Morning )

( Tell me about the pricing of your product ?, What is cost of the product ? )

( Can I talk to your agent ? )

Existing Code:

                    var cluster = require('set-clustering');

                    // Turn every response row into a { title, tags } record and append it to `articles`.
                    // `resp` and `articles` are not defined in this snippet; they come from surrounding code.
                    for (const { que, tags } of resp) {
                        articles.push({ title: que, tags });
                    }

                    /**
                     * Scores two articles by how many of their tags match.
                     *
                     * Every (tag of x, tag of y) pair that compares equal under `==` adds one
                     * point, so a repeated tag is counted once per matching pair.
                     *
                     * @param {{tags: Array}} x - First article; only `tags` is read.
                     * @param {{tags: Array}} y - Second article; only `tags` is read.
                     * @returns {number} Number of matching tag pairs (0 when nothing matches).
                     */
                    function similarity(x, y) {
                        let matches = 0;
                        x.tags.forEach((left) => {
                            y.tags.forEach((right) => {
                                // Loose equality is kept deliberately so results match the existing behaviour.
                                matches += left == right ? 1 : 0;
                            });
                        });
                        return matches;
                    }

                    // The group count is hard-coded here; the goal is for the grouping to choose
                    // the number of groups on its own instead of being told.
                    // NOTE(review): `c` is not defined in this snippet - presumably the clusterer
                    // built from `cluster(articles, similarity)`; confirm against the surrounding code.
                    var groups = c.evenGroups(3);

                    // Reduce every group of article objects to just the article titles.
                    var titles = groups.map((group) => group.map((article) => article.title));

                    console.log(titles);

Reference: https://www.npmjs.com/package/set-clustering


Solution

  • let ss = require('sentence-similarity')
    var thesaurus = require("thesaurus");
    
    // Bind the two sentence-similarity exports used below to short local names.
    let { sentenceSimilarity: similarity, similarityScore } = ss;
    // Returns `a` when `a < b` holds; otherwise (including ties) returns `b`.
    let min = (a, b) => (a < b ? a : b);
    
    /**
     * Decides whether two sentences are close enough to be placed in the same group.
     *
     * Both sentences are split on single spaces. The first sentence is scored
     * against the second as written, and then once more for each synonym
     * substitution: every word of the first sentence is replaced, one at a time,
     * by up to five of its `thesaurus.find` results. Each score is the `score`
     * field returned by `similarity(...)` divided by the word count of the shorter
     * sentence. The sentences count as similar when the best score reaches 0.375.
     *
     * @param {string} sentence1 - Sentence whose words are expanded with synonyms.
     * @param {string} sentence2 - Sentence to compare against.
     * @returns {boolean} True when the best normalised score is at least 0.375.
     */
    let similar = (sentence1, sentence2) => {
        const MAX_SYNONYMS = 5;
        const THRESHOLD = 0.375;

        const s1 = sentence1.split(' ');
        const s2 = sentence2.split(' ');
        const shorter = min(s1.length, s2.length);
        const scoreOf = (words) => similarity(words, s2, winkOpts)['score'] / shorter;

        // Always score the unmodified sentence so there is a result even when
        // the thesaurus has no entry for any word.
        const numbers = [scoreOf(s1)];

        s1.forEach((word, index) => {
            const syn = thesaurus.find(word) || [];
            // Bound by BOTH the synonym count and the cap; the previous condition
            // `i<syn, i<5` used the comma operator, which discarded the first test.
            const limit = min(syn.length, MAX_SYNONYMS);
            for (let i = 0; i < limit; i++) {
                // Substitute the synonym into a copy of the sentence so it actually
                // takes part in the comparison and the original words stay intact.
                const variant = s1.slice();
                variant[index] = syn[i];
                numbers.push(scoreOf(variant));
            }
        });

        return Math.max(...numbers) >= THRESHOLD;
    };
    
    // Options object passed as the third argument of every `similarity(...)` call.
    let winkOpts = {
        f: similarityScore.winklerMetaphone,
        options: { threshold: 0 },
    };
    /**
     * Splits `source` into groups of mutually "similar" items.
     *
     * Works on a copy, so `source` itself is left untouched. Each pass takes the
     * last remaining item as the seed of a new group, then walks the rest from
     * back to front and moves every item that `similar(seed, item)` accepts into
     * that group. Items are compared with the seed only, never with each other.
     *
     * @param {Array} source - Items to group.
     * @param {number} [maximum=5] - Accepted for compatibility; not used by the body.
     * @returns {Array<Array>} Groups in creation order; each group starts with its seed.
     */
    const filter = (source, maximum = 5) => {
      const remaining = source.slice();
      const groups = [];

      while (remaining.length > 0) {
        const group = [remaining.pop()];
        const seed = group[0];

        for (let idx = remaining.length - 1; idx >= 0; idx -= 1) {
          if (similar(seed, remaining[idx])) {
            // Move the match out of the pool and into the current group.
            group.push(...remaining.splice(idx, 1));
          }
        }

        groups.push(group);
      }

      return groups;
    };
    // Example invocation: `source` holds the strings to group, `output` the resulting groups.
    let source = [
      'Your',
      'array',
      'here',
    ];
    let output = filter(source);
    

    The thesaurus library helps find similar words (synonyms): https://www.npmjs.com/package/thesaurus

    The sentence-similarity library scores how closely two sentences match: https://www.npmjs.com/package/sentence-similarity

    This gives good results with my dataset.