Search code examples
javascriptregex

Regex - check if input still has chances to become matching


We've got such regexp:

var regexp = /^one (two)+ three/;

So only strings such as "one two three", "one two three four", "one twotwo three", etc. will match it.

However, if we've got string like

"one " - is still 'promising' that maybe soon it will match

but this string: "one three" will never match no matter what we'll do.

Is there some way to check whether a given string still has a chance of becoming a match?

I need this to show hints while the user is typing: I want to suggest every option that begins with the given input (the regexps I'm using are pretty long, and I don't really want to mess with them).


In other words, I want to check whether the string ended during matching before anything non-matching was encountered.

Put yet another way, the answer lies in the reason the match failed: if the reason is that the string ended, the input is promising. However, I don't know of any way to check why a string didn't match.


Solution

  • This is a regex feature known as partial matching, it's available in several regex engines such as PCRE, Boost, Java but not in JavaScript.

    Andacious's answer shows a very nice way to overcome this limitation, we just need to automate this.

    Well... challenge accepted :)

    Fortunately, JavaScript has a very limited regex feature set with a simple syntax, so I wrote a simple parser and on-the-fly transformation for this task, based on the features listed on MDN. This code has been updated to handle ES2018 features.

    A couple points of interest:

    • This produces a regex which will almost always match the empty string. Therefore a failed partial match occurs when the result of exec is null or an array whose first element is the empty string
    • Negative lookaheads are kept as-is. I believe that's the right thing to do. The only ways to fail a match are through them (i.e. put a (?!) in the regex) and anchors (^ and $). Lookbehinds (both positive and negative) are also kept as-is.
    • The parser assumes a valid input pattern: you can't create a RegExp object from an invalid pattern in the first place. This may break in the future if new regex features are introduced.
    • This code won't handle backreferences properly: ^(\w+)\s+\1$ won't yield a partial match against hello hel for instance.

    /**
     * Builds a "partial match" version of this regex.
     *
     * Every atom X of the pattern is rewritten to (?:X|$) and every group body B
     * to (B|$), so the resulting regex also matches when the input simply ends
     * before the original pattern is complete. Capturing groups stay capturing
     * (no extra capturing groups are introduced), and the original flags are kept.
     *
     * Interpreting the result of `exec` on the returned regex: `null`, or an array
     * whose first element is the empty string, means the input can no longer
     * become a match.
     *
     * Limitations:
     *  - The pattern is assumed to be valid (it comes from an existing RegExp).
     *  - Negative lookaheads and all lookbehinds are copied unchanged.
     *  - Backreferences are kept as single atoms, so an input that stops in the
     *    middle of a backreferenced text is not reported as a partial match.
     *
     * @this {RegExp}
     * @returns {RegExp} a new regex with the same flags that also accepts prefixes.
     * @throws {SyntaxError} if an unrecognised `(?` group prefix is encountered.
     */
    RegExp.prototype.toPartialMatchRegex = function() {
        "use strict";

        var re = this,
            source = this.source,
            // The v flag (unicodeSets) uses the same escape syntax as the u flag.
            unicodeMode = Boolean(re.unicode || re.unicodeSets),
            i = 0;

        // Sticky (y) probes: unlike /g, they can only match exactly at lastIndex,
        // so a construct found further along in the pattern is never mistaken for
        // one starting at the current position.
        var QUANTIFIER = /\{\d+,?\d*\}/y,
            HEX2 = /[0-9a-fA-F]{2}/y,
            HEX4 = /[0-9a-fA-F]{4}/y,
            DIGITS = /\d+/y,
            ESCAPED_SURROGATE_PAIR = /\\u[dD][89abAB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}/y;

        // Runs a sticky probe against the pattern source at the given index.
        function matchAt(stickyRegex, position) {
            stickyRegex.lastIndex = position;
            return stickyRegex.exec(source);
        }

        // Length of the character class starting at index i (which holds "[").
        // Escaped characters are skipped; nested "[" only open a new level under
        // the v flag, where classes may nest.
        function charClassLength() {
            var depth = 0,
                j;

            for (j = i; j < source.length; ++j) {
                if (source[j] === "\\") {
                    ++j; // skip the escaped character
                } else if (source[j] === "[" && (depth === 0 || re.unicodeSets)) {
                    ++depth;
                } else if (source[j] === "]" && --depth === 0) {
                    return j - i + 1;
                }
            }

            return source.length - i;
        }

        // Transforms the pattern from index i up to the end of the current group
        // (consuming its closing parenthesis) or the end of the pattern.
        function process() {
            var result = "",
                tmp;

            // Copies nbChars characters of the pattern unchanged.
            function appendRaw(nbChars) {
                result += source.slice(i, i + nbChars);
                i += nbChars;
            }

            // Copies nbChars characters as one atom that may be cut short by the end of input.
            function appendOptional(nbChars) {
                result += "(?:" + source.slice(i, i + nbChars) + "|$)";
                i += nbChars;
            }

            while (i < source.length) {
                switch (source[i])
                {
                    case "\\":
                        switch (source[i + 1])
                        {
                            case "c":
                                appendOptional(3);
                                break;

                            case "x":
                                // \xHH needs two hex digits; otherwise \x is an identity escape.
                                appendOptional(matchAt(HEX2, i + 2) ? 4 : 2);
                                break;

                            case "u":
                                if (unicodeMode && source[i + 2] === "{") {
                                    appendOptional(source.indexOf("}", i) - i + 1);
                                } else if (unicodeMode && matchAt(ESCAPED_SURROGATE_PAIR, i)) {
                                    // In unicode mode an escaped surrogate pair denotes a single
                                    // code point and must not be split in two atoms.
                                    appendOptional(12);
                                } else if (matchAt(HEX4, i + 2)) {
                                    // \uHHHH is a unicode escape with or without the u flag.
                                    appendOptional(6);
                                } else {
                                    appendOptional(2);
                                }
                                break;

                            case "p":
                            case "P":
                                if (unicodeMode) {
                                    appendOptional(source.indexOf("}", i) - i + 1);
                                } else {
                                    appendOptional(2);
                                }
                                break;

                            case "k":
                                tmp = source[i + 2] === "<" ? source.indexOf(">", i) : -1;
                                if (tmp !== -1) {
                                    appendOptional(tmp - i + 1);
                                } else {
                                    // Bare \k (legacy identity escape): nothing more to consume.
                                    appendOptional(2);
                                }
                                break;

                            default:
                                // Keep a whole run of digits (\10, \012, ...) together so the
                                // escape is parsed exactly as it was in the original pattern.
                                tmp = matchAt(DIGITS, i + 1);
                                appendOptional(tmp ? 1 + tmp[0].length : 2);
                                break;
                        }
                        break;

                    case "[":
                        appendOptional(charClassLength());
                        break;

                    case "|":
                    case "^":
                    case "$":
                    case "*":
                    case "+":
                    case "?":
                        appendRaw(1);
                        break;

                    case "{":
                        tmp = matchAt(QUANTIFIER, i);
                        if (tmp) {
                            appendRaw(tmp[0].length);
                        } else {
                            // Not a quantifier: a literal "{".
                            appendOptional(1);
                        }
                        break;

                    case "(":
                        if (source[i + 1] === "?") {
                            switch (source[i + 2])
                            {
                                case ":":
                                    result += "(?:";
                                    i += 3;
                                    result += process() + "|$)";
                                    break;

                                case "=":
                                    result += "(?=";
                                    i += 3;
                                    result += process() + ")";
                                    break;

                                case "!":
                                    // process() is only used to find the end of the group;
                                    // the negative lookahead itself is copied unchanged.
                                    tmp = i;
                                    i += 3;
                                    process();
                                    result += source.slice(tmp, i);
                                    break;

                                case "<":
                                    switch (source[i + 3])
                                    {
                                        case "=":
                                        case "!":
                                            // Lookbehinds are copied unchanged as well.
                                            tmp = i;
                                            i += 4;
                                            process();
                                            result += source.slice(tmp, i);
                                            break;

                                        default:
                                            // Named capturing group: keep the (?<name> prefix.
                                            appendRaw(source.indexOf(">", i) - i + 1);
                                            result += process() + "|$)";
                                            break;
                                    }
                                    break;

                                default:
                                    // Any other prefix, e.g. a modifiers group (?i:...):
                                    // keep everything up to the ":" and transform the body.
                                    tmp = source.indexOf(":", i);
                                    if (tmp === -1) {
                                        throw new SyntaxError("Unsupported group syntax at index " + i + " in /" + source + "/");
                                    }
                                    appendRaw(tmp - i + 1);
                                    result += process() + "|$)";
                                    break;
                            }
                        } else {
                            appendRaw(1);
                            result += process() + "|$)";
                        }
                        break;

                    case ")":
                        ++i;
                        return result;

                    default:
                        // In unicode mode an astral character spans two UTF-16 units
                        // which must stay in the same atom.
                        appendOptional(unicodeMode && source.codePointAt(i) > 0xFFFF ? 2 : 1);
                        break;
                }
            }

            return result;
        }

        return new RegExp(process(), this.flags);
    };
    
    
    
    
    
    
    // Demo harness: renders a regex field and an input field, then shows the
    // generated partial-match regex and the match classification on every change.
    (() => {
        document.write('<span style="display: inline-block; width: 60px;">Regex: </span><input id="re" value="^one (two)+ three"/><br><span style="display: inline-block; width: 60px;">Input: </span><input id="txt" value="one twotw"/><br><pre id="result"></pre>');
        document.close();

        // Recomputes the output from the current content of both fields.
        function refresh() {
            const output = document.getElementById("result");
            try {
                const regex = new RegExp(document.getElementById("re").value);
                const input = document.getElementById("txt").value;
                const partialMatchRegex = regex.toPartialMatchRegex();
                const result = partialMatchRegex.exec(input);

                // A full match wins; otherwise a non-empty partial result counts as partial.
                let matchType;
                if (regex.exec(input)) {
                    matchType = "Full match";
                } else if (result && result[0]) {
                    matchType = "Partial match";
                } else {
                    matchType = "No match";
                }

                output.innerText = `${partialMatchRegex}\n\n${matchType}\n${JSON.stringify(result)}`;
            } catch (e) {
                // Typically an invalid pattern typed by the user: show the error as-is.
                output.innerText = e;
            }
        }

        for (const fieldId of ["re", "txt"]) {
            document.getElementById(fieldId).addEventListener("input", refresh);
        }
        refresh();
    })();

    I tested it a little bit and it seems to work fine, let me know if you find any bugs.