Search code examples
c# regex confluence wiki-markup

Confluence wiki markup - table rows type determination with Regex


In Confluence wiki markup (v6.0) there are two different table types.

I'm struggling to determine the table type from its first row. (I split the table into rows by detecting newlines with a regex like new Regex(@"(\|(\r\n|\r|\n)(.*?)\|)+"); and then use Matches to get the rows.)

Table row could look like:

like this if it is a header row:

|| heading 1 || heading 2 || heading 3 ||

like this if it is a regular row:

| cell A1 | cell A2 | cell A3 |

and like this if it is a vertical table row:

||Heading |cell B2 | cell B3 |

I tried an expression like ^(\|\|.*?\|) to detect a vertical row, but found that it matches header rows as well.

Then I tried ^(\|\|.*?\|\|), which relies on the header markup, but it does not help to tell whether a row is a regular row.

So is it possible to determine the row type with a regex or, at least, to tell whether a row is a vertical row?

Or is it better to write something that processes the row step by step?


Solution

  • I wrote it in JavaScript without using regex; it looks like this:

    Simple string scanner

    /**
     * Minimal forward-only character scanner over a string.
     *
     * Instance state:
     *   currentString - the input split into an array of UTF-16 code units
     *   position      - index of the current character
     *   errorList     - formatted error messages recorded via error()
     */
    var Scanner = (function(){
        /**
         * @constructor
         * @param {string} text - the text to scan
         */
        function Scanner(text){
            this.currentString = text.split('');
            this.position = 0;
            this.errorList = [];
        }

        /**
         * Returns the character at the current position without consuming it.
         *
         * @returns {string|number} the current character, or -1 once the
         *          position is at or past the end of the input
         */
        Scanner.prototype.getChar = function(){
            var chars = this.currentString;

            if(this.position < chars.length){
                return chars[this.position];
            }

            return -1;
        };

        /**
         * Advances the position by one character. When the input is already
         * exhausted the position is left unchanged and an "EOL reached"
         * error is recorded instead.
         */
        Scanner.prototype.nextChar = function(){
            if(this.position < this.currentString.length){
                this.position++;
                return;
            }

            this.error("EOL reached");
        };

        /**
         * Appends a formatted message, including the current position, to
         * errorList.
         *
         * @param {string} errorMsg - description of the problem
         */
        Scanner.prototype.error = function(errorMsg){
            var message = "Error at position " + this.position +
                "\nMessage: " + errorMsg + ".\n";

            // push must be called; indexing it (push[...]) records nothing.
            this.errorList.push(message);
        };

        return Scanner;

    })();
    

    Simple parser

     /**
         LINE ::= { CELL }
    
         CELL ::= '|' CELL1
         CELL1 ::= HEADER_CELL | REGULAR_CELL
    
         HEADER_CELL ::=  '|'  TEXT
         REGULAR_CELL ::=  TEXT
    
     */
    
     /**
      * Parser for a single wiki-markup table row.
      *
      * A row is a sequence of cells; each cell starts with '|' (regular
      * cell) or '||' (header cell) followed by its text. A closing '|' at
      * the end of the row therefore shows up as a trailing cell with empty
      * text.
      *
      * Relies on two names defined elsewhere in this file: Scanner (character
      * scanner with getChar/nextChar/error) and TableCellType (cell type
      * ids and lookup).
      *
      * Instance state:
      *   scanner   - Scanner for the row being parsed ({} before parseRow)
      *   rawText   - the row text passed to parseRow
      *   cellsData - parsed cells as {typeId, type, text}, text untrimmed
      */
     function RowParser(){
         this.scanner = {};
         this.rawText = "";
         this.cellsData = [];
     }

    RowParser.prototype = {
        /**
         * Parses `row` and stores the resulting cells in cellsData,
         * replacing the result of any earlier parse. Problems are recorded
         * in scanner.errorList rather than thrown.
         *
         * @param {string} row - row text, e.g. "||Heading |cell B2 |"
         */
        parseRow: function(row){
            var me = this;

            me.scanner = new Scanner(row);
            me.rawText = row;
            me.cellsData = [];

            me.proceedNext();
        },

        /**
         * LINE ::= { CELL }
         * Consumes cells while the current character is '|', then records
         * an error if anything other than end of input is left.
         */
        proceedNext: function(){
            var me = this,
                scanner = me.scanner;

            // Every proceedCell call consumes at least the leading '|',
            // so this loop always terminates.
            while(scanner.getChar() === '|'){
                me.proceedCell();
            }

            if(scanner.getChar() !== -1){
                scanner.error("EOL expected, "+ scanner.getChar() +" got");
            }
        },

        /**
         * CELL ::= '|' CELL1
         * Consumes the leading '|' of a cell and dispatches on what follows.
         * Does nothing when the current character is not '|'.
         */
        proceedCell: function(){
            var me = this,
                scanner = me.scanner;

            if(scanner.getChar() === '|'){
                scanner.nextChar();
                me.proceedHeaderCell();
            }
        },

        /**
         * CELL1 ::= HEADER_CELL | REGULAR_CELL
         * A second '|' marks a header cell; anything else starts a regular
         * cell.
         */
        proceedHeaderCell: function(){
            var me = this;

            if(me.scanner.getChar() === '|'){
                me.onHeaderCell();
            } else {
                me.onRegularCell();
            }
        },

        /**
         * HEADER_CELL ::= '|' TEXT
         * Consumes the second '|' and reads the header text. Records an
         * error if the current character is not '|'.
         */
        onHeaderCell: function(){
            var me = this,
                scanner = me.scanner;

            if(scanner.getChar() === '|'){
                scanner.nextChar();
                me.proceedInnerText(TableCellType.info.Header);
            }else{
                scanner.error("Expected '|' got "+ scanner.getChar() +".");
            }
        },

        /**
         * REGULAR_CELL ::= TEXT
         * Reads the text of a regular cell.
         */
        onRegularCell: function(){
            this.proceedInnerText(TableCellType.info.Regular);
        },

        /**
         * Reads characters up to the next '|' or end of input and appends
         * the cell to cellsData. Parsing of the following cells is driven
         * by the loop in proceedNext, so the stack depth does not grow with
         * the number of cells.
         *
         * @param {number} cellType - a TableCellType.info id
         */
        proceedInnerText: function(cellType){
            var me = this,
                scanner = me.scanner,
                typeData = TableCellType.getValueById(cellType),
                innerText = [];

            while(scanner.getChar() !== '|' && scanner.getChar() !== -1){
                innerText.push(scanner.getChar());
                scanner.nextChar();
            }

            me.cellsData.push({
                typeId: typeData.id,
                type: typeData.name,
                text: innerText.join("")
            });
        },

        /**
         * Returns the parsed cells with their text trimmed, skipping cells
         * whose raw text is empty (which includes the trailing cell produced
         * by the closing '|'). The stored cellsData is not modified, so
         * repeated calls return equal results.
         *
         * @returns {Array<{typeId: number, type: string, text: string}>|undefined}
         *          the cells, or undefined when the row has no closing '|'
         *          (an error is then recorded on the scanner)
         */
        getRowData: function(){
            var me = this,
                scanner = me.scanner,
                data = me.cellsData,
                lastCell = data[data.length - 1],
                rowData = [],
                cell,
                i;

            // The closing '|' yields a trailing empty cell; if the last cell
            // is missing or has text, the row was never closed.
            if(!lastCell || lastCell.text.length !== 0){
                // Before parseRow the scanner is a plain {} without error().
                if(scanner && typeof scanner.error === "function"){
                    scanner.error("No close tag at row "+ me.rawText +".");
                }
                return;
            }

            for(i = 0; i < data.length; i++){
                cell = data[i];

                if(cell.text.length !== 0){
                    rowData.push({
                        typeId: cell.typeId,
                        type: cell.type,
                        text: cell.text.trim()
                    });
                }
            }

            return rowData;
        }
    };
    

    The TableCellType enum referenced above

    /**
     * Cell type registry.
     *
     *   info - symbolic names mapped to numeric type ids
     *   data - one descriptor ({id, name}) per type id
     */
    var TableCellType = {
        info: {
            Regular: 10,
            Header: 20
        },

        data: [
            { id: 10, name: "regular" },
            { id: 20, name: "header" }
        ],

        /**
         * Looks up the descriptor for a type id.
         *
         * @param {number} id - one of the ids in `info`
         * @returns {{id: number, name: string}|undefined} the first entry of
         *          `data` whose id strictly equals `id`, or undefined
         */
        getValueById: function(id){
            var entries = this.data,
                index;

            for(index = 0; index < entries.length; index++){
                if(entries[index].id === id){
                    return entries[index];
                }
            }

            return undefined;
        }
    };
    

    Usage:

    // Input text: a header line followed by a regular line, joined by "\n".
    var row = "||AAA||BBB||CCC||\n|Hi|all|people!|";

    // Parse the text, then read back the collected cells.
    var rowParser = new RowParser();
    rowParser.parseRow(row);

    // Array of {typeId, type, text} cells, or undefined when getRowData
    // reports a missing closing '|'.
    var result = rowParser.getRowData();