Search code examples
c# .net c#-4.0 tags element

How to count the total number of child elements in each tag?


I have a string in which some nested tags are defined, and I want to count how many elements each `L1` tag directly contains.

<L1 //2 element
 <L1 //1 element
  <H1 content> 
 > 
 <L1 //3 element
  <H2 content> 
  <P content>
  <L1 //1 element
   <H3 content>
  >
 >
>

This is my C# code snippet:

// Sample input: one tag fragment per CRLF-separated line. A line that is exactly "<L1"
// opens a container tag, a line that is exactly ">" closes one, and "<X content>" lines
// are self-contained one-line tags.
var str = "<L1\r\n <L1\r\n  <H1 content> \r\n > \r\n <L1\r\n  <H2 content> \r\n  <P content>\r\n  <L1\r\n   <H3 content>\r\n  >\r\n >\r\n>";
// Split into lines; entries keep their leading/trailing spaces, so every check below trims first.
var list = str.Split(new string[] { "\r\n" }, StringSplitOptions.None);
// Receives one "start: … end: … count: …" summary string per "<L1" opener line.
var array_num = new List<string>();
// startpos/endpos: line indexes recorded for the tag being measured.
// total: running count for that tag.
// newstartpos: line index of the most recent nested "<L1" seen by the inner loop (0 = none yet).
int startpos = 0, endpos = 0, total = 0, newstartpos = 0;
// newtag: true between a nested "<L1" and the next lone ">"; suppresses counting of one-line tags.
bool newtag = false;
// Outer loop: visit every line and start a measurement at each "<L1" opener.
for (int i = 0; i < list.Length; i++)
{
    if (list[i].Trim() == "<L1")
    {
        startpos = i;
        // Inner loop: scan forward from the line after the opener.
        for (int Lindex = i + 1; Lindex < list.Length ; Lindex++)
        {
            // item is assigned but never read in this snippet.
            var item = list[Lindex].Trim().ToString();
            // A one-line "<L1 …>" tag adds 1 here; when newtag is false it also matches the
            // generic one-line-tag check further down and is counted a second time.
            if (list[Lindex].Trim().StartsWith("<L1") && list[Lindex].Trim().EndsWith(">"))
            {
                total += 1;
            }
            // Nested opener: add 2 (each later lone ">" subtracts 1 again), remember where it
            // started, and stop counting one-line tags until a lone ">" is seen.
            if (list[Lindex].Trim() == "<L1")
            {
                total += 2;
                newstartpos = Lindex;
                newtag = true;
            }
            // Lone ">" after at least one nested opener has been seen: subtract 1, move the
            // end marker here, and resume counting one-line tags.
            // NOTE(review): newstartpos is only reset after the inner loop finishes, so once a
            // nested "<L1" has been seen EVERY later ">" takes this branch — including the ">"
            // that closes the tag being measured. The break below is then never reached and
            // the scan runs to the end of the list.
            if (list[Lindex].Trim() == ">" && newstartpos != 0)
            {
                total -= 1;
                endpos = Lindex;
                newtag = false;
            }
            // One-line tag ("<…>") seen while newtag is false: count it.
            if (list[Lindex].Trim().StartsWith("<") && list[Lindex].Trim().EndsWith(">") && !newtag)
            {
                total += 1;
            }
            // Lone ">" with no nested opener seen so far: this closes the measured tag, so
            // record the end line and stop scanning.
            if (list[Lindex].Trim() == ">" && newstartpos == 0)
            {
                endpos = Lindex;
                break;
            }
        }
        // Store the summary for this opener, then reset all scratch state for the next one.
        array_num.Add("start: " + startpos + " end: " + endpos + " count: " + total);
        startpos = 0;
        endpos = 0;
        total = 0;
        newstartpos = 0;
        newtag = false;
    }
}

But when I run it, I do not get the expected result. The `array_num` list ends up with the following content (my notes on whether each entry is correct follow the `//`):

start:0 end: 11 count: 2 //correct
start:1 end: 3 count: 1 //correct
start:4 end: 11 count: 1 //incorrect should be 4 10 3
start:7 end: 9 count: 1 //correct

I'm also not sure my code will work reliably with other examples. If you have any ideas for fixing or improving it, kindly let me know what I should change.


Solution

  • A stack structure would be best to use as it reflects the structure of content being parsed.

    Here is one solution. Use a stack and a TagCounter class. The TagCounter class tracks the number of children a tag has, whether it is an L1 tag, and its index in the string so they can be put into the correct order at the end:

    /// <summary>
    /// Per-tag bookkeeping record: how many children the tag has, whether it is an
    /// L1 tag, and the index value it was created with.
    /// </summary>
    internal class TagCounter
    {
        /// <summary>Number of child tags counted so far; starts at zero.</summary>
        public int ChildCount { get; set; } = 0;

        /// <summary>The L1 flag passed to the constructor; not settable from outside the class.</summary>
        public bool IsL1Tag { get; private set; }

        /// <summary>The index value passed to the constructor.</summary>
        public int Index { get; set; }

        /// <summary>Creates a counter with no children recorded yet.</summary>
        /// <param name="isL1Tag">Value stored in <see cref="IsL1Tag"/>.</param>
        /// <param name="index">Value stored in <see cref="Index"/>.</param>
        public TagCounter(bool isL1Tag, int index)
        {
            this.Index = index;
            this.IsL1Tag = isL1Tag;
        }
    }
    

    snippet to compute it:

    // Sample markup: every '<' opens a tag and every '>' closes the most recently opened one.
    var str = "<L1\r\n <L1\r\n  <H1 content> \r\n > \r\n <L1\r\n  <H2 content> \r\n  <P content>\r\n  <L1\r\n   <H3 content>\r\n  >\r\n >\r\n>";
    
    // Ancestors of the tag currently being read (innermost on top); the tag itself is in currentTag.
    var openTags = new Stack<TagCounter>();   
    // L1 tags whose closing '>' has been seen, in the order they were closed.
    var parsedLTags = new List<TagCounter>();
    
    // Remove the CRLF line breaks before scanning character by character.
    var shortenedString = str.Replace("\r\n", "");
    TagCounter? currentTag = null;
    
    var stringLength = shortenedString.Length;
    
    for (var i = 0; i < stringLength; i++)
    {
        var nextChar = shortenedString[i];
        if (nextChar == '<')
        {
            // Bounds-checked look-ahead for the "L1" tag name. Substring(i + 1, 2) would throw
            // ArgumentOutOfRangeException when '<' is one of the last two characters.
            var isL1Tag = i + 2 < stringLength
                && shortenedString[i + 1] == 'L'
                && shortenedString[i + 2] == '1';
            
            if (currentTag != null)
            {
                // The new tag is a direct child of the tag that is currently open;
                // park the parent on the stack until the child closes.
                currentTag.ChildCount++;
                openTags.Push(currentTag);
            }
            currentTag = new TagCounter(isL1Tag, i);
        }
        else if (nextChar == '>')
        {
            if (currentTag == null)
            {
                // Unbalanced '>' with no open tag: skip it instead of dereferencing null.
                continue;
            }
    
            if (currentTag.IsL1Tag)
            {
                parsedLTags.Add(currentTag);
            }
    
            // Closing a tag makes its parent (if any) the current tag again.
            currentTag = openTags.Count > 0 ? openTags.Pop() : null;
        }
    }
    
    // Tags were collected in closing order (innermost first); sort by opening position
    // so the child counts come out in document order.
    var result = parsedLTags.OrderBy(x=>x.Index).Select(x=>x.ChildCount).ToList();