Search code examples
java string

What's wrong with my tag-content-extractor program in Java (HackerRank)?


My code is getting through the first two testcases, but failing the third one. Can someone help please?

Link: https://www.hackerrank.com/challenges/tag-content-extractor

Problem Statement:

In a tag-based language like XML or HTML, contents are enclosed between a start tag and an end tag. Note that the corresponding end tag starts with a /.

Given a string of text in a tag-based language, parse this text and retrieve the contents enclosed within sequences of well-organized tags meeting the following criterion:

  1. The name of the start and end tags must be same.

  2. Tags can be nested, but content between nested tags is considered not valid

  3. Tags can consist of any printable characters.

Input Format:

The first line of input contains a single integer, N (the number of lines). The N subsequent lines each contain a line of text.

Constraints:

  • 1 <= N <= 100

  • Each line contains a maximum of 10000 printable characters.

  • The total number of characters in all test cases will not exceed 1000000.

Output Format:

For each line, print the content enclosed within valid tags. If a line contains multiple instances of valid content, print out each instance of valid content on a new line; if no valid content is found, print None.

My code:

import java.io.*;
import java.util.*;   
import java.text.*;    
import java.math.*;
import java.util.regex.*;

/**
 * Attempted solution to the tag-content-extractor problem.
 *
 * Reads a line count from standard input, then that many lines. For each line
 * it walks the '<' / '>' positions pairwise, compares the name of each tag with
 * the name of the tag that immediately follows it, and prints the text between
 * them when the following tag starts with '/' and both names are equal.
 * Otherwise it prints "None".
 *
 * NOTE(review): each tag is only ever compared with its direct successor, so a
 * start tag is never matched against a closing tag further along the line.
 */
public class Solution {
/**
 * Program entry point; all input comes from {@code System.in} and all output
 * goes to {@code System.out}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    Scanner in = new Scanner(System.in);
    // The first input line holds the number of text lines that follow.
    int testCases = Integer.parseInt(in.nextLine());

    while(testCases > 0) {
        String line = in.nextLine();
        char[] A = line.toCharArray();
        // tag  = text between the current '<' (a) and '>' (b);
        // tag1 = text after "</" of the following tag, up to its '>'.
        String tag = "", tag1 = "";
        // a1/b1 are the positions of the next '<' and '>' after a and b.
        int a1 = 0, b1 = 0;
        int a = 0, b = 0;
        // flag becomes 1 once something has been printed for this line.
        // NOTE(review): 'end' is initialised here but never read.
        int flag = 0, end = 0;

        a = line.indexOf('<', a1);
        b = line.indexOf('>', b1);
        //System.out.println("Index of first '<' is " + a);
        //System.out.println("Index of first '>' is " + b);

        // Keep going while both searches succeeded and at least one more '>'
        // exists after b; that last condition means b1 below is never -1.
        while ((a != -1) && (b != -1) && b < line.lastIndexOf(">")) {
            tag = "";
            tag1 = "";
            //System.out.println("Index of first '<' is " + a);
            //System.out.println("Index of first '>' is " + b);
            // Collect the characters strictly between '<' and '>'.
            for (int k = a + 1; k < b; k++)
                tag = tag + A[k];
            //System.out.println("tag is " + tag);

            a1 = line.indexOf('<', a + 1);
            b1 = line.indexOf('>', b + 1);

            // NOTE(review): a1 may be -1 here (no further '<'), in which case
            // this inspects A[0]; if the '<' at a1 is the last character of the
            // line, A[a1+1] is out of bounds and throws.
            if (A[a1+1] == '/') {
                //System.out.println("Index of second '<' is " + a1);
                //System.out.println("Index of second '>' is " + b1);   
                // Skip the "</" prefix to get the closing tag's name.
                for (int k = a1 + 2; k < b1; k++)
                    tag1 = tag1 + A[k];
                if ((!tag.isEmpty()) && (!tag1.isEmpty())) {    
                    if (tag.equals(tag1)) {  
                        // Names match: the content is whatever sits between
                        // the '>' of the first tag and the '<' of the second.
                        if ((b + 1) == a1) {
                            // Nothing between the two tags. This prints "None"
                            // regardless of whether the line already produced output.
                            System.out.println("None");
                            flag = 1;
                        } else {
                            for (int k = b + 1; k < a1; k++)
                                System.out.print(A[k]);
                            System.out.println();
                            flag = 1;
                        }
                    } else if (flag == 0) {
                        // Names differ and nothing has been printed yet for this line.
                        System.out.println("None");
                        flag = 1;
                    }
                }   
            } 
            // Slide the window: the following tag becomes the current tag.
            a = a1;
            b = b1;
            //System.out.println("tag1 is " + tag1);
        }
        // Fallback when nothing was printed for this line and either a search
        // failed or one of the last examined tag names is empty.
        if ((b == -1 || a == -1 || tag1.isEmpty() || tag.isEmpty()) && (flag == 0)) {
            System.out.println("None");
        }
        testCases--;
    }
 }
}

EDIT: For test case #3, I can't work out why the large input string gets parsed line by line when it should be parsed as one whole paragraph. If it were parsed as a whole, I would get the right output.


Solution

  • You can get it here:

    import java.io.*;
    import java.util.*;
    import java.text.*;
    import java.math.*;
    import java.util.regex.*;
    /**
     * Tag content extractor.
     *
     * Reads a line count N from the first line of standard input, then N lines
     * of text. For every line it prints, one per output line, each non-empty
     * piece of text that sits directly between a start tag {@code <name>} and
     * the first following end tag {@code </name>} with exactly the same name,
     * provided that text contains no {@code '<'} (i.e. no nested tag). If a
     * line yields no such content, {@code None} is printed for it.
     */
    public class Solution{
        /**
         * Program entry point: reads from {@code System.in}, writes to {@code System.out}.
         *
         * @param args unused
         * @throws NumberFormatException if the first line is not an integer
         * @throws java.util.NoSuchElementException if fewer lines are supplied than announced
         */
        public static void main(String[] args){
            Scanner in = new Scanner(System.in);
            // Tolerate stray whitespace around the count.
            int testCases = Integer.parseInt(in.nextLine().trim());
            while (testCases > 0) {
                String line = in.nextLine();
                List<String> contents = extractContents(line);
                if (contents.isEmpty()) {
                    System.out.println("None");
                } else {
                    for (String content : contents) {
                        System.out.println(content);
                    }
                }
                testCases--;
            }
        }

        /**
         * Collects every valid tag content found in {@code line}, in order of
         * the start tag's position.
         *
         * @param line one line of tagged text
         * @return the extracted contents; empty when the line has none
         */
        private static List<String> extractContents(String line) {
            List<String> contents = new ArrayList<>();
            int cur = 0;
            for (;;) {
                int start = line.indexOf('<', cur);
                if (start < 0) {
                    break;
                }
                int end = line.indexOf('>', start);
                if (end < 0) {
                    break;
                }
                String tag = line.substring(start + 1, end);
                // Whatever this tag turns out to be, scanning resumes right after it.
                cur = end + 1;
                // Empty tags and end tags cannot open a content region.
                if (tag.isEmpty() || tag.charAt(0) == '/') {
                    continue;
                }
                // Look for the end tag only AFTER the start tag. Searching from
                // index 0 could find an earlier "</tag>", making the substring
                // range below inverted and throwing StringIndexOutOfBoundsException.
                int brk = line.indexOf("</" + tag + ">", cur);
                if (brk < 0) {
                    continue;
                }
                String output = line.substring(cur, brk);
                // A '<' inside the content means another tag is nested here,
                // so this outer region is not valid content.
                if (!output.isEmpty() && output.indexOf('<') < 0) {
                    contents.add(output);
                }
            }
            return contents;
        }
    }