Search code examples
python, python-3.x, nested, parentheses

Extract sentences in nested parentheses using Python


I have multiple .txt files in a directory. Here is a sample of one of my .txt files:

kkkkk;

  select xx("xE'", PUT(xx.xxxx.),"'") jdfjhf:jhfjj from xxxx_x_xx_L ;
quit; 

/* 1.xxxxx FROM xxxx_x_Ex_x */ 
proc sql; ("TRUuuuth");
hhhjhfjs as fdsjfsj:
select * from djfkjd to jfkjs
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
 );


jjjjjj;

  select xx("xE'", PUT(xx.xxxx.),"'") jdfjhf:jhfjj from xxxx_x_xx_L ;
quit; 

/* 1.xxxxx FROM xxxx_x_Ex_x */ ()
proc sql; ("CUuuiiiiuth");
hhhjhfjs as fdsjfsj:
select * from djfkjd to jfkjs
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
 );

I am trying to extract all the text enclosed in (possibly nested) parentheses from my .txt files.

I have tried multiple methods, such as tracking the parentheses with a stack, but I get an error which says "list index out of range" when the code parses one of the .txt files. I'm guessing it's because there is nothing written inside some of the brackets.

I have been trying it with regex too, using this code:

# Asker's regex attempt: read the whole file and print one match per line
# that contains parentheses.
# NOTE: `re` is used below but its import is not shown in this snippet.
with open('lan sample text file.txt','r') as fd:
    lines = fd.read()

    # Keyword variants collected here are never consulted afterwards in
    # this snippet.
    check = set()
    check.add("Select")
    check.add("select")
    check.add("SELECT")
    check.add("from")
    check.add("FROM")
    check.add("From")
    # The capture group starts at the first '(' on a line and, because `.*`
    # is greedy, runs up to the last ')' on that line -- but that final ')'
    # sits outside the group, so every captured string is missing its
    # closing parenthesis.  `.` does not match a newline (re.DOTALL is not
    # set), so a group spanning several lines can never be captured;
    # re.MULTILINE only changes `^`/`$`, which this pattern does not use.
    items=re.findall("(\(.*)\)",lines,re.MULTILINE)
    for x in items:
        print(x)

but my output is:

("xE'", PUT(xx.xxxx.),"'"
("TRUuuuth"
((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.)
("xE'", PUT(xx.xxxx.),"'"
("CUuuiiiiuth"
((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.)

My desired output should look something like this:

("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
 )
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
 )

Solution

  • This is not the most optimised solution, but it will solve your problem.

    Solution (Just replace test.txt with your file name)

    def extract_parenthesized(text):
        """Return every outermost parenthesised group found in *text*.

        Each returned string starts at an opening '(' seen at nesting depth
        zero and ends at its matching ')', with any nested parentheses and
        newlines in between kept verbatim.  Empty groups such as "()" are
        returned as well.

        A ')' that has no matching '(' is ignored, and a group that is still
        open when the text ends is discarded.
        """
        groups = []
        # Characters of the group currently being collected.
        current = []
        # Number of '(' seen that have not been closed yet.
        depth = 0
        for char in text:
            if char == '(':
                depth += 1
            elif char == ')':
                if depth == 0:
                    # Unmatched ')': nothing is open, so there is nothing
                    # to close or collect.
                    continue
                depth -= 1
                if depth == 0:
                    # The outermost group just closed: emit it and reset.
                    current.append(char)
                    groups.append(''.join(current))
                    current = []
                    continue
            # Collect everything between the first '(' and its last ')'.
            if depth:
                current.append(char)
        return groups


    result = []
    if __name__ == '__main__':
        with open('test.txt', 'r') as fd:
            # Scan the file as one string so groups may span several lines.
            result = extract_parenthesized(fd.read())

        for res in result:
            print(res)
    

    Result:

    WS:python rameshrv$ python3 /Users/rameshrv/Documents/python/test.py
    ("xE'", PUT(xx.xxxx.),"'")
    ("TRUuuuth")
    (
    SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
        FROM &xxx..xxx_xxx_xxE
    where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
          (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
     )
    ("xE'", PUT(xx.xxxx.),"'")
    ()
    ("CUuuiiiiuth")
    (SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
        FROM &xxx..xxx_xxx_xxE
    where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and 
          (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
     )