I am trying to parse files that are formatted as shown below. Sometimes the same characters that nestedExpr uses as its opener and closer ("{" and "}") also appear inside a literal that I need to extract as is.
Input:
{
# some comment
location 1 {
command 1
}
# this item is commented out
# location 2 {
# command 2
# }
location 3 {
command 3 /tmp; "PATH=/usr/bin:${PATH} ./abc.bat"
}
location 4 {
command 4 -c "PATH=/usr/local/bin:${PATH} ls -l"
}
}
Actual output:
[['# some comment',
'location 1 ',
['command 1'],
'# this item is commented out',
'# location 2 ',
['# command 2', '# '],
'location 3 ',
['command 3 /tmp; "PATH=/usr/bin:$', ['PATH'], './abc.bat"'],
'location 4 ',
['command 4 -c "PATH=/usr/local/bin:$', ['PATH'], 'ls -l"']]]
Expected output:
[['# some comment',
'location 1 ',
['command 1'],
'# this item is commented out',
'# location 2 ',
['# command 2', '# '],
'location 3 ',
['command 3 /tmp; "PATH=/usr/bin:${PATH} ./abc.bat"'],
'location 4 ',
['command 4 -c "PATH=/usr/local/bin:${PATH} ls -l"']]]
As can be seen, I want my script to return "${PATH}" as is, without parsing it into a nested list.
Below is the code I tried; any help is greatly appreciated.
from pyparsing import nestedExpr, Combine, Literal, OneOrMore, CharsNotIn
from pprint import pprint
# A content token is a run of single characters taken from one line.
# Each character is accepted only if it is not a brace (checked with
# negative lookahead) and not a newline, so every "{" and "}" is left
# for nestedExpr to treat as an opener/closer -- including the braces
# that belong to "${PATH}".
not_opener = ~Literal("{")
not_closer = ~Literal("}")
any_char_on_line = CharsNotIn('\n', exact=1)
content = Combine(OneOrMore(not_opener + not_closer + any_char_on_line))

# Nested brace groups become nested lists in the parse results.
parser = nestedExpr(opener='{', closer='}', content=content)

inputStr = ''' {
# some comment
location 1 {
command 1
}
# this item is commented out
# location 2 {
# command 2
# }
location 3 {
command 3 /tmp; "PATH=/usr/bin:${PATH} ./abc.bat"
}
location 4 {
command 4 -c "PATH=/usr/local/bin:${PATH} ls -l"
}
}'''

# parseAll=True requires the grammar to consume the entire input string.
parsed = parser.parseString(inputStr, parseAll=True)
output = parsed.asList()
pprint(output)
You'll have to expand your definition of `content` to explicitly detect these "${...}" elements before parsing the `CharsNotIn` term.
# Match a whole "${...}" reference as one combined token, so that its
# braces are consumed here instead of being seen as nesting delimiters.
dollar = Literal("$")
substitution_expr = Combine(dollar + "{" + ... + "}")

# Same shape as the original definition, except that at each position a
# complete substitution is tried first and a single non-newline character
# is only the fallback.
not_opener = ~Literal("{")
not_closer = ~Literal("}")
line_char = CharsNotIn('\n', exact=1)
content = Combine(
    OneOrMore(not_opener + not_closer + (substitution_expr | line_char))
)

# content could be simplified to just this: excluding the braces in
# CharsNotIn itself takes the place of the two negative lookaheads
plain_char = CharsNotIn('{}\n', exact=1)
content = Combine(OneOrMore(substitution_expr | plain_char))
Making this change I get:
[['# some comment',
'location 1 ',
['command 1'],
'# this item is commented out',
'# location 2 ',
['# command 2', '# '],
'location 3 ',
['command 3 /tmp; "PATH=/usr/bin:${PATH} ./abc.bat"'],
'location 4 ',
['command 4 -c "PATH=/usr/local/bin:${PATH} ls -l"']]]