PyParsing OnlyOnce

I'm parsing a file with pyparsing. It works fine, but I think the processing time can be improved by using the OnlyOnce class instead of OneOrMore at the "parse_file = pp.OneOrMore(dbuPerMicron | diearea | components) + pp.StringEnd()" line. After the components section of the def file there are other sections that are useless to me, and the parser takes a long time to finish because of these lines. Using OnlyOnce in parse_file gives: "AttributeError: 'NoneType' object has no attribute 'searchString'".

I appreciate any suggestions.

def parse_def(self):
        """Parse the UNITS, DIEAREA and COMPONENTS sections of a DEF file.

        Reads the whole file at the hard-coded path ``path_to.def``, builds a
        pyparsing grammar for the three sections, attaches
        ``self.handle_dbuPerMicron``, ``self.handle_diearea`` and
        ``self.handle_components`` as parse actions, and returns whatever
        ``searchString`` yields when the grammar is run over the file text.
        """
        # The context manager closes the file even if read() raises.
        with open("path_to.def", 'r') as ifile:
            def_string = ifile.read()

        EOL              = pp.LineEnd().suppress()
        # A statement terminator: ';' followed by the end of the line.
        linebreak        = pp.Suppress(";" + pp.LineEnd())
        identifier       = pp.Word(pp.alphanums+'_!<>/')
        number           = pp.Word(pp.nums + ".")
        word             = pp.Word(pp.alphas)

        # UNITS DISTANCE MICRONS
        dbuPerMicron_id  = pp.Keyword('UNITS DISTANCE MICRONS')
        dbuPerMicron     = pp.Group(dbuPerMicron_id + number('UnitsPerMicron')).setResultsName('dbuPerMicron')

        # DIEAREA: one or more parenthesised "( x y )" points, then the terminator
        diearea_id  = pp.Keyword('DIEAREA')
        diearea     = pp.Group(pp.Suppress(diearea_id) + pp.OneOrMore(pp.Suppress('(') + number + number + pp.Suppress(')')) + pp.Suppress(linebreak)).setResultsName('DIEAREA')

        # COMPONENTS
        components_id    = pp.Keyword('COMPONENTS')
        end_components   = pp.Keyword("END COMPONENTS").suppress()

        begin_comp       = pp.Keyword('-')  # starts each component entry
        ws_comp          = pp.Keyword('+')  # separates the parameters of a component
        comment          = pp.Keyword('#')
        comp_name        = identifier
        compName         = (comp_name('comp_name') + identifier('cell')).setResultsName('compName')
        EEQMASTER        = (pp.Suppress(ws_comp) + identifier('EEQMASTER') + identifier('macroName')).setResultsName('EEQMASTER')

        SOURCE           = (pp.Suppress(ws_comp) + identifier('SOURCE') + identifier('source_type')).setResultsName('SOURCE')

        PLACEMENT_ids    = pp.Keyword('FIXED') | pp.Keyword('COVER') | pp.Keyword('PLACED') | pp.Keyword('UNPLACED')
        PLACEMENT_coord  = pp.Suppress('(') + number('placement_x') + number('placement_y') + pp.Suppress(')')
        PLACEMENT_orient = word('orientation')
        # The coordinate and orientation are optional after the placement keyword.
        PLACEMENT        = PLACEMENT_ids + pp.ZeroOrMore(PLACEMENT_coord + PLACEMENT_orient)
        PLACEMENT        = (pp.Suppress(ws_comp) + PLACEMENT).setResultsName('PLACEMENT')

        HALO             = (pp.Suppress(ws_comp) + pp.Keyword('HALO') + pp.ZeroOrMore(pp.Keyword('SOFT')) + number('haloL') + number('haloB') + number('haloR') + number('haloT')).setResultsName('HALO')

        ROUTEHALO        = (pp.Suppress(ws_comp) + pp.Keyword('ROUTEHALO') + number('rhaloDist') + identifier('rhaloMinLayer') + identifier('rhaloMaxLayer')).setResultsName('ROUTEHALO')

        WEIGHT           = (pp.Suppress(ws_comp) + pp.Keyword('WEIGHT') + number('weight')).setResultsName('WEIGHT')

        REGION           = (pp.Suppress(ws_comp) + pp.Keyword('REGION') + identifier('region')).setResultsName('REGION')

        PROPERTY         = (pp.Suppress(ws_comp) + pp.Keyword('PROPERTY') + identifier('propName') + identifier('propVal')).setResultsName('PROPERTY')

        # One "- name cell + ... ;" entry; listAllMatches keeps every entry
        # under the 'subcomponents' results name rather than only the last one.
        subcomponent     = pp.Group(pp.Suppress(begin_comp)
                                  + pp.OneOrMore(compName)
                                  + pp.ZeroOrMore(EEQMASTER)
                                  + pp.ZeroOrMore(SOURCE)
                                  + pp.OneOrMore(PLACEMENT)
                                  + pp.ZeroOrMore(HALO)
                                  + pp.ZeroOrMore(ROUTEHALO)
                                  + pp.ZeroOrMore(WEIGHT)
                                  + pp.ZeroOrMore(REGION)
                                  + pp.ZeroOrMore(PROPERTY)
                                  + pp.Suppress(linebreak)).setResultsName('subcomponents', listAllMatches=True)

        components       = pp.Group(pp.Suppress(components_id) + number('numComps') + pp.Suppress(linebreak)
                                  + pp.OneOrMore(subcomponent )
                                  + pp.Suppress(end_components)).setResultsName('components')


        dbuPerMicron.setParseAction(self.handle_dbuPerMicron)
        diearea.setParseAction(self.handle_diearea)
        components.setParseAction(self.handle_components)

        parse_file       = pp.OneOrMore(dbuPerMicron | diearea | components) + pp.StringEnd()
        # parse_file       = pp.OnlyOnce(dbuPerMicron | diearea | components) + pp.StringEnd()  # It doesn't work

        return parse_file.searchString(def_string)

Example of a def file grammar:

Grammar:
[UNITS DISTANCE MICRONS dbuPerMicron;]

[DIEAREA pt pt [pt] ... ;]

COMPONENTS numComps ;
        [- compName modelName
        [+ EEQMASTER macroName]
        [+ SOURCE {NETLIST | DIST | USER | TIMING}]
        [+ {FIXED pt orient | COVER pt orient | PLACED pt orient | UNPLACED} ]
        [+ HALO [SOFT] left bottom right top]
        [+ ROUTEHALO haloDist minLayer maxLayer]
        [+ WEIGHT weight]
        [+ REGION regionName]
        [+ PROPERTY {propName propVal} ...]...;] ...
END COMPONENTS

Example of a def file:

VERSION 5.7 ;
DIVIDERCHAR "/" ;
BUSBITCHARS "[]" ;
DESIGN c1908 ;
UNITS DISTANCE MICRONS 2000 ;

PROPERTYDEFINITIONS
    COMPONENTPIN designRuleWidth REAL ;
    DESIGN FE_CORE_BOX_LL_X REAL 0.000 ;
    DESIGN FE_CORE_BOX_UR_X REAL 23.425 ;
    DESIGN FE_CORE_BOX_LL_Y REAL 0.000 ;
    DESIGN FE_CORE_BOX_UR_Y REAL 19.600 ;
END PROPERTYDEFINITIONS

DIEAREA ( 0 0 ) ( 46850 39200 ) ;

COMPONENTS 248 ;
- U293 NOR2_X1 + PLACED ( 6080 0 ) N
 ;
- U294 FA_X1 + PLACED ( 0 0 ) N
 ;
- U295 NAND2_X1 + PLACED ( 4560 5600 ) N
 ;
- U296 FA_X1 + PLACED ( 20520 2800 ) N
 ;
- U297 NAND2_X1 + PLACED ( 26600 2800 ) N
 ;
- U298 NAND2_X1 + PLACED ( 27740 2800 ) N
 ;
- U299 NAND2_X1 + PLACED ( 22800 8400 ) N
 ;
- U300 NOR2_X1 + PLACED ( 25460 5600 ) N
 ;
- U301 HA_X1 + PLACED ( 33440 5600 ) N
 ;
- U540 INV_X1 + PLACED ( 760 28000 ) N
 ;
END COMPONENTS

PINS 58 ;
- N1 + NET N1 + DIRECTION INPUT + USE SIGNAL
  + LAYER metal3 ( -70 0 ) ( 70 140 )
And thousands more lines that are useless to me.

Solution

If creating a parser in its own method, I try to just do the parser definition and return that, and have the caller responsible for applying the parser to the input string. This simplifies the call interface to the parser() method, and makes it much easier to test in isolation.

I changed your parse_def() method to parser() and wrapped it in a dummy X class, but left the contents almost verbatim, just changing your final parsing statement to:

    return dbuPerMicron | diearea | components

Then I used this code to run the parser against an arbitrarily long sample (your posted sample, plus 10,000,000 random characters, including spaces and newlines):

parser = X().parser()

# Walk the input lazily with scanString, keeping the tokens of each match
# and stopping as soon as three matches have been collected.
results = []
for tokens, _start, _end in parser.scanString(sample):
    results.append(tokens)
    # Count the matches gathered so far, not the tokens inside one match.
    if len(results) == 3:
        break

# Merge the separate parsed results into one with the builtin sum().
results = sum(results)

# The same thing as the loop above, with islice doing the
# "first three matches only" bookkeeping for us.
from itertools import islice
results = sum(tokens for tokens, _start, _end in islice(parser.scanString(sample), 3))

# Show what was parsed.
print(results.dump())

Creating the 10 million character bit was the most time-consuming task, but the parsing was able to stop after having parsed the 3 relevant segments. I wrote out the explicit looping using scanString, but with itertools.islice, you can collapse it down to one line.

The output from results.dump() looks like (long list lines are snipped for posting brevity):

[['UNITS DISTANCE MICRONS', '2000'], ['0', '0', ...
- DIEAREA: ['0', '0', '46850', '39200']
- components: ['248', ['U293', 'NOR2_X1', 'PLACED', '6080', ...
  - numComps: '248'
  - subcomponents: [['U293', 'NOR2_X1', 'PLACED', '6080', ... 
    [0]:
      ['U293', 'NOR2_X1', 'PLACED', '6080', '0', 'N']
      - PLACEMENT: ['PLACED', '6080', '0', 'N']
      - cell: 'NOR2_X1'
      - compName: ['U293', 'NOR2_X1']
      - comp_name: 'U293'
      - orientation: 'N'
      - placement_x: '6080'
      - placement_y: '0'
    [1]:
      ['U294', 'FA_X1', 'PLACED', '0', '0', 'N']
      - PLACEMENT: ['PLACED', '0', '0', 'N']
      - cell: 'FA_X1'
      - compName: ['U294', 'FA_X1']
      - comp_name: 'U294'
      - orientation: 'N'
      - placement_x: '0'
      - placement_y: '0'
    [2]:
      ['U295', 'NAND2_X1', 'PLACED', '4560', '5600', 'N']
      - PLACEMENT: ['PLACED', '4560', '5600', 'N']
      - cell: 'NAND2_X1'
      - compName: ['U295', 'NAND2_X1']
      - comp_name: 'U295'
      - orientation: 'N'
      - placement_x: '4560'
      - placement_y: '5600'
    [3]:
      ['U296', 'FA_X1', 'PLACED', '20520', '2800', 'N']
      - PLACEMENT: ['PLACED', '20520', '2800', 'N']
      - cell: 'FA_X1'
      - compName: ['U296', 'FA_X1']
      - comp_name: 'U296'
      - orientation: 'N'
      - placement_x: '20520'
      - placement_y: '2800'
    [4]:
      ['U297', 'NAND2_X1', 'PLACED', '26600', '2800', 'N']
      - PLACEMENT: ['PLACED', '26600', '2800', 'N']
      - cell: 'NAND2_X1'
      - compName: ['U297', 'NAND2_X1']
      - comp_name: 'U297'
      - orientation: 'N'
      - placement_x: '26600'
      - placement_y: '2800'
    [5]:
      ['U298', 'NAND2_X1', 'PLACED', '27740', '2800', 'N']
      - PLACEMENT: ['PLACED', '27740', '2800', 'N']
      - cell: 'NAND2_X1'
      - compName: ['U298', 'NAND2_X1']
      - comp_name: 'U298'
      - orientation: 'N'
      - placement_x: '27740'
      - placement_y: '2800'
    [6]:
      ['U299', 'NAND2_X1', 'PLACED', '22800', '8400', 'N']
      - PLACEMENT: ['PLACED', '22800', '8400', 'N']
      - cell: 'NAND2_X1'
      - compName: ['U299', 'NAND2_X1']
      - comp_name: 'U299'
      - orientation: 'N'
      - placement_x: '22800'
      - placement_y: '8400'
    [7]:
      ['U300', 'NOR2_X1', 'PLACED', '25460', '5600', 'N']
      - PLACEMENT: ['PLACED', '25460', '5600', 'N']
      - cell: 'NOR2_X1'
      - compName: ['U300', 'NOR2_X1']
      - comp_name: 'U300'
      - orientation: 'N'
      - placement_x: '25460'
      - placement_y: '5600'
    [8]:
      ['U301', 'HA_X1', 'PLACED', '33440', '5600', 'N']
      - PLACEMENT: ['PLACED', '33440', '5600', 'N']
      - cell: 'HA_X1'
      - compName: ['U301', 'HA_X1']
      - comp_name: 'U301'
      - orientation: 'N'
      - placement_x: '33440'
      - placement_y: '5600'
    [9]:
      ['U540', 'INV_X1', 'PLACED', '760', '28000', 'N']
      - PLACEMENT: ['PLACED', '760', '28000', 'N']
      - cell: 'INV_X1'
      - compName: ['U540', 'INV_X1']
      - comp_name: 'U540'
      - orientation: 'N'
      - placement_x: '760'
      - placement_y: '28000'
- dbuPerMicron: ['UNITS DISTANCE MICRONS', '2000']
  - UnitsPerMicron: '2000'

For items that you know will be integers or reals, you might use the expressions for integer and real (or just number which will match all the numeric forms) defined in pyparsing.pyparsing_common; these expressions will use a fast Regex for the parsing, and convert the result to the right Python type at parse time, so that you don't have to do this conversion later.