If I have an XML document that I have parsed with lxml's objectify, how do I efficiently get slices of the resulting list?
My script:
# from lxml import etree
from lxml import objectify
import argparse
import os
# Command-line interface: one or more paths (only the first is used) plus an
# optional file-extension filter.
parser = argparse.ArgumentParser()
parser.add_argument("path", type=str, nargs="+")
parser.add_argument('-e',
                    '--extension',
                    default='',
                    help='File extension to filter by.')
args = parser.parse_args()
my_dir = args.path[0]

# Collect every matching file below my_dir, stored relative to my_dir.
# Previously the inner loop reused ``name_pattern`` as its loop variable, so
# the extension filter was never applied, the joined path was thrown away,
# and ``file_list`` only held the bare names of the last directory visited.
file_list = []
for dir_path, subdir_list, names in os.walk(my_dir):
    for name in names:
        # An empty extension (the default) matches every file.
        if name.endswith(args.extension):
            full_path = os.path.join(dir_path, name)
            file_list.append(os.path.relpath(full_path, my_dir))


def getsMeet(file_list):
    """Yield the path of every file in *file_list*, in sorted order.

    Args:
        file_list: File names relative to ``my_dir``.

    Yields:
        Each name joined onto ``my_dir``.
    """
    for filename in sorted(file_list):
        # os.path.join supplies the separator when my_dir has no trailing
        # slash; plain string concatenation did not.
        yield os.path.join(my_dir, filename)


def parseXML():
    """Parse every collected file and print details of its first nomination.

    For each file this prints an objectify dump of
    ``root.race.nomination[0]`` and then whether the ``.race.nomination``
    path exists on the parsed root.
    """
    # The object path does not depend on the file, so build it once.
    find = objectify.ObjectPath(".race.nomination")
    for file in getsMeet(file_list):
        # Read bytes so lxml handles decoding itself; it rejects already
        # decoded strings that carry an XML encoding declaration.
        with open(file, "rb") as f:
            xml = f.read()
        root = objectify.fromstring(xml)
        print(objectify.dump(root.race.nomination[0]))
        print(find.hasattr(root))


parseXML()
The XML is nested as follows: meeting (root) > club > race > condition | nomination.
So this print shows the structure of a nomination:
print(objectify.dump(root.race.nomination[0]))
(pyxml) [sayth@localhost pyxml]$ python xrace.py data/ -e .xml
nomination = '' [StringElement]
* number = '8'
* saddlecloth = '8'
* horse = 'Chipanda'
* id = '198926'
* idnumber = ''
* regnumber = ''
* blinkers = '0'
* trainernumber = '235'
* trainersurname = "O'Shea"
* trainerfirstname = 'John'
* trainertrack = 'Agnes Banks/Hawkesbury'
* rsbtrainername = "John O'Shea"
* jockeynumber = '84015'
* jockeysurname = 'Avdulla'
* jockeyfirstname = 'Brenton'
* barrier = '5'
* weight = '54'
* rating = '0'
* description = 'B F 2 Sepoy x Lobola (Anabaa(USA))'
* colours = 'Royal Blue'
* owners = 'Godolphin '
* dob = '2013-10-08T00:00:00'
* age = '3'
* sex = 'F'
* career = '2-0-0-2 $30225.00'
* thistrack = '1-0-0-1 $15000.00'
* thisdistance = '0-0-0-0'
* goodtrack = '0-0-0-0'
* heavytrack = '0-0-0-0'
* slowtrack = ''
* deadtrack = ''
* fasttrack = '0-0-0-0'
* firstup = '2-0-0-2 $30225.00'
* secondup = '0-0-0-0'
* mindistancewin = '0'
* maxdistancewin = '0'
* finished = '1'
* weightvariation = '0'
* variedweight = '54'
* decimalmargin = '0.00'
* penalty = '0'
* pricestarting = '$3.50'
* sectional200 = '0'
* sectional400 = '0'
* sectional600 = '0'
* sectional800 = '0'
* sectional1200 = '0'
* bonusindicator = 'E'
True
If I want to return only these elements, how should I do it?
* number = '8'
* saddlecloth = '8'
* horse = 'Chipanda'
* id = '198926'
* barrier = '5'
* weight = '54'
* rating = '0'
* description = 'B F 2 Sepoy x Lobola (Anabaa(USA))'
* colours = 'Royal Blue'
* owners = 'Godolphin '
* dob = '2013-10-08T00:00:00'
* age = '3'
* sex = 'F'
* career = '2-0-0-2 $30225.00'
* thistrack = '1-0-0-1 $15000.00'
* thisdistance = '0-0-0-0'
* goodtrack = '0-0-0-0'
* heavytrack = '0-0-0-0'
* finished = '1'
* weightvariation = '0'
* variedweight = '54'
* decimalmargin = '0.00'
* penalty = '0'
* pricestarting = '$3.50'
Sample XML
<meeting id="42977" barriertrial="0" venue="Rosehill Gardens" date="2016-05-21T00:00:00" gearchanges="-1" stewardsreport="-1" gearlist="-1" racebook="0" postracestewards="0" meetingtype="TAB" rail="Timing - Electronic : Rail - +6m" weather="Fine " trackcondition="Good 3 " nomsdeadline="2016-05-16T11:00:00" weightsdeadline="2016-05-17T16:00:00" acceptdeadline="2016-05-18T09:00:00" jockeydeadline="2016-05-18T12:00:00">
<club abbrevname="Australian Turf Club" code="56398" associationclass="1" website="http://" />
<race id="215411" number="1" nomnumber="9" division="0" name="LES CARLYON AC PLATE" mediumname="2Y-SWP" shortname="2Y-SWP" stage="Results" distance="1200" minweight="0" raisedweight="0" class="~ " age="2 " grade="0" weightcondition="SWP " trophy="0" owner="0" trainer="0" jockey="0" strapper="0" totalprize="85000" first="48750" second="16750" third="8350" fourth="4150" fifth="2000" time="2016-05-21T11:25:00" bonustype="BOB7 " nomsfee="0" acceptfee="0" trackcondition="Good 3 " timingmethod="Electronic" fastesttime="1-10.22 " sectionaltime="600/34.78 " formavailable="0" racebookprize="Of $85000. First $48750, second $16750, third $8350, fourth $4150, fifth $2000, sixth $1000, seventh $1000, eighth $1000, ninth $1000, tenth $1000">
<condition line="1">Of $85000. First $48750, second $16750, third $8350, fourth $4150, fifth $2000, sixth $1000, seventh $1000, eighth $1000, ninth $1000, tenth $1000</condition>
<condition line="2">Starter Subsidy: $200 for non-prize earning runners.</condition>
<condition line="3">No class restriction, Set Weights plus Penalties, For Two-Years-Old, No sex restriction</condition>
<condition line="4">BOBS Bonus available: $20,000</condition>
<condition line="5">Apprentices can claim. Field Limit: 16 + 4 EM</condition>
<nomination number="8" saddlecloth="8" horse="Chipanda" id="198926" idnumber="" regnumber="" blinkers="0" trainernumber="235" trainersurname="O'Shea" trainerfirstname="John" trainertrack="Agnes Banks/Hawkesbury" rsbtrainername="John O'Shea" jockeynumber="84015" jockeysurname="Avdulla" jockeyfirstname="Brenton" barrier="5" weight="54" rating="0" description="B F 2 Sepoy x Lobola (Anabaa(USA))" colours="Royal Blue" owners="Godolphin " dob="2013-10-08T00:00:00" age="3" sex="F" career="2-0-0-2 $30225.00" thistrack="1-0-0-1 $15000.00" thisdistance="0-0-0-0" goodtrack="0-0-0-0" heavytrack="0-0-0-0" slowtrack="" deadtrack="" fasttrack="0-0-0-0" firstup="2-0-0-2 $30225.00" secondup="0-0-0-0" mindistancewin="0" maxdistancewin="0" finished="1" weightvariation="0" variedweight="54" decimalmargin="0.00" penalty="0" pricestarting="$3.50" sectional200="0" sectional400="0" sectional600="0" sectional800="0" sectional1200="0" bonusindicator="E" />
<nomination number="1" saddlecloth="1" horse="Legerity" id="200769" idnumber="" regnumber="" blinkers="0" trainernumber="77974" trainersurname="Hawkes" trainerfirstname="Michael" trainertrack="Rosehill" rsbtrainername="Michael, Wayne &amp; John Hawkes" jockeynumber="2687" jockeysurname="Reith" jockeyfirstname="Christian" barrier="1" weight="57.5" rating="0" description="B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))" colours="Purple, Gold Checks, Quartered Cap" owners="Highgrove Stud Syndicate (Mgr: R T Gilbert)" dob="2013-08-30T00:00:00" age="3" sex="C" career="4-1-1-1 $85075.00" thistrack="1-1-0-0 $68750.00" thisdistance="0-0-0-0" goodtrack="3-1-0-1 $77150.00" heavytrack="0-0-0-0" slowtrack="" deadtrack="" fasttrack="0-0-0-0" firstup="2-0-1-1 $15125.00" secondup="2-1-0-0 $69950.00" mindistancewin="0" maxdistancewin="0" finished="2" weightvariation="0" variedweight="57.5" decimalmargin="0.50" penalty="0" pricestarting="$2.50F" sectional200="0" sectional400="0" sectional600="0" sectional800="0" sectional1200="0" bonusindicator="E" />
</race>
</meeting>
I can get all the values out with this defaultdict; however, it seems I am not using objectify properly.
# Group every attribute value of each <nomination> under its attribute name.
d = defaultdict(list)
# nomItems = ['id', 'horse']
nominations = root.xpath('//race/nomination')
for nomination in nominations:
    for key, value in nomination.items():
        d[key].append(value)
# Convert to a plain dict so the output is not wrapped in defaultdict(...).
pprint(dict(d))
You could get multiple attributes in a single expression, but you would need to list them in the order they appear if you wanted to know which value belongs to which attribute:
# Select only the number, saddlecloth and horse attributes of every
# nomination in a single XPath expression.
root.xpath('//race/nomination/@*[name() = "number" or name() = "saddlecloth" or name() = "horse"]')
Another option is to pull from the attrib dict, using operator.itemgetter:
from operator import itemgetter
# Names of the attributes to extract from each <nomination>.
atts = ("number", "id", "horse", "saddlecloth", "barrier", "weight", "rating", "description", "colours",
        "owners", "dob", "age", "sex", "career", "thistrack", "thisdistance", "goodtrack", "heavytrack",
        "finished", "weightvariation", "variedweight", "decimalmargin", "penalty", "pricestarting")
for sample in root.xpath('//race/nomination'):
    # itemgetter(*atts) returns the values in the same order as atts, so
    # zipping pairs each name with its value; a missing attribute raises
    # KeyError.
    print(dict(zip(atts, itemgetter(*atts)(sample.attrib))))
For your example xml would output:
{'thistrack': '1-0-0-1 $15000.00', 'rating': '0', 'weight': '54', 'number': '8', 'sex': 'F', 'id': '198926', 'penalty': '0', 'horse': 'Chipanda', 'pricestarting': '$3.50', 'colours': 'Royal Blue', 'saddlecloth': '8', 'description': 'B F 2 Sepoy x Lobola (Anabaa(USA))', 'barrier': '5', 'weightvariation': '0', 'finished': '1', 'variedweight': '54', 'goodtrack': '0-0-0-0', 'owners': 'Godolphin ', 'decimalmargin': '0.00', 'dob': '2013-10-08T00:00:00', 'thisdistance': '0-0-0-0', 'age': '3', 'heavytrack': '0-0-0-0', 'career': '2-0-0-2 $30225.00'}
{'thistrack': '1-1-0-0 $68750.00', 'rating': '0', 'weight': '57.5', 'number': '1', 'sex': 'C', 'id': '200769', 'penalty': '0', 'horse': 'Legerity', 'pricestarting': '$2.50F', 'colours': 'Purple, Gold Checks, Quartered Cap', 'saddlecloth': '1', 'description': 'B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))', 'barrier': '1', 'weightvariation': '0', 'finished': '2', 'variedweight': '57.5', 'goodtrack': '3-1-0-1 $77150.00', 'owners': 'Highgrove Stud Syndicate (Mgr: R T Gilbert)', 'decimalmargin': '0.50', 'dob': '2013-08-30T00:00:00', 'thisdistance': '0-0-0-0', 'age': '3', 'heavytrack': '0-0-0-0', 'career': '4-1-1-1 $85075.00'}
Or, if you want to group the values in a defaultdict:
from collections import defaultdict
from operator import itemgetter
# Collect, per attribute name, the values from every <nomination>.
d = defaultdict(list)
# Build the getter once; it returns the values in the same order as atts.
pick = itemgetter(*atts)
for sample in root.xpath('//race/nomination'):
    values = pick(sample.attrib)
    for k, v in zip(atts, values):
        d[k].append(v)
print(d)
Which would give you:
defaultdict(<type 'list'>, {'thistrack': ['1-0-0-1 $15000.00', '1-1-0-0 $68750.00'], 'rating': ['0', '0'], 'weight': ['54', '57.5'], 'number': ['8', '1'], 'sex': ['F', 'C'], 'id': ['198926', '200769'], 'penalty': ['0', '0'], 'horse': ['Chipanda', 'Legerity'], 'pricestarting': ['$3.50', '$2.50F'], 'colours': ['Royal Blue', 'Purple, Gold Checks, Quartered Cap'], 'saddlecloth': ['8', '1'], 'description': ['B F 2 Sepoy x Lobola (Anabaa(USA))', 'B C 2 Snitzel x Simply Spiteful(USA) (Speightstown(USA))'], 'barrier': ['5', '1'], 'weightvariation': ['0', '0'], 'finished': ['1', '2'], 'variedweight': ['54', '57.5'], 'goodtrack': ['0-0-0-0', '3-1-0-1 $77150.00'], 'owners': ['Godolphin ', 'Highgrove Stud Syndicate (Mgr: R T Gilbert)'], 'decimalmargin': ['0.00', '0.50'], 'dob': ['2013-10-08T00:00:00', '2013-08-30T00:00:00'], 'thisdistance': ['0-0-0-0', '0-0-0-0'], 'age': ['3', '3'], 'heavytrack': ['0-0-0-0', '0-0-0-0'], 'career': ['2-0-0-2 $30225.00', '4-1-1-1 $85075.00']})
Or:
# Same grouping as before, using direct lookups instead of itemgetter.
d = defaultdict(list)
for nomination in root.xpath('//race/nomination'):
    attributes = nomination.attrib
    for name in atts:
        # Direct indexing: a missing attribute raises KeyError.
        d[name].append(attributes[name])
print(d)
To use a default value of None for any missing keys/attributes:
for sample in root.xpath('//race/nomination'):
    # .get returns None for attributes that are absent, so no KeyError.
    print({name: sample.attrib.get(name) for name in atts})
Dict lookups are O(1), so I don't think you are going to find a much more efficient way to pull out the attributes you want.