I have an HTML file which I want to parse in PySpark.
Example:
<MainStruct Rank="1">
  <Struct Name="A">
    <Struct Name="AA">
      <Struct Name="AAA">
        <Field Name="F1">Data</Field>
      </Struct>
      <Struct Name="ListPart">
        <List Name="ListName">
          <Struct Name="S1">
            <Field Name="F1">AAA</Field>
            <Field Name="F2">BBB</Field>
            <Field Name="F3">CCC</Field>
          </Struct>
          <Struct Name="S1">
            <Field Name="F1">XXX</Field>
            <Field Name="F2">GGG</Field>
            <Field Name="F3">BBB</Field>
          </Struct>
        </List>
      </Struct>
    </Struct>
  </Struct>
</MainStruct>
rdd_html = spark.sparkContext.wholeTextFiles(path_to_XML, minPartitions=1000, use_unicode=True)
df_html = spark.createDataFrame(rdd_html, ['filename', 'content'])
rdd_map = df_html.rdd.map(lambda x: xmltodict(x['content'], 'mainstruct'))
df_map = spark.createDataFrame(rdd_map)
df_map.display()
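One thing worth checking at this point is what schema Spark actually inferred, since none was passed to createDataFrame. A quick look, using only standard DataFrame methods:

# Show the schema Spark inferred from the Python dicts returned by xmltodict.
# If the column holding the list does not come out as an ArrayType here,
# that mismatch is the likely cause of the stringified output described below.
df_map.printSchema()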
But in my notebook output I have a problem with list elements. They are parsed incorrectly:
>object
>AA:
>ListPart:
ListName: "[{S1={F1=AAA, F2=BBB, F3=CCC}}, {S1={F1=XXX, F2=GGG, F3=BBB}}]"
>AAA:
F1: "Data"
The list element is represented as a single string.
My functions to parse it:
import re
import unicodedata
import json
from bs4 import BeautifulSoup

def xmltodict(content, first_tag=''):
    # Content from the XML file: strip newlines and whitespace between tags
    content = re.sub(r'\n', '', content)
    content = re.sub(r'\r', '', content)
    content = re.sub(r'>\s+<', '><', content)
    data = unicodedata.normalize('NFKD', content)
    # The lxml parser wraps the document in <html><body>...</body></html>
    # and lowercases tag and attribute names
    soup = BeautifulSoup(data, 'lxml')
    body = soup.find('body')
    if first_tag.strip() != '':
        struct = body.find(first_tag)
    else:
        struct = body
    return parser(struct)
def parser(struct):
    # Direct element children only; recursion handles the nesting
    struct_all = struct.findAll(True, recursive=False)
    struct_dict = {}
    for strc in struct_all:
        tag = strc.name
        tag_name_prop = strc.attrs['name']
        if tag == 'struct':
            d = parser(strc)
            el = {tag_name_prop: d}
            struct_dict.update(el)
        elif tag == 'field':
            v = strc.text
            struct_dict[tag_name_prop] = v
        elif tag == 'list':
            l_elem = []
            for child in strc.contents:
                soap_child = BeautifulSoup(str(child), 'lxml').find('body')
                l_elem.append(parser(soap_child))
            el = {tag_name_prop: l_elem}
            struct_dict.update(el)
    with open('result.txt', 'w') as file:
        file.write(json.dumps(struct_dict))
    return struct_dict
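Run outside Spark, the parser does return proper nested Python dicts and lists, so the parsing itself is not the problem. A quick local sanity check, using the example document from above pasted as a one-line string (a sketch, relying only on the two functions defined here):

# Local sanity check of the parser, independent of Spark.
sample = (
    '<MainStruct Rank="1"><Struct Name="A"><Struct Name="AA">'
    '<Struct Name="AAA"><Field Name="F1">Data</Field></Struct>'
    '<Struct Name="ListPart"><List Name="ListName">'
    '<Struct Name="S1"><Field Name="F1">AAA</Field><Field Name="F2">BBB</Field><Field Name="F3">CCC</Field></Struct>'
    '<Struct Name="S1"><Field Name="F1">XXX</Field><Field Name="F2">GGG</Field><Field Name="F3">BBB</Field></Struct>'
    '</List></Struct></Struct></Struct></MainStruct>'
)

result = xmltodict(sample, 'mainstruct')
print(json.dumps(result, indent=2))
# The list really is a Python list of dicts at this point:
print(type(result['A']['AA']['ListPart']['ListName']))   # <class 'list'>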
The result in the txt file is what I want to receive:
"A": { "AA": {
"AAA": {"F1": "Data"},
"ListPart": {
"ListName": [
{
"S1": {"F1": "AAA",
"F2": "BBB",
"F3": "CCC"
}
},
{
"S1": { "F1": "XXX",
"F2": "GGG",
"F3": "BBB"
}}]
}}}
But in my notebook output, shown above, the list is parsed incorrectly. Why is it represented as a single string, and why are there "=" symbols instead of ":"?
I finally resolved my problem. The reason was that I should define a schema explicitly and pass it to createDataFrame:
df_map = spark.createDataFrame(rdd_map, schema)
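The schema itself is not shown here, but for the sample document above it could look roughly like the sketch below. The nesting, the field names, and declaring every leaf as a string are all assumptions taken from that example, so adapt them to the real files.

from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Sketch of an explicit schema matching the example document above.
# Field names and nesting are assumptions based on that sample.
s1_schema = StructType([
    StructField("S1", StructType([
        StructField("F1", StringType()),
        StructField("F2", StringType()),
        StructField("F3", StringType()),
    ]))
])

schema = StructType([
    StructField("A", StructType([
        StructField("AA", StructType([
            StructField("AAA", StructType([StructField("F1", StringType())])),
            StructField("ListPart", StructType([
                # Declared as ArrayType so the nested list is kept as an array
                # instead of being inferred (and stringified) by Spark.
                StructField("ListName", ArrayType(s1_schema)),
            ])),
        ])),
    ])),
])

df_map = spark.createDataFrame(rdd_map, schema)
df_map.printSchema()

The {F1=AAA, F2=BBB, F3=CCC} rendering with "=" signs looks like the JVM's default string form of a map-like value, which is consistent with the list having been carried as one opaque value under the inferred schema; once ListName is declared as an ArrayType there is nothing left to stringify.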