I have a very frustrating problem that I don't know how to solve. The following Python script processes a bunch of XML documents from a directory and it extracts information from them. With that information, it creates a csv file.
import re
import time
import csv
from lxml import etree as et
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm
import ftfy
st = time.time()
XMLDIR = Path('/Users/josepm.fontana/Downloads/CICA_CORPUS_XML_CLEAN')
files = [e for e in XMLDIR.iterdir() if e.is_file()]
xml_doc = [f for f in files if f.with_suffix(".xml")]
myCSV_FILE = "/Volumes/SanDisk1TB/_CORPUS_WORK/CSVs/TestDataSet19-6-23_YZh.csv"
time_log = Path(
'/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/log_time_pathlib.txt')
results = Path(
'/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/TestDataSet19-6-23_YZh.txt')
tok_path = et.XPath('//tok | //dtok')
def xml_extract(xml_doc):
root_element = et.parse(xml_doc).getroot()
autor = None
data = None
tipus = None
dialecte = None
header = root_element.find("header")
if header is not None:
for el in header:
if el.get("type") == "autor":
autor = el.text
autor = ftfy.fix_text(autor)
elif el.get("type") == "data":
data = el.text
data = ftfy.fix_text(data)
elif el.get("type") == "tipologia":
tipus = el.text
tipus = ftfy.fix_text(tipus)
elif el.get("type") == "dialecte":
dialecte = el.text
dialecte = ftfy.fix_text(dialecte)
all_toks = tok_path(root_element)
matching_toks = filter(lambda tok: tok.get('xpos') is not None and tok.get(
'xpos').startswith('A') and not (tok.get('xpos').startswith('AX')), all_toks)
for el in matching_toks:
preceding_tok = el.xpath(
"./preceding-sibling::tok[1][@lemma and @xpos]")
preceding_tok_with_dtoks = el.xpath(
"./preceding-sibling::tok[1][not(@lemma) and not(@xpos)]"
)
following_dtok_of_dtok = el.xpath("./preceding-sibling::dtok[1]")
if el.tag == 'tok':
tok_dtok = 'tok'
Adj = "".join(el.itertext())
Adj_lemma = el.get('lemma')
Adj_xpos = el.get('xpos')
Adj = ftfy.fix_text(Adj)
elif el.tag == 'dtok':
tok_dtok = 'dtok'
Adj = el.get('form')
Adj_lemma = el.get('lemma')
Adj_xpos = el.get('xpos')
Adj = ftfy.fix_text(Adj)
pos = all_toks.index(el)
RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]
RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]
if RelevantPrecedingElements:
prec1 = RelevantPrecedingElements[-1]
else:
prec1 = None
if RelevantFollowingElements:
foll1 = RelevantFollowingElements[0]
else:
foll1 = None
ElementsContext = all_toks[max(pos - 6, 0):pos + 1]
context_list = []
if ElementsContext:
for elem in ElementsContext:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
Adj = f"<{Adj}>"
for elem in RelevantFollowingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
fol_lem = foll1.get('lemma') if foll1 is not None else None
prec_lem = prec1.get('lemma') if prec1 is not None else None
fol_xpos = foll1.get('xpos') if foll1 is not None else None
prec_xpos = prec1.get('xpos') if prec1 is not None else None
fol_form = None
if foll1 is not None:
if foll1.tag == "tok":
fol_form = foll1.text
elif foll1.tag == "dtok":
fol_form = foll1.get("form")
prec_form = None
if prec1 is not None:
if prec1.tag == "tok":
prec_form = prec1.text
elif prec1.tag == "dtok":
prec_form = prec1.get("form")
context = " ".join(context_list).replace(
" ,", ",").replace(" .", ".").replace(" ", " ").replace(" ", " ")
llista = [
context,
prec_form,
Adj,
fol_form,
prec_lem,
Adj_lemma,
fol_lem,
prec_xpos,
Adj_xpos,
fol_xpos,
tok_dtok,
xml_doc.name,
autor,
data,
tipus,
dialecte,
]
writer = csv.writer(csv_file, delimiter=";")
writer.writerow(llista)
with open(results, "a") as Results:
Results.write(f"@@@ {context} @@@\n\n")
Results.write(f"Source: {xml_doc.name}\n\n\n")
with open(myCSV_FILE, "a+", encoding="UTF8", newline='') as csv_file:
#Parallel(n_jobs=-1, prefer="threads")(delayed(xml_extract)(xml_doc) for xml_doc in tqdm(files))
Parallel(n_jobs=-1, prefer="threads")(delayed(xml_extract)(xml_doc) for xml_doc in tqdm(files) if not xml_doc.name.startswith("."))
elapsed_time = time.time() - st
with open(
time_log, "a"
) as Myfile:
Myfile.write(f"\n \n The end: The whole process took {elapsed_time} \n")
The text file that is created is perfect UTF-8. All of the XML documents have been double checked and triple checked to make sure they are all also properly formated as UTF-8.
At the end of every row of the csv file that is created, however, there is the "\x{0D}" character.
I do not understand this at all. This script was based on the following script that creates properly formatted csv files where this problem does not occur. The main difference is that in the problematic code I introduced parallelization via the 'joblib' library because otherwise it took forever to process all those files.
import re
import time
import csv
from lxml import etree as et
from pathlib import Path
st = time.time()
#XMLDIR = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/CICA_WORKING_NEW')
XMLDIR = Path('/Users/josepm.fontana/Downloads/CICA_CORPUS_XML_CLEAN')
files = [e for e in XMLDIR.iterdir() if e.is_file()]
xml_doc = [f for f in files if f.with_suffix(".xml")]
myCSV_FILE = "/Volumes/SanDisk1TB/_CORPUS_WORK/CSVs/clitic_context_testTEST2.csv"
time_log = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/log_time_pathlib.txt')
results = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/resultsTEST2.txt')
tok_path = et.XPath('//tok')
def xml_extract(root_element):
all_toks = tok_path(root_element)
matching_toks = filter(lambda tok: re.match(r'^[EeLl][LlOoAa][Ss]*$', "".join(tok.itertext())) is not None and not(tok.get('xpos').startswith('D')), all_toks)
for el in matching_toks:
fake_clitic = "".join(el.itertext())
pos = all_toks.index(el)
RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]
print(RelevantPrecedingElements)
prec1 = RelevantPrecedingElements[-1]
#foll1 = all_toks[pos + 1]
RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]
#prec1 = RelevantFollowingElements[]
#foll1 = all_toks[pos + 1]
print(RelevantFollowingElements)
foll1 = RelevantFollowingElements[0]
context_list = []
context_clean = []
for elem in RelevantPrecedingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
context_clean.append(elem_text)
# adjective = '<' + str(el.text) + '>'
fake_clitic = f"<{fake_clitic}>"
fake_clitic_clean = f"{el.text}"
print(fake_clitic)
context_list.append(fake_clitic)
context_clean.append(fake_clitic_clean)
for elem in RelevantFollowingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
context_clean.append(elem_text)
lema_fol = foll1.get('lemma') if foll1 is not None else None
lema_prec = prec1.get('lemma') if prec1 is not None else None
xpos_fol = foll1.get('xpos') if foll1 is not None else None
xpos_prec = prec1.get('xpos') if prec1 is not None else None
form_fol = foll1.text if foll1 is not None else None
form_prec = prec1.text if prec1 is not None else None
context = " ".join(context_list)
clean_context = " ".join(context_clean).replace(" ,", ",").replace(" .", ".")
print(f"Context is: {context}")
llista = [
context,
lema_prec,
xpos_prec,
form_prec,
fake_clitic,
lema_fol,
xpos_fol,
form_fol,
]
writer = csv.writer(csv_file, delimiter=";")
writer.writerow(llista)
with open(
results, "a"
) as Results:
Results.write(f"@@@ {context} @@@\n\n")
Results.write(f"{clean_context}\n\n")
Results.write(f"Source: {xml_doc.name}\n\n\n")
with open(myCSV_FILE, "a+", encoding="UTF8", newline="") as csv_file:
for xml_doc in files:
if xml_doc.name.startswith("."):
continue
doc = xml_doc.stem # this was
print(doc)
start_file_time_beforeParse = time.time()
print(start_file_time_beforeParse)
print(
f"{time.time() - st} seconds after the beginning of the process I'm starting to get the root of {xml_doc.name}"
)
file_root = et.parse(xml_doc).getroot()
xml_extract(file_root)
print(
f"I ran through {xml_doc.name} in {time.time() - start_file_time_beforeParse} seconds!"
)
with open(
time_log, "a"
) as Myfile:
Myfile.write("Time it took to getroot and parse ")
Myfile.write(xml_doc.name)
Myfile.write("\n")
Myfile.write("Time it took to loop through the entire ")
Myfile.write(xml_doc.name)
Myfile.write(" is: ")
Myfile.write(f"{time.time() - start_file_time_beforeParse} seconds!")
Myfile.write("\n")
Myfile.write("\n")
elapsed_time = time.time() - st
with open(
time_log, "a"
) as Myfile:
Myfile.write(f"\n \n The end: The whole process took {elapsed_time} \n")
print("Execution time:", elapsed_time, "seconds")
I would greatly appreciate any help you can offer. This is really frustrating.
Here is a link to some sample XML files like the ones I'm trying to process:
EDIT:
Adaptation of Zach Young's script for problematic task:
import csv
import re
import time
from pathlib import Path
from lxml import etree as et
beg_main = time.time()
#xmls_dir = Path("./xmls")
xmls_dir = Path('/PathTo/CLEAN_COMP_TEST2')
files = [e for e in xmls_dir.iterdir() if e.is_file()]
xml_files = [f for f in files if f.with_suffix(".xml")]
csv_path = Path("/PathTo/My_Output.csv")
csv_file = open(csv_path, "w", newline="", encoding="utf-8")
writer = csv.writer(csv_file, delimiter=";")
results_path = Path("/PathTo/my_results.txt")
results = open(results_path, "w", encoding="utf-8")
times_path = Path("/PathTo/my_times.txt")
times = open(times_path, "w", encoding="utf-8")
tok_path = et.XPath('//tok | //dtok')
def xml_extract(doc_root, fname: str):
all_toks = tok_path(doc_root)
matching_toks = filter(
lambda tok:
tok.get('xpos') is not None and tok.get
(
'xpos').startswith('A') and not (tok.get('xpos').startswith('AX')
),
all_toks
)
for el in matching_toks:
preceding_tok = el.xpath(
"./preceding-sibling::tok[1][@lemma and @xpos]")
preceding_tok_with_dtoks = el.xpath(
"./preceding-sibling::tok[1][not(@lemma) and not(@xpos)]"
)
following_dtok_of_dtok = el.xpath("./preceding-sibling::dtok[1]")
if el.tag == 'tok':
tok_dtok = 'tok'
Adj = "".join(el.itertext())
Adj_lemma = el.get('lemma')
Adj_xpos = el.get('xpos')
elif el.tag == 'dtok':
tok_dtok = 'dtok'
Adj = el.get('form')
Adj_lemma = el.get('lemma')
Adj_xpos = el.get('xpos')
pos = all_toks.index(el)
RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]
RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]
if RelevantPrecedingElements:
prec1 = RelevantPrecedingElements[-1]
else:
prec1 = None
if RelevantFollowingElements:
foll1 = RelevantFollowingElements[0]
else:
foll1 = None
ElementsContext = all_toks[max(pos - 6, 0):pos + 1]
context_list = []
if ElementsContext:
for elem in ElementsContext:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
Adj = f"<{Adj}>"
for elem in RelevantFollowingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
fol_lem = foll1.get('lemma') if foll1 is not None else None
prec_lem = prec1.get('lemma') if prec1 is not None else None
fol_xpos = foll1.get('xpos') if foll1 is not None else None
prec_xpos = prec1.get('xpos') if prec1 is not None else None
fol_form = None
if foll1 is not None:
if foll1.tag == "tok":
fol_form = foll1.text
elif foll1.tag == "dtok":
fol_form = foll1.get("form")
prec_form = None
if prec1 is not None:
if prec1.tag == "tok":
prec_form = prec1.text
elif prec1.tag == "dtok":
prec_form = prec1.get("form")
context = " ".join(context_list).replace(
" ,", ",").replace(" .", ".").replace(" ", " ").replace(" ", " ")
#print(f"Context is: {context}")
llista = [
context,
prec_form,
Adj,
fol_form,
prec_lem,
Adj_lemma,
fol_lem,
prec_xpos,
Adj_xpos,
fol_xpos,
tok_dtok,
xml_file.name,
autor,
data,
tipus,
dialecte,
]
writer.writerow(llista)
results.write(f"@@@ {context} @@@\n\n")
results.write(f"Source: {fname}\n\n\n")
for xml_file in xml_files:
if xml_file.name.startswith("."):
continue
beg_extract = time.time()
doc_root = et.parse(xml_file, parser=None).getroot()
obra = None
autor = None
data = None
tipus = None
dialecte = None
header = doc_root.find("header")
if header is not None:
for el in header:
if el.get("type") == "obra":
obra = el.text
elif el.get("type") == "autor":
autor = el.text
elif el.get("type") == "data":
data = el.text
elif el.get("type") == "tipologia":
tipus = el.text
elif el.get("type") == "dialecte":
dialecte = el.text
xml_extract(doc_root, xml_file.name)
times.write(f"Time to extract {xml_file.name}: {time.time() - beg_extract}s\n")
elapsed = time.time() - beg_main
times.write(f"\n \n The end: The whole process took {elapsed}s\n")
print("Execution time:", elapsed, "seconds")
Based on our little discussion in the comments, I recommend starting with something like the following. You can open all the files once for write at the very top, then reference them whereever you need to write (not in parallel, though, just synchronously):
import csv
import re
import time
from pathlib import Path
from lxml import etree as et
beg_main = time.time()
xmls_dir = Path("./xmls")
files = [e for e in xmls_dir.iterdir() if e.is_file()]
xml_files = [f for f in files if f.with_suffix(".xml")]
csv_path = Path("./my_output.csv")
csv_file = open(csv_path, "w", newline="", encoding="utf-8")
writer = csv.writer(csv_file, delimiter=";")
results_path = Path("./my_results.txt")
results = open(results_path, "w", encoding="utf-8")
times_path = Path("./my_times.txt")
times = open(times_path, "w", encoding="utf-8")
tok_path = et.XPath("//tok")
def xml_extract(doc_root, fname: str):
all_toks = tok_path(doc_root)
matching_toks = filter(
lambda tok: (
re.match(r"^[EeLl][LlOoAa][Ss]*$", "".join(tok.itertext())) is not None
and not (tok.get("xpos").startswith("D"))
),
all_toks,
)
for el in matching_toks:
fake_clitic = "".join(el.itertext())
pos = all_toks.index(el)
RelevantPrecedingElements = all_toks[max(pos - 6, 0) : pos]
prec1 = RelevantPrecedingElements[-1]
RelevantFollowingElements = all_toks[pos + 1 : max(pos + 6, 1)]
foll1 = RelevantFollowingElements[0]
context_list = []
context_clean = []
for elem in RelevantPrecedingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
context_clean.append(elem_text)
fake_clitic = f"<{fake_clitic}>"
fake_clitic_clean = f"{el.text}"
context_list.append(fake_clitic)
context_clean.append(fake_clitic_clean)
for elem in RelevantFollowingElements:
elem_text = "".join(elem.itertext())
assert elem_text is not None
context_list.append(elem_text)
context_clean.append(elem_text)
lema_fol = foll1.get("lemma") if foll1 is not None else None
lema_prec = prec1.get("lemma") if prec1 is not None else None
xpos_fol = foll1.get("xpos") if foll1 is not None else None
xpos_prec = prec1.get("xpos") if prec1 is not None else None
form_fol = foll1.text if foll1 is not None else None
form_prec = prec1.text if prec1 is not None else None
context = " ".join(context_list)
clean_context = " ".join(context_clean).replace(" ,", ",").replace(" .", ".")
llista = [
context,
lema_prec,
xpos_prec,
form_prec,
fake_clitic,
lema_fol,
xpos_fol,
form_fol,
]
writer.writerow(llista)
results.write(f"@@@ {context} @@@\n\n")
results.write(f"{clean_context}\n\n")
results.write(f"Source: {fname}\n\n\n")
for xml_file in xml_files:
if xml_file.name.startswith("."):
continue
beg_extract = time.time()
doc_root = et.parse(xml_file, parser=None).getroot()
xml_extract(doc_root, xml_file.name)
times.write(f"Time to extract {xml_file.name}: {time.time() - beg_extract}s\n")
elapsed = time.time() - beg_main
times.write(f"\n \n The end: The whole process took {elapsed}s\n")
print("Execution time:", elapsed, "seconds")
When the program exits, Python will close the files for you, so you don't need all the with open(...)
and the indentation.
I ran this version and your version on the 16 XML files you shared.
On my machine that makes some difference over opening the files inside extract_xml. Mine runs about in about 80% the time as (20% faster than?) yours. I have dual-channel SSDs though, so my read/writes are fast. If you don't have that kind of hardware, opening/writing/closing will take longer. I don't know if it's enough to see the slowdown you experienced, though. To process all 16 files in the ZIP you shared, mine ran in 0.0055 seconds, and yours ran in only 0.0066 seconds. Also, in my trials I found that just commenting out your print/debug statements saved time too.
Try out my code on the sample XMLs you shared and see what it runs in compared to yours.
As for the weird write error, you'll always get that with multiple agents trying to write at once. If you really want/need to pursue parallelism, you'll need to figure out how to sync the writes so only one process at a time tries/can write to any one file... which might defeat the whole reason you wanted to parallelize in the first place.
Lemme know how this turns out for you. Good luck!