Search code examples
pythoncsvutf-8character-encoding

Non UTF-8 compliant character "\x{0D}" at the end of output csv rows


I have a very frustrating problem that I don't know how to solve. The following Python script processes a bunch of XML documents from a directory and it extracts information from them. With that information, it creates a csv file.

import re
import time
import csv
from lxml import etree as et
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm
import ftfy


st = time.time()

XMLDIR = Path('/Users/josepm.fontana/Downloads/CICA_CORPUS_XML_CLEAN')
files = [e for e in XMLDIR.iterdir() if e.is_file()]
xml_doc = [f for f in files if f.with_suffix(".xml")]
myCSV_FILE = "/Volumes/SanDisk1TB/_CORPUS_WORK/CSVs/TestDataSet19-6-23_YZh.csv"
time_log = Path(
    '/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/log_time_pathlib.txt')
results = Path(
    '/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/TestDataSet19-6-23_YZh.txt')


tok_path = et.XPath('//tok | //dtok')


def xml_extract(xml_doc):

    root_element = et.parse(xml_doc).getroot()
    autor = None
    data = None
    tipus = None
    dialecte = None

    header = root_element.find("header")
    if header is not None:
        for el in header:

            if el.get("type") == "autor":
                autor = el.text
                autor = ftfy.fix_text(autor)
            elif el.get("type") == "data":
                data = el.text
                data = ftfy.fix_text(data)
            elif el.get("type") == "tipologia":
                tipus = el.text
                tipus = ftfy.fix_text(tipus)
            elif el.get("type") == "dialecte":
                dialecte = el.text
                dialecte = ftfy.fix_text(dialecte)


    all_toks = tok_path(root_element)

    matching_toks = filter(lambda tok: tok.get('xpos') is not None and tok.get(
        'xpos').startswith('A') and not (tok.get('xpos').startswith('AX')), all_toks)

    for el in matching_toks:
        preceding_tok = el.xpath(
            "./preceding-sibling::tok[1][@lemma and @xpos]")
        preceding_tok_with_dtoks = el.xpath(
            "./preceding-sibling::tok[1][not(@lemma) and not(@xpos)]"
        )
        following_dtok_of_dtok = el.xpath("./preceding-sibling::dtok[1]")

        if el.tag == 'tok':
            tok_dtok = 'tok'
            Adj = "".join(el.itertext())
            Adj_lemma = el.get('lemma')
            Adj_xpos = el.get('xpos')
            Adj = ftfy.fix_text(Adj)

        elif el.tag == 'dtok':
            tok_dtok = 'dtok'
            Adj = el.get('form')
            Adj_lemma = el.get('lemma')
            Adj_xpos = el.get('xpos')
            Adj = ftfy.fix_text(Adj)

        pos = all_toks.index(el)

        RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]

        RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]
        
        
        if RelevantPrecedingElements:
            
            prec1 = RelevantPrecedingElements[-1]
        else:
            prec1 = None

        if RelevantFollowingElements:
            foll1 = RelevantFollowingElements[0]
        else:
            foll1 = None
        
        
        
        

        ElementsContext = all_toks[max(pos - 6, 0):pos + 1]

        context_list = []
        
        
        
        if ElementsContext:
            for elem in ElementsContext:
                elem_text = "".join(elem.itertext())
                assert elem_text is not None
                context_list.append(elem_text)

        Adj = f"<{Adj}>"

        for elem in RelevantFollowingElements:
            elem_text = "".join(elem.itertext())
            assert elem_text is not None
            context_list.append(elem_text)

        fol_lem = foll1.get('lemma') if foll1 is not None else None
        prec_lem = prec1.get('lemma') if prec1 is not None else None
        fol_xpos = foll1.get('xpos') if foll1 is not None else None
        prec_xpos = prec1.get('xpos') if prec1 is not None else None
        
        
        fol_form = None
        if foll1 is not None:
            if foll1.tag == "tok":
                fol_form = foll1.text
            elif foll1.tag == "dtok":
                fol_form = foll1.get("form")
            
        prec_form = None
        if prec1 is not None:

            if prec1.tag == "tok":
                prec_form = prec1.text
            elif prec1.tag == "dtok":
                prec_form = prec1.get("form")

        context = " ".join(context_list).replace(
            " ,", ",").replace(" .", ".").replace("   ", " ").replace("  ", " ")

        llista = [
            context,
            prec_form,
            Adj,
            fol_form,
            prec_lem,
            Adj_lemma,
            fol_lem,
            prec_xpos,
            Adj_xpos,
            fol_xpos,
            tok_dtok,
            xml_doc.name,
            autor,
            data,
            tipus,
            dialecte,
        ]

        writer = csv.writer(csv_file, delimiter=";")
        writer.writerow(llista)
        with open(results, "a") as Results:
            Results.write(f"@@@ {context} @@@\n\n")
            Results.write(f"Source: {xml_doc.name}\n\n\n")


with open(myCSV_FILE, "a+", encoding="UTF8", newline='') as csv_file:




    #Parallel(n_jobs=-1,  prefer="threads")(delayed(xml_extract)(xml_doc) for xml_doc in tqdm(files))
    Parallel(n_jobs=-1, prefer="threads")(delayed(xml_extract)(xml_doc) for xml_doc in tqdm(files) if not xml_doc.name.startswith("."))

    

elapsed_time = time.time() - st

with open(
    time_log, "a"
) as Myfile:
    Myfile.write(f"\n \n The end: The whole process took {elapsed_time} \n")

The text file that is created is perfect UTF-8. All of the XML documents have been double checked and triple checked to make sure they are all also properly formated as UTF-8.

At the end of every row of the csv file that is created, however, there is the "\x{0D}" character.

I do not understand this at all. This script was based on the following script that creates properly formatted csv files where this problem does not occur. The main difference is that in the problematic code I introduced parallelization via the 'joblib' library because otherwise it took forever to process all those files.

import re
import time
import csv
from lxml import etree as et
from pathlib import Path


st = time.time()

#XMLDIR = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/CICA_WORKING_NEW')
XMLDIR = Path('/Users/josepm.fontana/Downloads/CICA_CORPUS_XML_CLEAN')
files = [e for e in XMLDIR.iterdir() if e.is_file()]
xml_doc = [f for f in files if f.with_suffix(".xml")]
myCSV_FILE = "/Volumes/SanDisk1TB/_CORPUS_WORK/CSVs/clitic_context_testTEST2.csv"
time_log = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/log_time_pathlib.txt')
results = Path('/Volumes/SanDisk1TB/_CORPUS_WORK/TEXT_FILES/resultsTEST2.txt')



tok_path = et.XPath('//tok')

def xml_extract(root_element):

    all_toks = tok_path(root_element)

    matching_toks = filter(lambda tok: re.match(r'^[EeLl][LlOoAa][Ss]*$', "".join(tok.itertext())) is not None and not(tok.get('xpos').startswith('D')), all_toks)

    for el in matching_toks: 

        fake_clitic = "".join(el.itertext())
        pos = all_toks.index(el)


        RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]
        print(RelevantPrecedingElements)

        prec1 = RelevantPrecedingElements[-1]
        #foll1 = all_toks[pos + 1]



        RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]
       #prec1 = RelevantFollowingElements[]
        #foll1 = all_toks[pos + 1]
        print(RelevantFollowingElements)

        foll1 = RelevantFollowingElements[0]


        context_list = []
        context_clean = []

        for elem in RelevantPrecedingElements:
            elem_text = "".join(elem.itertext())
            assert elem_text is not None
            context_list.append(elem_text)            
            context_clean.append(elem_text)

        # adjective = '<' + str(el.text) + '>'
        fake_clitic = f"<{fake_clitic}>"
        fake_clitic_clean = f"{el.text}"

        print(fake_clitic)
        context_list.append(fake_clitic)
        context_clean.append(fake_clitic_clean)

        for elem in RelevantFollowingElements:
            elem_text = "".join(elem.itertext())
            assert elem_text is not None
            context_list.append(elem_text)
            context_clean.append(elem_text)



        lema_fol = foll1.get('lemma') if foll1 is not None else None
        lema_prec = prec1.get('lemma') if prec1 is not None else None
        xpos_fol = foll1.get('xpos') if foll1 is not None else None
        xpos_prec = prec1.get('xpos') if prec1 is not None else None
        form_fol = foll1.text if foll1 is not None else None
        form_prec = prec1.text if prec1 is not None else None

        context = " ".join(context_list)
        clean_context = " ".join(context_clean).replace(" ,", ",").replace(" .", ".")
        print(f"Context is: {context}")


        llista = [
            context,
            lema_prec,
            xpos_prec,
            form_prec,
            fake_clitic,
            lema_fol,
            xpos_fol,
            form_fol,
        ]

        writer = csv.writer(csv_file, delimiter=";")
        writer.writerow(llista)
        with open(
            results, "a"
        ) as Results:
            Results.write(f"@@@ {context} @@@\n\n")
            Results.write(f"{clean_context}\n\n")
            Results.write(f"Source: {xml_doc.name}\n\n\n")

with open(myCSV_FILE, "a+", encoding="UTF8", newline="") as csv_file:

    for xml_doc in files:
        if xml_doc.name.startswith("."):
            continue
        doc = xml_doc.stem # this was 
        print(doc)
        start_file_time_beforeParse = time.time()
        print(start_file_time_beforeParse)
        print(
            f"{time.time() - st} seconds after the beginning of the process I'm starting to get the root of {xml_doc.name}"
        )
        file_root = et.parse(xml_doc).getroot()
        xml_extract(file_root)
        print(
            f"I ran through {xml_doc.name} in {time.time() - start_file_time_beforeParse} seconds!"
        )
        with open(
            time_log, "a"
        ) as Myfile:
            Myfile.write("Time it took to getroot and parse ")
            Myfile.write(xml_doc.name)
            Myfile.write("\n")
            Myfile.write("Time it took to loop through the entire ")
            Myfile.write(xml_doc.name)
            Myfile.write(" is: ")
            Myfile.write(f"{time.time() - start_file_time_beforeParse} seconds!")
            Myfile.write("\n")
            Myfile.write("\n")

elapsed_time = time.time() - st


with open(
    time_log, "a"
) as Myfile:
    Myfile.write(f"\n \n The end: The whole process took {elapsed_time} \n")


print("Execution time:", elapsed_time, "seconds")

I would greatly appreciate any help you can offer. This is really frustrating.

Here is a link to some sample XML files like the ones I'm trying to process:

Sample XML files

EDIT:

Adaptation of Zach Young's script for problematic task:

import csv
import re
import time

from pathlib import Path

from lxml import etree as et

beg_main = time.time()

#xmls_dir = Path("./xmls")

xmls_dir = Path('/PathTo/CLEAN_COMP_TEST2')
files = [e for e in xmls_dir.iterdir() if e.is_file()]
xml_files = [f for f in files if f.with_suffix(".xml")]

csv_path = Path("/PathTo/My_Output.csv")


csv_file = open(csv_path, "w", newline="", encoding="utf-8")
writer = csv.writer(csv_file, delimiter=";")

results_path = Path("/PathTo/my_results.txt")


results = open(results_path, "w", encoding="utf-8")

times_path = Path("/PathTo/my_times.txt")
times = open(times_path, "w", encoding="utf-8")

tok_path = et.XPath('//tok | //dtok')

def xml_extract(doc_root, fname: str):
    all_toks = tok_path(doc_root)    
    
    matching_toks = filter(
        lambda tok: 
            tok.get('xpos') is not None and tok.get
            (
        'xpos').startswith('A') and not (tok.get('xpos').startswith('AX')
        ), 
        all_toks
        )
    
    for el in matching_toks:
        preceding_tok = el.xpath(
            "./preceding-sibling::tok[1][@lemma and @xpos]")
        preceding_tok_with_dtoks = el.xpath(
            "./preceding-sibling::tok[1][not(@lemma) and not(@xpos)]"
        )
        following_dtok_of_dtok = el.xpath("./preceding-sibling::dtok[1]")

        if el.tag == 'tok':
            tok_dtok = 'tok'
            Adj = "".join(el.itertext())
            Adj_lemma = el.get('lemma')
            Adj_xpos = el.get('xpos')

        elif el.tag == 'dtok':
            tok_dtok = 'dtok'
            Adj = el.get('form')
            Adj_lemma = el.get('lemma')
            Adj_xpos = el.get('xpos')

        pos = all_toks.index(el)

        RelevantPrecedingElements = all_toks[max(pos - 6, 0):pos]

        RelevantFollowingElements = all_toks[pos + 1:max(pos + 6, 1)]

        
        if RelevantPrecedingElements:
            
            prec1 = RelevantPrecedingElements[-1]
        else:
            prec1 = None

        if RelevantFollowingElements:
            foll1 = RelevantFollowingElements[0]
        else:
            foll1 = None

                
        ElementsContext = all_toks[max(pos - 6, 0):pos + 1]


        context_list = []
        
        if ElementsContext:
            for elem in ElementsContext:
                elem_text = "".join(elem.itertext())
                assert elem_text is not None
                context_list.append(elem_text)



   
        
        Adj = f"<{Adj}>"

        

        for elem in RelevantFollowingElements:
            elem_text = "".join(elem.itertext())
            assert elem_text is not None
            context_list.append(elem_text)


        fol_lem = foll1.get('lemma') if foll1 is not None else None
        prec_lem = prec1.get('lemma') if prec1 is not None else None
        fol_xpos = foll1.get('xpos') if foll1 is not None else None
        prec_xpos = prec1.get('xpos') if prec1 is not None else None
        

        fol_form = None

        if foll1 is not None:
            if foll1.tag == "tok":
                fol_form = foll1.text
            elif foll1.tag == "dtok":
                fol_form = foll1.get("form")
                
        prec_form = None
        if prec1 is not None:

            if prec1.tag == "tok":
                prec_form = prec1.text
            elif prec1.tag == "dtok":
                prec_form = prec1.get("form")
                
        context = " ".join(context_list).replace(
            " ,", ",").replace(" .", ".").replace("   ", " ").replace("  ", " ")

        #print(f"Context is: {context}")
        

        llista = [
            context,
            prec_form,
            Adj,
            fol_form,
            prec_lem,
            Adj_lemma,
            fol_lem,
            prec_xpos,
            Adj_xpos,
            fol_xpos,
            tok_dtok,
            xml_file.name,
            autor,
            data,
            tipus,
            dialecte,
        ]

        writer.writerow(llista)
        results.write(f"@@@ {context} @@@\n\n")
        results.write(f"Source: {fname}\n\n\n")


for xml_file in xml_files:
    if xml_file.name.startswith("."):
        continue

    beg_extract = time.time()
    doc_root = et.parse(xml_file, parser=None).getroot()
    obra = None
    autor = None
    data = None
    tipus = None
    dialecte = None

    header = doc_root.find("header")
    if header is not None:
        for el in header:
            if el.get("type") == "obra":
                obra = el.text
            elif el.get("type") == "autor":
                autor = el.text
            elif el.get("type") == "data":
                data = el.text
            elif el.get("type") == "tipologia":
                tipus = el.text
            elif el.get("type") == "dialecte":
                dialecte = el.text

    xml_extract(doc_root, xml_file.name)

    times.write(f"Time to extract {xml_file.name}: {time.time() - beg_extract}s\n")

elapsed = time.time() - beg_main
times.write(f"\n \n The end: The whole process took {elapsed}s\n")

print("Execution time:", elapsed, "seconds")


Solution

  • Based on our little discussion in the comments, I recommend starting with something like the following. You can open all the files once for write at the very top, then reference them whereever you need to write (not in parallel, though, just synchronously):

    import csv
    import re
    import time
    
    from pathlib import Path
    
    from lxml import etree as et
    
    beg_main = time.time()
    
    xmls_dir = Path("./xmls")
    files = [e for e in xmls_dir.iterdir() if e.is_file()]
    xml_files = [f for f in files if f.with_suffix(".xml")]
    
    csv_path = Path("./my_output.csv")
    csv_file = open(csv_path, "w", newline="", encoding="utf-8")
    writer = csv.writer(csv_file, delimiter=";")
    
    results_path = Path("./my_results.txt")
    results = open(results_path, "w", encoding="utf-8")
    
    times_path = Path("./my_times.txt")
    times = open(times_path, "w", encoding="utf-8")
    
    tok_path = et.XPath("//tok")
    
    def xml_extract(doc_root, fname: str):
        all_toks = tok_path(doc_root)
    
        matching_toks = filter(
            lambda tok: (
                re.match(r"^[EeLl][LlOoAa][Ss]*$", "".join(tok.itertext())) is not None
                and not (tok.get("xpos").startswith("D"))
            ),
            all_toks,
        )
    
        for el in matching_toks:
            fake_clitic = "".join(el.itertext())
            pos = all_toks.index(el)
    
            RelevantPrecedingElements = all_toks[max(pos - 6, 0) : pos]
    
            prec1 = RelevantPrecedingElements[-1]
    
            RelevantFollowingElements = all_toks[pos + 1 : max(pos + 6, 1)]
    
            foll1 = RelevantFollowingElements[0]
    
            context_list = []
            context_clean = []
    
            for elem in RelevantPrecedingElements:
                elem_text = "".join(elem.itertext())
                assert elem_text is not None
                context_list.append(elem_text)
                context_clean.append(elem_text)
    
            fake_clitic = f"<{fake_clitic}>"
            fake_clitic_clean = f"{el.text}"
    
            context_list.append(fake_clitic)
            context_clean.append(fake_clitic_clean)
    
            for elem in RelevantFollowingElements:
                elem_text = "".join(elem.itertext())
                assert elem_text is not None
                context_list.append(elem_text)
                context_clean.append(elem_text)
    
            lema_fol = foll1.get("lemma") if foll1 is not None else None
            lema_prec = prec1.get("lemma") if prec1 is not None else None
            xpos_fol = foll1.get("xpos") if foll1 is not None else None
            xpos_prec = prec1.get("xpos") if prec1 is not None else None
            form_fol = foll1.text if foll1 is not None else None
            form_prec = prec1.text if prec1 is not None else None
    
            context = " ".join(context_list)
            clean_context = " ".join(context_clean).replace(" ,", ",").replace(" .", ".")
    
            llista = [
                context,
                lema_prec,
                xpos_prec,
                form_prec,
                fake_clitic,
                lema_fol,
                xpos_fol,
                form_fol,
            ]
    
            writer.writerow(llista)
    
            results.write(f"@@@ {context} @@@\n\n")
            results.write(f"{clean_context}\n\n")
            results.write(f"Source: {fname}\n\n\n")
    
    for xml_file in xml_files:
        if xml_file.name.startswith("."):
            continue
    
        beg_extract = time.time()
        doc_root = et.parse(xml_file, parser=None).getroot()
        xml_extract(doc_root, xml_file.name)
    
        times.write(f"Time to extract {xml_file.name}: {time.time() - beg_extract}s\n")
    
    elapsed = time.time() - beg_main
    times.write(f"\n \n The end: The whole process took {elapsed}s\n")
    
    print("Execution time:", elapsed, "seconds")
    

    When the program exits, Python will close the files for you, so you don't need all the with open(...) and the indentation.

    I ran this version and your version on the 16 XML files you shared.

    On my machine that makes some difference over opening the files inside extract_xml. Mine runs about in about 80% the time as (20% faster than?) yours. I have dual-channel SSDs though, so my read/writes are fast. If you don't have that kind of hardware, opening/writing/closing will take longer. I don't know if it's enough to see the slowdown you experienced, though. To process all 16 files in the ZIP you shared, mine ran in 0.0055 seconds, and yours ran in only 0.0066 seconds. Also, in my trials I found that just commenting out your print/debug statements saved time too.

    Try out my code on the sample XMLs you shared and see what it runs in compared to yours.

    As for the weird write error, you'll always get that with multiple agents trying to write at once. If you really want/need to pursue parallelism, you'll need to figure out how to sync the writes so only one process at a time tries/can write to any one file... which might defeat the whole reason you wanted to parallelize in the first place.

    Lemme know how this turns out for you. Good luck!