Search code examples
pythoncsvrow

all data variables in the same row CSV with Python


# coding=utf-8
# Libreria RegEx de Python.
import re
# Libreria para rutas.
import os
import csv

# function betwwen: return the value between two words a and b
def between(value, a, b):
    pos_a = value.find(a)  # Find and validate before-part.
    if pos_a == -1: return ""  # Find and validate after part.
    pos_b = value.rfind(b)
    if pos_b == -1: return ""  # Return middle part.
    adjusted_pos_a = pos_a + len(a)
    if adjusted_pos_a >= pos_b: return ""
    return value[adjusted_pos_a:pos_b]

# function scan folder DiarioOficial
def scan_folder():
    # directory 'path'
    path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
    # contador de ficheros del path
    count = 0

    # creation csv as csvFile
    with open('All_Companies1.csv', 'a') as csvFile:
        # iterate all paths in the folder DiarioOficial without name
        for (path, dirnames, file_names) in os.walk(path):
            # iterate over all the files in the path (+ file_name)
            for file_name in file_names:
                # Add extension that is required
                if file_name.endswith(".txt"):
                    # summatory count files in DiarioOficial folder
                    count = count + 1
                    # concatenation path + file name
                    file_path=os.path.join(path, file_name)
                    #print(file_path)
                    # open and read the file path
                    mensaje = open(file_path).read()
                    # Replace a newline for a space
                    mensaje = mensaje.replace("\n","")

                    # Company Name
                    keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
                    # re.escape to solve the problem with metacharacters in keyword_obj
                    keywords_cap = map(re.escape, keywords_cap)
                    # sorting the items by lengh in descending order
                    keywords_cap.sort(key=len, reverse=True)
                    obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
                    if obj:
                        # To obtain the first match obj.search(mensaje).group(1)
                        company_name = obj.search(mensaje)
                    else:
                        company_name = "None"

                    # CVE Number of the file
                    regex = r"\s*CVE\s+([^|]*)"
                    matches = re.search(regex, mensaje)
                    if matches:
                        company_cve = matches.group(1).strip()
                    else:
                        company_cve = "None"

                    # Section of diariooficial.interior.gob.cl
                    company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
                    if company_sect:
                        company_sect = company_sect
                    else:
                        company_sect = "None"

                    # Name of the person that constitutes the company
                    company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
                    if company_ceo:
                        company_ceo = company_ceo.group(1)
                    else:
                        company_ceo = "None"

                    # File Number from Section
                    num_reg = r'\sNúm.\s+([^|]*)'
                    match_num = re.search(num_reg, mensaje)
                    if match_num:
                        company_numsect = match_num.group(1)
                    else:
                        company_numsect = "None"

                    # Social Capital ($)
                    cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
                    caps = re.search(cap, mensaje)
                    if caps:
                        company_capital = caps.group()
                    else:
                        company_capital = 'None'

                    csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
                    headers = ['COMPANY NAME', 'CVE', 'SECTION','CEO NAME','NUMBER SECTOR','COMPANY CAPITAL']
                    writer = csv.writer(csvFile, delimiter=',') # create a csv delimited by comma
                    writer.writerow(headers)  # print the header row
                    writer.writerow(csvData)  # print the Data in csv
            # Number of txt files
            print (count)
scan_folder()

I have this script that create a csv with the data extracted from a text in specific path. In spite of the errors that can be on RegEx, mainly it extracts parts of text that it keeps them in variables and the printa in a csv. Each company must have a single line in this csv. In this way, when the csv is opened, the number of companies and all the information can be visualized by variables.

My problem is that when I see the CSV called, in this case, All_companies1, the data is not put in the same row, they jump.

Also, the titles are repeated, and I do not want them to repeat themselves

Also, the titles are repeated, and I do not want them to repeat themselves.


Solution

  • First try changing the mode for the csvFile from a (append) to w (write), also check if the editor you're using actual uses the comma as the column delimiter for csv files, since in the above picture is seems as if the comma is seen by the editor as a normal character.

    Also remove any carriage return characters (\n \r) from your string before printing it, this can be done in the following code.

    csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]

    Note: if by any chance this works, there might be a problem with with having empty rows in the csv file beteen each two elements, this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile