# coding=utf-8
# Libreria RegEx de Python.
import re
# Libreria para rutas.
import os
import csv
# function betwwen: return the value between two words a and b
def between(value, a, b):
pos_a = value.find(a) # Find and validate before-part.
if pos_a == -1: return "" # Find and validate after part.
pos_b = value.rfind(b)
if pos_b == -1: return "" # Return middle part.
adjusted_pos_a = pos_a + len(a)
if adjusted_pos_a >= pos_b: return ""
return value[adjusted_pos_a:pos_b]
# function scan folder DiarioOficial
def scan_folder():
# directory 'path'
path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
# contador de ficheros del path
count = 0
# creation csv as csvFile
with open('All_Companies1.csv', 'a') as csvFile:
# iterate all paths in the folder DiarioOficial without name
for (path, dirnames, file_names) in os.walk(path):
# iterate over all the files in the path (+ file_name)
for file_name in file_names:
# Add extension that is required
if file_name.endswith(".txt"):
# summatory count files in DiarioOficial folder
count = count + 1
# concatenation path + file name
file_path=os.path.join(path, file_name)
#print(file_path)
# open and read the file path
mensaje = open(file_path).read()
# Replace a newline for a space
mensaje = mensaje.replace("\n","")
# Company Name
keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
# re.escape to solve the problem with metacharacters in keyword_obj
keywords_cap = map(re.escape, keywords_cap)
# sorting the items by lengh in descending order
keywords_cap.sort(key=len, reverse=True)
obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
if obj:
# To obtain the first match obj.search(mensaje).group(1)
company_name = obj.search(mensaje)
else:
company_name = "None"
# CVE Number of the file
regex = r"\s*CVE\s+([^|]*)"
matches = re.search(regex, mensaje)
if matches:
company_cve = matches.group(1).strip()
else:
company_cve = "None"
# Section of diariooficial.interior.gob.cl
company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
if company_sect:
company_sect = company_sect
else:
company_sect = "None"
# Name of the person that constitutes the company
company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
if company_ceo:
company_ceo = company_ceo.group(1)
else:
company_ceo = "None"
# File Number from Section
num_reg = r'\sNúm.\s+([^|]*)'
match_num = re.search(num_reg, mensaje)
if match_num:
company_numsect = match_num.group(1)
else:
company_numsect = "None"
# Social Capital ($)
cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
caps = re.search(cap, mensaje)
if caps:
company_capital = caps.group()
else:
company_capital = 'None'
csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
headers = ['COMPANY NAME', 'CVE', 'SECTION','CEO NAME','NUMBER SECTOR','COMPANY CAPITAL']
writer = csv.writer(csvFile, delimiter=',') # create a csv delimited by comma
writer.writerow(headers) # print the header row
writer.writerow(csvData) # print the Data in csv
# Number of txt files
print (count)
scan_folder()
I have this script that create a csv with the data extracted from a text in specific path. In spite of the errors that can be on RegEx, mainly it extracts parts of text that it keeps them in variables and the printa in a csv. Each company must have a single line in this csv. In this way, when the csv is opened, the number of companies and all the information can be visualized by variables.
My problem is that when I see the CSV called, in this case, All_companies1, the data is not put in the same row, they jump.
Also, the titles are repeated, and I do not want them to repeat themselves
First try changing the mode for the csvFile
from a
(append) to w
(write), also check if the editor you're using actual uses the comma as the column delimiter for csv
files, since in the above picture is seems as if the comma is seen by the editor as a normal character.
Also remove any carriage return characters (\n \r
) from your string before printing it, this can be done in the following code.
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note:
if by any chance this works, there might be a problem with with having empty rows in the csv
file beteen each two elements, this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile
to with open('All_Companies1.csv', 'a', newline='') as csvFile