I am messing around with Python and am trying to make a simple data-cleaning program. I'm trying to pass the `title` values from the `read_excel` function to the `output` function, but it keeps saying `name 'title' is not defined`. Here is my code:
import os
import pandas as pd
import math
class Item():
    """One data row made of a name, a cost, a gender and a prime flag.

    Instances compare and hash on all four fields, so equal rows collapse
    into one when items are collected into a set.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    __name = ""
    __cost = 0
    __gender = ""
    __prime = ""

    def __init__(self, name, cost, gender, prime):
        """Store the four fields exactly as given (no validation)."""
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def has_all_properties(self):
        """Return True when no field is missing.

        A cost of None or NaN counts as missing; a cost of 0 does not.
        """
        if not self.__name:
            return False
        cost = self.__cost
        # None has to be tested first: math.isnan(None) raises TypeError.
        if cost is None or math.isnan(cost):
            return False
        return bool(self.__gender and self.__prime)

    def clean(self, wanted_cost, wanted_gender, wanted_prime):
        """Return True when the item matches the wanted gender and prime flag
        and costs no more than wanted_cost."""
        return bool(
            self.__name
            and self.__gender == wanted_gender
            and self.__cost <= wanted_cost
            and self.__prime == wanted_prime
        )

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, Item):
            return NotImplemented
        return (
            self.__name == other.__name
            and self.__cost == other.__cost
            and self.__gender == other.__gender
            and self.__prime == other.__prime
        )

    def __hash__(self):
        return hash((self.__name, self.__cost, self.__gender, self.__prime))

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def tuple(self):
        """Return the fields as a (name, cost, gender, prime) tuple."""
        return self.__name, self.__cost, self.__gender, self.__prime
def read_excel(filetype):
    """Load every workbook in the current directory whose name ends with filetype.

    The workbooks are stacked into one frame, null cells are swapped for None
    through DataFrame.where, and only the name, cost, used_by and prime
    columns are kept. The column names and the rows are printed.

    Args:
        filetype: Filename suffix to match, e.g. '.XLSX'. The comparison is
            case-sensitive.

    Returns:
        The rows as a list of [name, cost, used_by, prime] lists, or an empty
        list when no file in the directory matches.
    """
    columns = ['name', 'cost', 'used_by', 'prime']
    cwd = os.path.abspath('')
    frames = [pd.read_excel(file) for file in os.listdir(cwd) if file.endswith(filetype)]
    if frames:
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        df = pd.concat(frames, ignore_index=True)
    else:
        # Nothing matched: use an empty frame that still carries the expected
        # columns so the column selection below cannot raise KeyError.
        df = pd.DataFrame(columns=columns)
    df = df.where(df.notnull(), None)
    df = df[columns]
    title = list(df.columns.values)
    print(title)
    array = df.values.tolist()
    print(array)
    # NOTE: `title` is local to this function; only the rows are returned.
    return array
def process(array, wanted_cost=20, wanted_gender="male", wanted_prime="yes"):
    """Deduplicate the rows, drop incomplete ones, and keep the wanted items.

    Args:
        array: Iterable of rows; each row is unpacked into Item(*row).
        wanted_cost: Highest cost an item may have to be kept.
        wanted_gender: Gender value an item must have to be kept.
        wanted_prime: Prime value an item must have to be kept.

    Returns:
        A list of the Item objects that passed both filters. The order is
        unspecified because the items go through sets.
    """
    # Building a set removes duplicate rows (Item hashes on all its fields).
    mylist = {Item(*k) for k in array}
    print(mylist)
    filtered = {obj for obj in mylist if obj.has_all_properties()}
    clean = {obj for obj in filtered if obj.clean(wanted_cost, wanted_gender, wanted_prime)}
    result = list(clean)
    print(result)
    # Hand the result back; without this the caller receives None.
    return result
def output(where, sort_data, title):
    """Write the items in sort_data to an Excel file at where.

    Each item is converted with its tuple() method; title supplies the
    column names. The sheet is written with a header row and no index column.
    """
    rows = []
    for item in sort_data:
        rows.append(item.tuple())
    frame = pd.DataFrame(rows, columns=title)
    frame.to_excel(where, header=True, index=False)
if __name__ == "__main__":
    # Script entry point: call read_excel, process and output in turn.
    inputfile = read_excel('.XLSX')
    processdata = process(inputfile)
    # NOTE: `title` is never assigned at module level -- it is only a local
    # variable inside read_excel -- so evaluating it here raises NameError.
    result = output('clean_data.xlsx', processdata, title)
Can you show me what to do instead? Thank you for the help.
I found one of the easiest-to-understand ways of solving my issue: I broke the `read_excel` function down so that it only returns the DataFrame, and added separate `get_header` and `get_list` functions. Here is my solution:
import os
import pandas as pd
import math
class Item():
    """One data row made of a name, a cost, a gender and a prime flag.

    Instances compare and hash on all four fields, so equal rows collapse
    into one when items are collected into a set.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    __name = ""
    __cost = 0
    __gender = ""
    __prime = ""

    def __init__(self, name, cost, gender, prime):
        """Store the four fields exactly as given (no validation)."""
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def has_all_properties(self):
        """Return True when no field is missing.

        A cost of None or NaN counts as missing; a cost of 0 does not.
        """
        if not self.__name:
            return False
        cost = self.__cost
        # None has to be tested first: math.isnan(None) raises TypeError.
        if cost is None or math.isnan(cost):
            return False
        return bool(self.__gender and self.__prime)

    def clean(self, wanted_cost, wanted_gender, wanted_prime):
        """Return True when the item matches the wanted gender and prime flag
        and costs no more than wanted_cost."""
        return bool(
            self.__name
            and self.__gender == wanted_gender
            and self.__cost <= wanted_cost
            and self.__prime == wanted_prime
        )

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, Item):
            return NotImplemented
        return (
            self.__name == other.__name
            and self.__cost == other.__cost
            and self.__gender == other.__gender
            and self.__prime == other.__prime
        )

    def __hash__(self):
        return hash((self.__name, self.__cost, self.__gender, self.__prime))

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def tuple(self):
        """Return the fields as a (name, cost, gender, prime) tuple."""
        return self.__name, self.__cost, self.__gender, self.__prime
def read_excel(filetype):
    """Load every workbook in the current directory whose name ends with filetype.

    The workbooks are stacked into one frame, null cells are swapped for None
    through DataFrame.where, and only the name, cost, used_by and prime
    columns are kept.

    Args:
        filetype: Filename suffix to match, e.g. '.XLSX'. The comparison is
            case-sensitive.

    Returns:
        A DataFrame with the columns name, cost, used_by and prime; it has no
        rows when no file in the directory matches.
    """
    columns = ['name', 'cost', 'used_by', 'prime']
    cwd = os.path.abspath('')
    frames = [pd.read_excel(file) for file in os.listdir(cwd) if file.endswith(filetype)]
    if frames:
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        df = pd.concat(frames, ignore_index=True)
    else:
        # Nothing matched: use an empty frame that still carries the expected
        # columns so the column selection below cannot raise KeyError.
        df = pd.DataFrame(columns=columns)
    df = df.where(df.notnull(), None)
    df = df[columns]
    return df
def get_list(dataframe):
    """Return the frame's rows as a list of lists, printing them on the way."""
    rows = dataframe.values.tolist()
    print(rows)
    return rows
def get_header(dataframe):
    """Return the frame's column labels as a list, printing them on the way."""
    names = [*dataframe.columns.values]
    print(names)
    return names
def process(array, wanted_cost=20, wanted_gender="male", wanted_prime="yes"):
    """Deduplicate the rows, drop incomplete ones, and keep the wanted items.

    Args:
        array: Iterable of rows; each row is unpacked into Item(*row).
        wanted_cost: Highest cost an item may have to be kept.
        wanted_gender: Gender value an item must have to be kept.
        wanted_prime: Prime value an item must have to be kept.

    Returns:
        A list of (name, cost, gender, prime) tuples for the items that passed
        both filters. The order is unspecified because the items go through
        sets.
    """
    # Building a set removes duplicate rows (Item hashes on all its fields).
    mylist = {Item(*k) for k in array}
    print(mylist)
    filtered = {obj for obj in mylist if obj.has_all_properties()}
    clean = {obj for obj in filtered if obj.clean(wanted_cost, wanted_gender, wanted_prime)}
    result = list(clean)
    print(result)
    # Convert back to plain tuples so the caller can build a DataFrame directly.
    t_list = [obj.tuple() for obj in result]
    return t_list
def output(where, sort_data, title):
    """Write sort_data to an Excel file at where, using title as column names.

    The sheet is written with a header row and no index column.
    """
    frame = pd.DataFrame(data=sort_data, columns=title)
    frame.to_excel(where, header=True, index=False)
def main():
    """Read the workbooks, filter the rows, and write them to clean_data.xlsx."""
    inputfile = read_excel('.XLSX')
    array = get_list(inputfile)
    header = get_header(inputfile)
    processdata = process(array)
    # output() writes the file and returns nothing, so there is no result to keep.
    output('clean_data.xlsx', processdata, header)


if __name__ == "__main__":
    main()