Search code examples
python pandas data-cleaning google-cloud-storage

Getting the filename with on_bad_line


I am trying to get the name of the file in which each bad line occurs. I have a function that collects all the bad lines and writes them to a .txt file, but when I pass in a parameter for the filename, it just prints all the filenames.

This is the bad line function:

    def badlines_collect(self, bad_line: list[str]) -> None:
        """Record one malformed CSV row and rewrite today's bad-line log.

        Written to be passed as ``on_bad_lines`` to ``pandas.read_csv``; the
        implicit ``None`` return makes pandas skip the offending row.

        The row is appended to ``badline_lst`` and the complete list is then
        written to ``bad_line1_<YYYYMMDD>.txt``, replacing whatever the file
        held before. The accumulated list is also printed.

        NOTE(review): ``badline_lst`` and ``currentfile`` are not defined in
        this method -- presumably module-level names; confirm they exist and
        that ``currentfile`` is really updated by the caller.

        :param bad_line: the fields of the row that could not be parsed.
        """
        badline_lst.append(bad_line)
        today = date.today()
        stamp = datetime.datetime.now().strftime("%Y%m%d")
        # 'w' truncates the file, so every call rewrites all rows collected so
        # far; the ``with`` block closes the file, no explicit close() needed.
        with open("bad_line1_{}.txt".format(stamp), 'w') as fp:
            # Every row gets the same prefix, built from the current value of
            # ``currentfile`` -- not the file each row originally came from.
            prefix = "Today's date: " + str(today) + currentfile
            fp.writelines(prefix + ": {}\n".format(row) for row in badline_lst)
        print(badline_lst)

This is the function where I am calling it and passing in a parameter to get the filename:

    def getCSV(self, cur_publisher):
        """Download every CSV blob under a prefix and combine their rows.

        Each blob listed under ``cur_publisher`` whose name contains ``.csv``
        is downloaded and parsed with pandas; rows pandas cannot parse are
        handed to ``self.badlines_collect``. Non-empty frames get a leading
        ``filename`` column and are appended to one combined frame, which is
        printed after each file.

        :param cur_publisher: prefix passed to ``self.bucket.list_blobs``.
        :return: ``self.stack``. NOTE(review): the combined frame built here
            is neither stored nor returned -- confirm this is intended.
        """
        print(bucket_name + '/' + cur_publisher)
        dfm = pd.DataFrame()
        blobs = list(self.bucket.list_blobs(prefix=cur_publisher))
        print(blobs)
        for file_name in blobs:
            if '.csv' not in str(file_name.name):
                continue
            print("Crawling on File {} ......\n".format(file_name.name))
            # This binds a local variable only (there is no ``global``
            # statement), so other methods cannot see this value.
            currentfile = file_name.name
            print(currentfile)
            blop = self.bucket.blob(blob_name="{}".format(file_name.name))
            data = blop.download_as_string()
            df = pd.read_csv(io.BytesIO(data), encoding='utf-8', sep=",", engine='python',
                             on_bad_lines=self.badlines_collect)
            if df.count().sum() <= 0:
                # No non-null cell anywhere in the parsed frame.
                print("{} is empty \n".format(file_name.name))
                continue
            df.insert(0, "filename", file_name.name)
            # Append each file's rows exactly once; concatenating the same
            # frame twice would duplicate every row in the combined result.
            dfm = pd.concat([dfm, df], ignore_index=True)
            dfm = dfm.rename_axis(index='', columns="index")
            print(dfm)
        return self.stack

The result I get is that all the filenames in the GCS bucket are printed into bad_line1.txt, instead of the bad-line errors.


Solution

  • def badlines_collect(self, bad_line: list[str], filename: str) -> None:
        badline_lst.append(bad_line)
        today = date.today()
        todaytime = datetime.datetime.now().strftime("%Y%m%d")
        with open("C:\\badline_log_{}.txt".format(todaytime),
                  'w') as fp:
            for line in badline_lst:
                fp.write(filename +" " + "Today's date: " + str(today) + ": {}\n".format(line))
        fp.close()
    
    df = pd.read_csv(
        io.BytesIO(data),
        encoding='utf-8',
        sep=",",
        engine='python',
        # pandas passes only the bad row to the handler; the lambda adds
        # file_name.name as the second argument.
        on_bad_lines=lambda bad_row: self.badlines_collect(bad_row, file_name.name),
    )
    

    I was able to get the filename by doing it this way.