Tags: python, json, loops, append, file-watcher

File watcher loop cannot continue where it left off when re-running the code


I have created this file watcher loop. When I run the code, it scans a specific folder for .json files and appends their data to an 'output.xlsx' file. The code then keeps running in a loop, scanning the folder for new files and repeating the process. This works just fine; however, when I stop the code (laptop turns off, or something), new files are still being added to the folder, and when I re-run the code I cannot continue where I left off: I have to delete the output.xlsx file and start over.

Is there a way to save the history of the files already appended when I stop the code, and to continue adding the files that have not yet been appended when I re-run it?

import os
import glob
import json
import pandas as pd
import time
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

def jsonFilesInDirectory(my_dir: str):
    # Get a list of JSON files in the directory
    json_files = glob.glob(os.path.join(my_dir, "*.json"))
    return json_files

def clean_value(value):
    # Clean up numeric values by removing unnecessary characters
    return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))

def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    for file_name in fileDiff:
        file_path = os.path.join(my_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)

                # Extract data from the JSON file
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]

                audits = json_data["audits"]
                fcp_metric = audits["first-contentful-paint"]["id"]
                fcp_value = audits["first-contentful-paint"]["displayValue"]
                fcp_score = audits["first-contentful-paint"]["score"]
                lcp_metric = audits["largest-contentful-paint"]["id"]
                lcp_value = audits["largest-contentful-paint"]["displayValue"]
                lcp_score = audits["largest-contentful-paint"]["score"]
                fmp_metric = audits["first-meaningful-paint"]["id"]
                fmp_value = audits["first-meaningful-paint"]["displayValue"]
                fmp_score = audits["first-meaningful-paint"]["score"]
                si_metric = audits["speed-index"]["id"]
                si_value = audits["speed-index"]["displayValue"]
                si_score = audits["speed-index"]["score"]
                tbt_metric = audits["total-blocking-time"]["id"]
                tbt_value = audits["total-blocking-time"]["displayValue"]
                tbt_score = audits["total-blocking-time"]["score"]
                cls_metric = audits["cumulative-layout-shift"]["id"]
                cls_value = audits["cumulative-layout-shift"]["displayValue"]
                cls_score = audits["cumulative-layout-shift"]["score"]

                categories = json_data["categories"]
                perf_metric = categories["performance"]["id"]
                perf_value = 0
                perf_score = categories["performance"]["score"]

                # Clean up values and format the fetch time
                cleaned_fcp_value = clean_value(fcp_value)
                cleaned_lcp_value = clean_value(lcp_value)
                cleaned_fmp_value = clean_value(fmp_value)
                cleaned_si_value = clean_value(si_value)
                cleaned_tbt_value = clean_value(tbt_value)
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")

                # Create a data dictionary for the DataFrame
                data_dict = {
                    "fetch_time": [cleaned_fetch_time] * 7,
                    "url": [url] * 7,
                    "metric": ["performance","first_contentful_paint", "largest_contentful_paint",
                               "first-meaningful-paint", "speed-index", "total-blocking-time",
                               "cumulative-layout-shift"],
                    "value": [perf_value, cleaned_fcp_value, cleaned_lcp_value,
                              cleaned_fmp_value, cleaned_si_value, cleaned_tbt_value,
                              cls_value],
                    "score": [perf_score, fcp_score, lcp_score, fmp_score, si_score, tbt_score, cls_score]
                }

                df = pd.DataFrame(data_dict)

                # Append the DataFrame to the Excel file
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    for row in dataframe_to_rows(df, index=False, header=False):
                        sheet.append(row)
                else:
                    sheet = workbook.create_sheet(sheet_name)
                    for row in dataframe_to_rows(df, index=False, header=True):
                        sheet.append(row)

                print(f"Data extracted from {file_name} and appended to the Excel file")

            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")

def fileWatcher(my_dir: str, pollTime: int):
    excel_file_path = os.path.join(my_dir, 'output.xlsx')
    existingFiles = []

    # Check if the output file already exists
    if os.path.isfile(excel_file_path):
        try:
            workbook = openpyxl.load_workbook(excel_file_path)
            existingFiles = jsonFilesInDirectory(my_dir)
            # Process the existing JSON files and append data to the Excel file
            doThingsWithNewFiles(existingFiles, my_dir, workbook)
            print("Existing JSON files processed and data appended to the Excel file")
        except openpyxl.utils.exceptions.InvalidFileException:
            workbook = openpyxl.Workbook()
    else:
        workbook = openpyxl.Workbook()

    # Check for new files at startup
    newFileList = jsonFilesInDirectory(my_dir)
    fileDiff = listComparison(existingFiles, newFileList)
    existingFiles = newFileList

    if len(fileDiff) > 0:
        # Process the new files and append data to the Excel file
        doThingsWithNewFiles(fileDiff, my_dir, workbook)

        # Save the Excel file
        workbook.save(excel_file_path)
        print(f"DataFrame exported to {excel_file_path}")

    while True:
        time.sleep(pollTime)

        # Get the updated list of JSON files in the directory
        newFileList = jsonFilesInDirectory(my_dir)

        # Find the difference between the previous and new file lists
        fileDiff = listComparison(existingFiles, newFileList)
        existingFiles = newFileList

        if len(fileDiff) > 0:
            # Process the new files and append data to the Excel file
            doThingsWithNewFiles(fileDiff, my_dir, workbook)

            # Save the Excel file
            workbook.save(excel_file_path)
            print(f"DataFrame exported to {excel_file_path}")


def listComparison(originalList: list, newList: list):
    # Compare two lists and return the differences
    differencesList = [x for x in newList if x not in originalList]
    return differencesList

my_dir = r"Z:"
pollTime = 60

fileWatcher(my_dir, pollTime)

Solution

  • You can create a text file that stores the list of already scanned files.

    I've updated your code to read the text file if it exists, and to write it back after each scan.
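
    The core of that change is small: load the recorded list at startup, and rewrite it after every scan. A minimal sketch of just that piece (the helper names are illustrative; the full updated code below inlines the same logic and uses the same 'scanned_files.txt' name):

    import os

    STATE_FILE = 'scanned_files.txt'

    def load_processed() -> set:
        # File paths recorded by previous runs (empty set on the first run)
        if not os.path.exists(STATE_FILE):
            return set()
        with open(STATE_FILE, 'r') as f:
            return {line.strip() for line in f if line.strip()}

    def save_processed(processed: set):
        # Rewrite the state file after each scan, one path per line
        with open(STATE_FILE, 'w') as f:
            f.write('\n'.join(sorted(processed)))

    The full updated code: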

    from datetime import datetime
    import glob
    import json
    import openpyxl
    from openpyxl.utils.dataframe import dataframe_to_rows
    import os
    import pandas as pd
    import time
    
    
    def jsonFilesInDirectory(my_dir: str):
        # Get a list of JSON files in the directory
        json_files = glob.glob(os.path.join(my_dir, "*.json"))
        
        return json_files
    
    
    def clean_value(value):
        # Clean up numeric values by removing unnecessary characters
        return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))
    
    
    def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
        for file_name in fileDiff:
            file_path = os.path.join(my_dir, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    json_data = json.load(file)
    
                    # Extract data from the JSON file
                    url = json_data["finalUrl"]
                    fetch_time = json_data["fetchTime"]
    
                    audits = json_data["audits"]
                    fcp_metric = audits["first-contentful-paint"]["id"]
                    fcp_value = audits["first-contentful-paint"]["displayValue"]
                    fcp_score = audits["first-contentful-paint"]["score"]
                    lcp_metric = audits["largest-contentful-paint"]["id"]
                    lcp_value = audits["largest-contentful-paint"]["displayValue"]
                    lcp_score = audits["largest-contentful-paint"]["score"]
                    fmp_metric = audits["first-meaningful-paint"]["id"]
                    fmp_value = audits["first-meaningful-paint"]["displayValue"]
                    fmp_score = audits["first-meaningful-paint"]["score"]
                    si_metric = audits["speed-index"]["id"]
                    si_value = audits["speed-index"]["displayValue"]
                    si_score = audits["speed-index"]["score"]
                    tbt_metric = audits["total-blocking-time"]["id"]
                    tbt_value = audits["total-blocking-time"]["displayValue"]
                    tbt_score = audits["total-blocking-time"]["score"]
                    cls_metric = audits["cumulative-layout-shift"]["id"]
                    cls_value = audits["cumulative-layout-shift"]["displayValue"]
                    cls_score = audits["cumulative-layout-shift"]["score"]
    
                    categories = json_data["categories"]
                    perf_metric = categories["performance"]["id"]
                    perf_value = 0
                    perf_score = categories["performance"]["score"]
    
                    # Clean up values and format the fetch time
                    cleaned_fcp_value = clean_value(fcp_value)
                    cleaned_lcp_value = clean_value(lcp_value)
                    cleaned_fmp_value = clean_value(fmp_value)
                    cleaned_si_value = clean_value(si_value)
                    cleaned_tbt_value = clean_value(tbt_value)
                    datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                    cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")
    
                    # Create a data dictionary for the DataFrame
                    data_dict = {
                        "fetch_time": [cleaned_fetch_time] * 7,
                        "url": [url] * 7,
                        "metric": [
                            "performance",
                            "first_contentful_paint",
                            "largest_contentful_paint",
                            "first-meaningful-paint",
                            "speed-index",
                            "total-blocking-time",
                            "cumulative-layout-shift"
                        ],
                        "value": [
                            perf_value, 
                            cleaned_fcp_value, 
                            cleaned_lcp_value,
                            cleaned_fmp_value,
                            cleaned_si_value,
                            cleaned_tbt_value,
                            cls_value
                        ],
                        "score": [
                            perf_score, 
                            fcp_score, 
                            lcp_score, 
                            fmp_score, 
                            si_score, 
                            tbt_score, 
                            cls_score]
                    }
    
                    df = pd.DataFrame(data_dict)
    
                    # Append the DataFrame to the Excel file; write the header
                    # row only when the sheet is first created, otherwise every
                    # file would append a duplicate header row
                    sheet_name = "Sheet1"
                    if sheet_name in workbook.sheetnames:
                        sheet = workbook[sheet_name]
                        write_header = False

                    else:
                        sheet = workbook.create_sheet(sheet_name)
                        write_header = True

                    for row in dataframe_to_rows(df, index=False, header=write_header):
                        sheet.append(row)
    
                    print(f"Data extracted from {file_name} and appended to the Excel file")
    
                except KeyError as e:
                    print(f"KeyError occurred while processing file '{file_name}': {e}")
                
                except json.JSONDecodeError as e:
                    print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
                
                except Exception as e:
                    print(f"An error occurred while processing file '{file_name}': {e}")
    
    
    def fileWatcher(my_dir: str, pollTime: int):
        excel_file_path = os.path.join(my_dir, 'output.xlsx')
        
        existingFiles = []
        if os.path.exists(os.path.join(os.getcwd(), 'scanned_files.txt')):
            # Read in 'r' mode: with 'a+' the file pointer starts at the end,
            # so f.read() would always return an empty string
            with open('scanned_files.txt', 'r') as f:
                existingFiles = [line for line in f.read().split('\n') if line]
    
        # Check if the output file already exists
        if os.path.isfile(excel_file_path):
            try:
                workbook = openpyxl.load_workbook(excel_file_path)
        
            except openpyxl.utils.exceptions.InvalidFileException:
                workbook = openpyxl.Workbook()
    
        else:
            workbook = openpyxl.Workbook()
            
            
        # If the workbook is fresh (no data sheet yet), rebuild it from the
        # files already recorded in scanned_files.txt, then save it
        if "Sheet1" not in workbook.sheetnames:
            doThingsWithNewFiles(existingFiles, my_dir, workbook)
            workbook.save(excel_file_path)
            print("Existing JSON files processed and data appended to the Excel file")
        
        # Check for new files at startup
        while True:
            time.sleep(pollTime)
    
            # Get the updated list of JSON files in the directory
            newFileList = jsonFilesInDirectory(my_dir)
    
            # Find the difference between the previous and new file lists
            fileDiff = listComparison(existingFiles, newFileList)
            existingFiles = newFileList
    
            if len(fileDiff) > 0:
                # Process the new files and append data to the Excel file
                doThingsWithNewFiles(fileDiff, my_dir, workbook)
    
                # Save the Excel file
                workbook.save(excel_file_path)
                print(f"DataFrame exported to {excel_file_path}")
    
            with open('scanned_files.txt', 'w') as f:
                f.write('\n'.join(existingFiles))
    
    
    def listComparison(originalList: list, newList: list):
        # Compare two lists and return the differences
        differencesList = [x for x in newList if x not in originalList]
        
        return differencesList
    
    
    my_dir = r"Z:"
    pollTime = 60
    
    fileWatcher(my_dir, pollTime)
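
    One refinement worth considering: listComparison checks membership with "x not in originalList", which is quadratic for large folders. A drop-in replacement using a set difference (same inputs and outputs, assuming the processing order only needs to be deterministic):

    def listComparison(originalList: list, newList: list):
        # Set difference avoids the O(n*m) membership scans of the list
        # comprehension; sorted() keeps the processing order deterministic
        return sorted(set(newList) - set(originalList))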
    

    I couldn't test the code, so let me know if there's any issue with it.
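
    Since the failure mode here is an abrupt shutdown, it may also be worth writing the state file atomically, so a power loss mid-write can't leave a truncated scanned_files.txt behind. A sketch (save_state is a hypothetical helper, not part of the code above):

    import os
    import tempfile

    def save_state(path: str, lines: list):
        # Write to a temporary file in the same directory, then atomically
        # swap it in; os.replace() is atomic on both POSIX and Windows,
        # so the state file is never left half-written
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
        with os.fdopen(fd, 'w') as f:
            f.write('\n'.join(lines))
        os.replace(tmp_path, path)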