I want to watch two different directories for Excel file modifications (timestamp changes), and after a modification I want to send an HTTP POST request to an API endpoint. I have already written the code below using the Python watchdog and requests libraries, but I am facing two errors with it.
Problem 1: The event (event.event_type == 'modified') is triggered twice for a single file modification, which causes two POST requests to be sent. What is the correct event type for watching file modifications in the watchdog library, so that this condition becomes true only once?
Problem 2: In the 'Watcher' class, in the 'start' function, I am not able to assign Handler() to the event_handler variable. What mistake am I making here?
Please guide me on how to fix this, or suggest a better approach. Thank you in advance.
import time
from time import sleep
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import json
import requests
import pandas as pd
import os
from collections import defaultdict
class Handler(FileSystemEventHandler):
    """Print a CSV file as JSON records when a ``.csv`` file is modified.

    The CSV that gets loaded is always the one given at construction time;
    the event path is only used to filter on the extension and to remember
    the last modification time seen for that path.
    """

    def __init__(self, path):
        """Set up the handler.

        Args:
            path: CSV file handed to ``pd.read_csv`` when a change is detected.
        """
        super().__init__()
        self.path = path
        # Last modification time seen per event path; unseen paths read as 0.
        self.files = defaultdict(lambda: 0)

    def on_modified(self, event):
        """React to a modification event, ignoring repeats for the same save.

        Args:
            event: The file system event; ``is_directory``, ``event_type``
                and ``src_path`` are read from it.

        Returns:
            Always ``None``.
        """
        if event.is_directory:
            return None
        if event.event_type != 'modified' or not event.src_path.endswith('.csv'):
            return None
        # api_url = 'http://10.166.72.3:8080/promo/summary?userId=abc'
        try:
            # Use st_mtime rather than st_ctime: on Windows st_ctime is the
            # creation time, which does not move on later saves, so the
            # comparison below could only ever succeed once per file.
            modified_at = os.stat(event.src_path).st_mtime
        except OSError:
            # The file disappeared between the event and the stat call.
            return None
        if modified_at - self.files[event.src_path] > 1:
            # Wait only when we are actually going to read, so that ignored
            # duplicate events do not block the watcher for 5 seconds each.
            sleep(5)
            df = pd.read_csv(self.path)
            df1 = df.to_json(orient='records')
            # str.replace returns a new string, so the result must be kept.
            # Only the "\/" escape that the JSON writer emits for "/" is
            # undone; stripping every backslash would also destroy escapes
            # such as \" and leave invalid JSON.
            df1 = df1.replace("\\/", "/")
            print(df1)
            #r = requests.post(url=api_url, json=df1)
            #print(r.status_code, r.reason, r.text)
        # Remember the newest timestamp on every qualifying event so that a
        # burst of events belonging to one save collapses into one reaction.
        self.files[event.src_path] = modified_at
        return None
class Watcher:
    """Bind one ``Observer`` to a directory and the handler that serves it."""

    def __init__(self, directory, handler, recursive=True):
        """Create the watcher without starting it.

        Args:
            directory: Directory to observe.
            handler: Event handler instance that receives the events.
            recursive: Whether sub-directories are observed as well.
                Defaults to ``True``, which was previously hard-coded.
        """
        self.directory = directory
        self.handler = handler
        self.recursive = recursive
        self.observer = Observer()

    def start(self):
        """Register the handler for the directory and start the observer."""
        self.observer.schedule(
            self.handler, self.directory, recursive=self.recursive
        )
        self.observer.start()

    def stop(self):
        """Ask the observer to stop."""
        self.observer.stop()

    def join(self):
        """Block until the observer has finished."""
        self.observer.join()
if __name__ == '__main__':
    # One handler per CSV file, one watcher per output directory.
    handler1 = Handler('C:\\Users\\BPM_admin\\Desktop\\OCR_RPA\\FirstOCR\\Diageo\\Output\\InvoiceMasterData.csv')
    handler2 = Handler('C:\\Users\\BPM_admin\\Desktop\\OCR_RPA\\SecondOCR\\Diageo\\Output\\AgreementMasterData.csv')
    w1 = Watcher("C:\\Users\\BPM_admin\\Desktop\\OCR_RPA\\FirstOCR\\Diageo\\Output", handler1)
    w2 = Watcher("C:\\Users\\BPM_admin\\Desktop\\OCR_RPA\\SecondOCR\\Diageo\\Output", handler2)
    w1.start()
    w2.start()
    try:
        # Keep the main thread alive while the watchers do their work.
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        # Catch only Ctrl-C: a bare ``except`` would also swallow SystemExit
        # and hide genuine programming errors.
        print("Error")
    finally:
        # Shut both watchers down however the loop ended, then wait for them.
        w1.stop()
        w2.stop()
        w1.join()
        w2.join()
Problem 1: Event "modified" fired twice
This issue appears because saving a file can involve several operations: the data is changed first, and then the metadata (last-modified time, ...) is updated, and each of these can raise its own event. It can be hard to handle, depending on what you need and on how frequently the file changes if there are many users.
First, you should limit the files being watched by testing the extension, to ignore temporary files and all other formats. Then I suggest saving the file's last modification time so that you can compare it between two events, and firing the API call only if the gap is longer than X seconds.
import os
from collections import defaultdict
class Handler(FileSystemEventHandler):
    """Report modified Excel workbooks, once per burst of events."""

    def __init__(self):
        """Start with an empty table of last-seen modification times."""
        super().__init__()
        # this dict will store the filenames and the time
        self.files = defaultdict(lambda: 0)

    def on_any_event(self, event):
        """Trigger the API call for a modified ``.xls``/``.xlsx`` file.

        Args:
            event: The file system event; ``is_directory``, ``event_type``
                and ``src_path`` are read from it.

        Returns:
            Always ``None``.
        """
        if event.is_directory:
            return None
        if event.event_type != 'modified':
            return None
        if not event.src_path.endswith(('.xls', '.xlsx')):
            return None
        if os.path.basename(event.src_path).startswith('~$'):
            # Excel keeps a "~$<name>.xlsx" owner file next to an open
            # workbook; it passes the extension test but is not real data.
            return None
        try:
            # here we get the last change timestamp
            stats = os.stat(event.src_path).st_mtime
        except OSError:
            # The file was removed or renamed before it could be inspected.
            return None
        # delay of 1 sec minimum between 2 changes
        # self.files[event.src_path] is set to 0 thanks to the defaultdict
        if stats - self.files[event.src_path] > 1:
            print('api call with ', event.src_path)
            # do what you need
        # then update the time for this file (on every qualifying event, so
        # that repeated events for one save are measured against each other)
        self.files[event.src_path] = stats
        return None
Problem 2: Pass handler parameter
You get an error when you instantiate Handler() because you declared a path parameter in the constructor:
class Handler(FileSystemEventHandler):
    """Event handler that must be constructed with a path."""

    def __init__(self, path):
        """Run the base-class setup, then keep ``path`` on the instance."""
        super().__init__()
        self.path = path
It seems you don't use it inside Handler, so maybe you could remove this parameter? Otherwise, just pass the path you want to handle, like this:
def start(self):
    """Build a handler for ``self.directory``, register it and start observing."""
    # or Handler() if you remove path
    handler = Handler(self.directory)
    observer = self.observer
    observer.schedule(handler, self.directory, recursive=True)
    observer.start()