Search code examples
python python-datetime

Datetime object compare


I am trying to find recent news articles that were published within the last 2 hours.

Approach

My goal is to take a datetime object of today and compare it to a date of an article which I scraped from the web.

First I compare the datetime by date and then by hour.

Issue

However it seems that even when given a correct date it says it isn't in the correct range.

False fail:

Code

from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import timedelta
from datetime import datetime


def newz(stock_1):
    """Print the latest finviz headline for a ticker and flag it when it looks unsuitable.

    Fetches the finviz quote page for ``stock_1``, prints the first row of the
    element with id ``news-table`` (headline text and timestamp text), scans
    the headline for upper-case alert keywords and finally compares the row's
    timestamp with the current time via ``date_comp``.

    Args:
        stock_1: Ticker symbol. It is concatenated onto the finviz quote URL,
            so it has to be a string.

    Returns:
        True when the headline contains "INVESTIGATION" (or "SHAREHOLDER"
        together with "ALERT"), when an AttributeError is raised while reading
        the news table, or when ``date_comp`` returns True. In every other
        case the function falls off the end and returns None.
    """
    list_now=stock_1
    #list_now=chr(list_now)
    new_list=list_now  # NOTE(review): never read again in this function
    list_now=[list_now]
    print("Stock:{}".format(list_now))
    n = 1 #the # of article headlines displayed per ticker
    tickers= list_now

    # NOTE(review): new_words is built here but not used anywhere in this function.
    new_words = {
            'Insider Sells':-3.4,
            'common':2.0,
            'up':3.4,
            'bankruptcy':-3.4,
            'underperforms':-3.4,
            'overperforms':3.4,
            'outperforms':3.4,
            'overbought':-3.4,
            'oversold':3.4,
            'down':2.0,

            }   

    finviz_url = 'https://finviz.com/quote.ashx?t='
    news_tables = {}

    # Download each ticker's quote page and remember whatever find() returned
    # for the element with id 'news-table'.
    for ticker in tickers:
        url = finviz_url + ticker
        req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) 
        resp = urlopen(req)    
        html = BeautifulSoup(resp, features="lxml")
        #print(html)
        news_table = html.find(id='news-table')
        news_tables[ticker] = news_table

    # Any AttributeError raised in this section is reported as a failure
    # (return True) -- presumably a missing table or a row without <a>/<td>.
    try:
        for ticker in tickers:
            df = news_tables[ticker]
            df_tr = df.findAll('tr')
    
            print ('\n')
            print ('Recent News Headlines for {}: '.format(ticker))
        
            for i, table_row in enumerate(df_tr):
                a_text = table_row.a.text
                td_text = table_row.td.text
                td_text = td_text.strip()
                print("{0}  {1}".format(a_text,td_text))
                td_text=str(td_text)
                a_text=str(a_text)
                # looks specifically for investigation or shareholder alert or investigation alert
                # str.find returns -1 when the keyword is absent, so ">= 0" below means "present".
                # The search is case-sensitive: only upper-case keywords match.
                result=a_text.find("INVESTIGATION")
                result=int(result)

                result_1=a_text.find("SHAREHOLDER")
                result_1=int(result_1)

                result_2=a_text.find("ALERT")
                result_2=int(result_2)

                result_3=a_text.find("INVESTOR")
                result_3=int(result_3)

                result_4=a_text.find("NOTICE")
                result_4=int(result_4)



                # Only the first two branches return; the following ones just print.
                if (result>=0 or result_1>=0) and result_2>=0:
                    print("Fails: Under Investigation")
                    return True
                elif result>=0 :
                    print("Fails: Under Investigation")
                    return True
                elif result_3>=0 and result_2>=0 :
                    print("Fails: Under Investigation")

                elif result_1>=0 and result_4>=0 :
                    print("Fails: Under Investigation")

                # NOTE(review): unreachable -- SHAREHOLDER together with ALERT is
                # already caught by the first branch.
                elif result_1>=0 and result_2>=0 :
                    print("Fails: Under Investigation")

                elif result==-1 and result_1==-1 and result_2==-1:
                    pass                    
                       

                # Stop after n rows; n is 1, so only the first row is inspected.
                if i == n-1:
                    break
    except AttributeError:
        return True
            
    
    # td_text still holds the timestamp text of the last row processed above.
    # assumes text shaped like 'Jan-24-22 05:48PM' -- TODO confirm against the live page
    td_text=td_text.split("-",2)
    #print("A",td_text)
    month=td_text[0]
    day=td_text[1]
    year=td_text[2]  # the third piece still carries the time, e.g. '22 05:48PM'

  
   

    months={'Jan': 1, 'Feb':2, 'Mar': 3, 'Apr':4,'May':5, 'Jun': 6, 'Jul':7,'Aug':8,'Sep':9,'Oct':10, 'Nov':11,'Dec':12}
    # dict.pop is used as a lookup here; an unknown abbreviation raises KeyError.
    month=months.pop(month)
    #print("MOnth",month)
    #print("Year",year)
    #print("Day",day)
    # Characters 3-4 of the remainder: the two hour digits under the assumed format.
    hour=year[3:5]
    #print("Hour data",hour)
    # NOTE(review): the slice start lies after its stop, so this is always '';
    # the AM/PM marker is never captured and meridian is not used below.
    meridian=year[-1:-2]
    #print("Meridian",meridian)
    year=year[0:2]
    year=str(year)
    day=str(day)
    month=str(month)
    year="20"+year


    
    # date we are stripping from web
    t=year+'-'+month+'-'+day+'-'+hour
    #print(t)
    # NOTE(review): parsing only '%H' produces a datetime on the default date
    # 1900-01-01; date_comp later compares it with diff_hr_minus/diff_hr_plus,
    # which are datetimes around today.
    t_hour = datetime.strptime(hour, '%H')
    # NOTE(review): the hour digits are read as a 24-hour value, so a PM time
    # is parsed as the corresponding morning hour.
    t = datetime.strptime(t, '%Y-%m-%d-%H')
    #print(t)
   
    
    
        
    
    # todays date in datetime object
    # The string round-trip drops minutes, seconds and microseconds.
    today =datetime.today().strftime('%Y-%m-%d-%H')
    today = datetime.strptime(today, '%Y-%m-%d-%H')

    
    hr_margin=timedelta(hours= 2)

    margin = timedelta(days = 1)
        

    

    #print( "Earnings date:{} ".format(t))
    #print("Today:",today)

    # Day window: today -/+ one day.
    diff_minus=today - margin
    #print(diff_minus)
    diff_plus=today + margin

    # Hour window: today -/+ two hours (both are full datetimes).
    diff_hr_plus= today +hr_margin
    diff_hr_minus= today-hr_margin
    #print(diff_plus)

    #t_hour=t_hour[10::]

    # date_comp returning True is passed on; any other outcome falls through
    # and the function returns None.
    if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
        return True



# Checks a timestamp t against a day window and t_hour against an hour window,
# printing the bounds and the verdict along the way.
# Returns False when both checks pass and True when either check fails.
# NOTE(review): the statements below are not indented under the def, so this
# block does not parse as written.
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):      

# NOTE(review): both bounds are diff_plus, so this only holds when t equals
# diff_plus exactly; diff_minus is printed below but never used as a bound.
if diff_plus<= t <= diff_plus:
   print("Day is good")
   print("Max allowed date {}".format(diff_plus))
   print("Min allowed date {}".format(diff_minus))
   print('Stripped Datetime {}'.format(t))
   
   #print("Measured time hr",t_hour)
   # Inclusive hour-window check on t_hour.
   if  diff_hr_minus<=t_hour <=diff_hr_plus:
       print("Hour is good")
       print("Max allowed Hr {}".format(diff_hr_plus))
       print("Min allowed Hr {}".format(diff_hr_minus))
       print('Stripped Datetime {}'.format(t))
       
       print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
       print("\n")
       # False is the "up to date" result.
       return False
   else:
       print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
       print("Max allowed Hr {}".format(diff_hr_plus))
       print("Min allowed Hr {}".format(diff_hr_minus))
       print("\n")
       return True

else:
    print("News is NOT up to date by Day!! {} ".format(t))
    print("Max allowed date {}".format(diff_plus))
    print("Min allowed date {}".format(diff_minus))
    print("\n")
    return True

Solution

  • See how I reduced your given code to a minimal reproducible example with a few steps:

    1. remove the web-scraping (as not essential for the issue)
    2. remove all the comments that do not explain
    3. remove empty lines that do not help to structure
    4. (optionally) add a test (e.g. a function-call) that shows the issue

    Fixed issue

    Since you reported an issue with the datetime comparison, I looked there and found one inconsistency:

    In day comparison you have:

    if diff_plus<= t <= diff_plus:
    

    In hour comparison you have:

       if  diff_hr_minus<=t_hour <=diff_hr_plus:
    

    Things to fix:

    • adjust the lower boundary for the day comparison to diff_minus
    • can simplify date-parsing using strptime with an appropriate format string (see the example below)
    • just compare the hour as 24-hour-range integer, using t.hour

    Minimal Reproducible Example

    from datetime import timedelta
    from datetime import datetime
    
    def compared_date_from_td(td_text):
        """Parse a scraped timestamp such as 'Jan-24-22 05:48PM' and check it with date_comp.

        The parsed timestamp and the current time are printed, then the
        timestamp is compared against a window of one day either side of now
        and its hour against the current hour plus/minus 2.

        Returns:
            True when date_comp reports the timestamp as not up to date;
            otherwise the function falls through and returns None.
        """
        # NOTE(review): '%H' reads the hour as a 24-hour value, so the trailing
        # '%p' does not shift it -- '05:48PM' is parsed as 05:48. '%I' would
        # honour the AM/PM marker.
        t = datetime.strptime(td_text.strip(), '%b-%d-%y %H:%M%p')
        print("Earnings date: {} ".format(t))

        today = datetime.today()
        print("Today: {}".format(today))

        one_day = timedelta(days=1)
        current_hour = today.hour

        # The hour bounds are plain integers and are not wrapped at midnight,
        # so they can fall outside 0..23.
        not_up_to_date = date_comp(
            t,
            today + one_day,   # diff_plus
            today - one_day,   # diff_minus
            current_hour + 2,  # diff_hr_plus
            current_hour - 2,  # diff_hr_minus
            t.hour,            # t_hour
            today,
        )
        if not_up_to_date:
            return True
    
    
    def date_comp(t, diff_plus, diff_minus, diff_hr_plus, diff_hr_minus, t_hour, today):
        """Report whether a parsed timestamp is stale, printing each check.

        The day window is tested first (diff_minus <= t <= diff_plus), then
        the hour window (diff_hr_minus <= t_hour <= diff_hr_plus). Both
        windows are inclusive. ``today`` is only used in the printed messages.

        Returns:
            False when both checks pass, True when either one fails.
        """
        # Guard clause: outside the day window the hour is not looked at.
        if not (diff_minus <= t <= diff_plus):
            print("News is NOT up to date by Day!! {} ".format(t))
            print("Max allowed date {}".format(diff_plus))
            print("Min allowed date {}".format(diff_minus))
            print("\n")
            return True

        print("Day is good")
        print("Max allowed date {}".format(diff_plus))
        print("Min allowed date {}".format(diff_minus))
        print('Stripped Datetime {}'.format(t))

        hour_in_window = diff_hr_minus <= t_hour <= diff_hr_plus
        if not hour_in_window:
            print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today, t))
            print("Max allowed Hr {}".format(diff_hr_plus))
            print("Min allowed Hr {}".format(diff_hr_minus))
            print("\n")
            return True

        print("Hour is good")
        print("Max allowed Hr {}".format(diff_hr_plus))
        print("Min allowed Hr {}".format(diff_hr_minus))
        print('Stripped Datetime {}'.format(t))

        print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today, t))
        print("\n")
        return False
    
    
    # Demo call: runs the check on a fixed sample timestamp and prints the verdict.
    # compared_date_from_td returns True for "not up to date" and otherwise falls
    # through to None, so None is printed for a timestamp that passes both checks.
    date_text = 'Jan-24-22 05:48PM'
    not_uptodate = compared_date_from_td(date_text)
    print("date: {}, compared as not_uptodate => {}".format(date_text, not_uptodate))
    

    This outputs:

    Earnings date: 2022-01-24 05:48:00 
    Today: 2022-01-25 00:54:16.122160
    Day is good
    Max allowed date 2022-01-26 00:54:16.122160
    Min allowed date 2022-01-24 00:54:16.122160
    Stripped Datetime 2022-01-24 05:48:00
    News is NOT up to date by Hour!! Time :2022-01-25 00:54:16.122160 Story Hit: 2022-01-24 05:48:00
    Max allowed Hr 2
    Min allowed Hr -2
    
    
    date: Jan-24-22 05:48PM, compared as not_uptodate => True
    

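    Note: Some of the output needs adjusting to make sense, for example Max allowed Hr 2 and Min allowed Hr -2: the hour bounds are plain integers that do not wrap around midnight. The parsed time is also shown as 05:48:00 although the input says 05:48PM, because %H reads the hour as a 24-hour value and %p is then ignored. Also, the comparison returns True when the value is outside the margins, i.e. True means "NOT up to date" (as in the example above, where the hour differs by more than 2 hours).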

    Proper solution

    Did I get your goal right? Test whether a datetime is within the last 2 hours.

    (1) Given a text, which was scraped from web, you would determine the format and parse it to datetime.

    (2) Given datetime, you would calculate the timedelta to now. Then test if this is less than 2 hours:

    from datetime import datetime, timedelta
    
    scrapedText = 'Jan-24-22 05:48PM'
    # (1) parse datetime. '%I' reads the hour on the 12-hour clock so that '%p'
    # (AM/PM) is applied; with '%H' the PM marker is ignored and 05:48PM would
    # be parsed as 05:48.
    newsTime = datetime.strptime(scrapedText, '%b-%d-%y %I:%M%p')

    # (2) within last 2 hours. The lower bound keeps a timestamp that lies in
    # the future (negative age) from being counted as recent.
    newsAge = datetime.now() - newsTime
    isRecent = timedelta(0) <= newsAge < timedelta(hours=2)
    if isRecent:
        print("news is less than 2 hours old")
    

    See also: How to find information from the last 24 hours