I am trying to find recent news that were published within the last 2 hours.
My goal is to take a datetime
object of today and compare it to a date of an article which I scraped from the web.
First I compare the datetime by date and then by hour.
However it seems that even when given a correct date it says it isn't in the correct range.
False fail:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import timedelta
from datetime import datetime
def newz(stock_1):
list_now=stock_1
#list_now=chr(list_now)
new_list=list_now
list_now=[list_now]
print("Stock:{}".format(list_now))
n = 1 #the # of article headlines displayed per ticker
tickers= list_now
new_words = {
'Insider Sells':-3.4,
'common':2.0,
'up':3.4,
'bankruptcy':-3.4,
'underperforms':-3.4,
'overperforms':3.4,
'outperforms':3.4,
'overbought':-3.4,
'oversold':3.4,
'down':2.0,
}
finviz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
for ticker in tickers:
url = finviz_url + ticker
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'})
resp = urlopen(req)
html = BeautifulSoup(resp, features="lxml")
#print(html)
news_table = html.find(id='news-table')
news_tables[ticker] = news_table
try:
for ticker in tickers:
df = news_tables[ticker]
df_tr = df.findAll('tr')
print ('\n')
print ('Recent News Headlines for {}: '.format(ticker))
for i, table_row in enumerate(df_tr):
a_text = table_row.a.text
td_text = table_row.td.text
td_text = td_text.strip()
print("{0} {1}".format(a_text,td_text))
td_text=str(td_text)
a_text=str(a_text)
# looks specfically for investigation or shareholder alert or investigation alert
result=a_text.find("INVESTIGATION")
result=int(result)
result_1=a_text.find("SHAREHOLDER")
result_1=int(result_1)
result_2=a_text.find("ALERT")
result_2=int(result_2)
result_3=a_text.find("INVESTOR")
result_3=int(result_3)
result_4=a_text.find("NOTICE")
result_4=int(result_4)
if (result>=0 or result_1>=0) and result_2>=0:
print("Fails: Under Investigation")
return True
elif result>=0 :
print("Fails: Under Investigation")
return True
elif result_3>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_4>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result==-1 and result_1==-1 and result_2==-1:
pass
if i == n-1:
break
except AttributeError:
return True
td_text=td_text.split("-",2)
#print("A",td_text)
month=td_text[0]
day=td_text[1]
year=td_text[2]
months={'Jan': 1, 'Feb':2, 'Mar': 3, 'Apr':4,'May':5, 'Jun': 6, 'Jul':7,'Aug':8,'Sep':9,'Oct':10, 'Nov':11,'Dec':12}
month=months.pop(month)
#print("MOnth",month)
#print("Year",year)
#print("Day",day)
hour=year[3:5]
#print("Hour data",hour)
meridian=year[-1:-2]
#print("Meridian",meridian)
year=year[0:2]
year=str(year)
day=str(day)
month=str(month)
year="20"+year
# date we are stripping from web
t=year+'-'+month+'-'+day+'-'+hour
#print(t)
t_hour = datetime.strptime(hour, '%H')
t = datetime.strptime(t, '%Y-%m-%d-%H')
#print(t)
# todays date in datetime object
today =datetime.today().strftime('%Y-%m-%d-%H')
today = datetime.strptime(today, '%Y-%m-%d-%H')
hr_margin=timedelta(hours= 2)
margin = timedelta(days = 1)
#print( "Earnings date:{} ".format(t))
#print("Today:",today)
diff_minus=today - margin
#print(diff_minus)
diff_plus=today + margin
diff_hr_plus= today +hr_margin
diff_hr_minus= today-hr_margin
#print(diff_plus)
#t_hour=t_hour[10::]
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_plus<= t <= diff_plus:
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus<=t_hour <=diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
See how I reduced your given code to a minimal reproducible example with a few steps:
Since you claimed issue with datetime comparison I found one inconsistency:
In day comparison you have:
if diff_plus<= t <= diff_plus:
In hour comparison you have:
if diff_hr_minus<=t_hour <=diff_hr_plus:
Things to fix:
diff_min
strptime
with appropriate format literal:hour
as 24-hour-range integer, using t.hour
from datetime import timedelta
from datetime import datetime
def compared_date_from_td(td_text):
"""Returns datetime for input of format 'Jan-24-22 05:48PM'."""
# date from web
t = datetime.strptime(td_text.strip(), '%b-%d-%y %H:%M%p')
print("Earnings date: {} ".format(t))
# todays date in datetime object
today = datetime.today()
print("Today: {}".format(today))
margin = timedelta(days = 1)
diff_minus = today - margin
diff_plus = today + margin
t_hour = t.hour # get the hour part of datetime t
diff_hr_plus = today.hour + 2
diff_hr_minus = today.hour - 2
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_minus <= t <= diff_plus: # issue fixed! was: diff_plus<= t <= diff_plus
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus <= t_hour <= diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
date_text = 'Jan-24-22 05:48PM'
not_uptodate = compared_date_from_td(date_text)
print("date: {}, compared as not_uptodate => {}".format(date_text, not_uptodate))
This outputs:
Earnings date: 2022-01-24 05:48:00
Today: 2022-01-25 00:54:16.122160
Day is good
Max allowed date 2022-01-26 00:54:16.122160
Min allowed date 2022-01-24 00:54:16.122160
Stripped Datetime 2022-01-24 05:48:00
News is NOT up to date by Hour!! Time :2022-01-25 00:54:16.122160 Story Hit: 2022-01-24 05:48:00
Max allowed Hr 2
Min allowed Hr -2
date: Jan-24-22 05:48PM, compared as not_uptodate => True
Note: Some formatting needs to be adjusted to make sense like Max allowed Hr 2
and so on. Also the comparison returns True
if oudside margins as "NOT up to date" (like in the above example where hour differs more than 2 hours).
Did I get your goal right: Test if datetime is within last 2 hours.
(1) Given a text, which was scraped from web, you would determine the format and parse it to datetime.
(2) Given datetime, you would calculate the timedelta to now. Then test if this is less than 2 hours:
from datetime import datetime, timedelta
scrapedText = 'Jan-24-22 05:48PM'
newsTime = datetime.strptime(scrapedText, '%b-%d-%y %H:%M%p') # (1) parse datetime
if datetime.now() - newsTime < timedelta(hours=2): # (2) whithin last 2 hours
# news is less than 2 hours ago