I'd like to check the new posts every set time (using apscheduler) on the site where needs to be logged in and receive messages from telegram bot.
import requests
from bs4 import BeautifulSoup
import os
import telegram
import sys
from apscheduler.schedulers.blocking import BlockingScheduler
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
def scraping():
headers = {'User-Agent':'Mozilla/5.0'}
LOGIN_URL = 'Login page url'
LOGIN_DATA = {
"user_id":"id",
"password":"pw",
"keep_signed":"Y"
}
with requests.Session() as s:
login_req = s.post(LOGIN_URL, data=LOGIN_DATA, headers=headers)
url = "address"
req = s.get(url, headers=headers)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
title = soup.select('#css values')
latest_title = title[0].text
token = "certain value"
bot = telegram.Bot(token=token)
chat_id = 'id'
with open(os.path.join(BASE_DIR, 'latest.txt'), 'r+') as f_read:
before = f_read.readline()
if before != latest_title:
bot.sendMessage(chat_id=chat_id, text= latest_title)
f_read.close()
with open(os.path.join(BASE_DIR, 'latest.txt'), 'w+') as f_write:
f_write.write(latest_title)
f_write.close()
scheduler = BlockingScheduler()
scheduler.add_job(scraping, 'interval', seconds=30)
scheduler.start()
With this code, the login process is also included in every interval and it's inefficient.
How can I check the posts repeatedly but keep the session alive with only one login?
I've had a similar issue before, and solved it by storing the session as a pickled object in redis.
When you try to login, get the pickled session, unpickle, and then try to use it. If it is no longer a valid session (for example, they time out your login session on the api), then create a new session.
something along these lines might work:
import pickle
import redis
redis_client = redis.Redis(host='localhost', port=6379, db=0)
conn = None
def connect(self):
if conn is None:
conn = # your login code here
redis_client.set(
"connection", pickle.dumps(# your session here)
)
connection = redis_client.get("connection")
conn = pickle.loads(connection) if connection else None
connect()
# make connection is not already connected.
timeout = time.time() + 60 * 3 # 3 mins from now
while True:
try:
connected = # code to check if you are connected.. for example get a url.
if not connected:
raise AssertionError()
break
except (AssertionError, ConnectionResetError) as e:
if time.time() <= timeout:
time.sleep(30) # wait 30 sec before retrying
# recreate login
connect()
continue
elif time.time() > timeout:
raise ValueError("Connection failed after timeout.")
else:
raise e