Search code examples
python apscheduler

How to login only once when using apscheduler?


I'd like to check for new posts at a fixed interval (using apscheduler) on a site that requires login, and receive a message from a Telegram bot when there is a new one.

import requests
from bs4 import BeautifulSoup
import os
import telegram
import sys
from apscheduler.schedulers.blocking import BlockingScheduler

# Absolute path of the directory containing this script; latest.txt is
# read from and written to this directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def scraping():
    """Fetch the newest post title and notify via Telegram when it changed.

    Logs in to the site, downloads the listing page, and takes the text of
    the first element matching the CSS selector. That title is compared
    with the one stored in ``latest.txt`` (in ``BASE_DIR``). When they
    differ, or when ``latest.txt`` does not exist yet, the title is sent to
    the Telegram chat and ``latest.txt`` is rewritten with it.

    Note: a new ``requests.Session`` is created and the login request is
    sent on every call.

    If no element matches the selector, a message is printed to stderr and
    the run is skipped without touching ``latest.txt``.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    LOGIN_URL = 'Login page url'
    LOGIN_DATA = {
        "user_id": "id",
        "password": "pw",
        "keep_signed": "Y",
    }

    with requests.Session() as s:
        # The login response body is not used; only the session matters.
        s.post(LOGIN_URL, data=LOGIN_DATA, headers=headers)

        url = "address"
        req = s.get(url, headers=headers)
        html = req.text

    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('#css values')
    if not title:
        # select() returned nothing (e.g. login failed or the page layout
        # changed); indexing title[0] would raise IndexError.
        print(
            "scraping: no element matched the title selector; "
            "skipping this run",
            file=sys.stderr,
        )
        return
    latest_title = title[0].text

    latest_path = os.path.join(BASE_DIR, 'latest.txt')
    try:
        with open(latest_path, 'r') as f_read:
            # Read the whole file: the full title is written below, so
            # comparing only the first line would never match a title that
            # contains a newline.
            before = f_read.read()
    except FileNotFoundError:
        # First run: nothing stored yet, so treat the title as new.
        before = None

    if before == latest_title:
        return

    token = "certain value"
    bot = telegram.Bot(token=token)
    chat_id = 'id'
    bot.sendMessage(chat_id=chat_id, text=latest_title)

    # The with-block closes the file; no explicit close() is needed.
    with open(latest_path, 'w') as f_write:
        f_write.write(latest_title)

# Register scraping() as an interval job that fires every 30 seconds.
scheduler = BlockingScheduler()
scheduler.add_job(func=scraping, trigger='interval', seconds=30)

# Hand control to the scheduler so it can start running the job.
scheduler.start()

With this code, the login is also performed at every interval, which is inefficient.
How can I check the posts repeatedly while keeping the session alive with only one login?


Solution

  • I've had a similar issue before, and solved it by storing the session as a pickled object in redis.

    When you need to log in, get the pickled session from redis, unpickle it, and try to use it. If it is no longer a valid session (for example, the API has timed out your login session), create a new session.

    Something along these lines might work:

    import pickle
    import time

    import redis
    
    # Redis client used to store and fetch the pickled session under the
    # "connection" key.
    redis_client = redis.Redis(host='localhost', port=6379, db=0)

    # Current session object; None until one is loaded from redis or
    # created by connect().
    conn = None
    
    def connect():
        """Log in if there is no current session and store it in redis.

        Does nothing when the module-level ``conn`` is already set. Otherwise
        creates a new session, assigns it to ``conn``, and saves a pickled
        copy under the ``"connection"`` key in redis.
        """
        # conn is rebound below, so it must be declared global; without this
        # the `conn is None` check raises UnboundLocalError.
        global conn
        if conn is None:
            conn = ...  # your login code here

            redis_client.set(
                "connection", pickle.dumps(conn)  # your session here
            )
    
    # Try to reuse a previously stored session.
    connection = redis_client.get("connection")

    # NOTE(security): pickle.loads executes arbitrary code from its input.
    # Only do this when the redis instance is trusted and not writable by
    # other parties.
    conn = pickle.loads(connection) if connection else None

    # Logs in only when no stored session was found.
    connect()

    # Make sure the session is actually connected; keep retrying for up to
    # 3 minutes before giving up.
    timeout = time.time() + 60 * 3  # 3 mins from now
    while True:
        try:
            connected = ...  # code to check if you are connected.. for example get a url.
            if not connected:
                raise AssertionError()
            break
        except (AssertionError, ConnectionResetError) as e:
            if time.time() > timeout:
                raise ValueError("Connection failed after timeout.") from e
            time.sleep(30)  # wait 30 sec before retrying
            # Recreate the login: connect() only logs in when conn is None,
            # so drop the stale session first.
            conn = None
            connect()