Search code examples
python-3.x, beautifulsoup, python-requests, web-crawler, session-cookies

500 status error on python web crawler for cubetutor (magic the gathering site)


This is my code that was working:


import requests
from bs4 import BeautifulSoup as bs4
# Session cookie copied by hand from a browser session on the site.
cookies = {
    'JSESSIONID': '15EA1C17E103E8206BAFFF73FA157231',
}

# Headers sent with the form POST. The Referer embeds the same session id
# as the JSESSIONID cookie above, so both must be replaced together.
headers = {
    'Pragma': 'no-cache',
    'Origin': 'http://www.cubetutor.com',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Cache-Control': 'no-cache',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': 'http://www.cubetutor.com/topcardsbyset/1;jsessionid=15EA1C17E103E8206BAFFF73FA157231',
}

# Form fields posted to the topcardsform endpoint.
# 't:formdata' is an opaque value copied from the browser request;
# 'setSelect' presumably picks the card set to list -- TODO confirm.
data = {
  't:ac': '1',
  't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
  'setSelect': '10E',
  't:zoneid': 'topCardsZone'
}

# POST the form, take the HTML fragment stored under the 'content' key of
# the JSON reply, and parse that fragment with the lxml parser.
response = bs4(requests.post('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, cookies=cookies, data=data).json()['content'], 'lxml')
# Split the fragment's text into lines; the first line becomes the label
# under which everything else is grouped.
text = response.text.split("\n")
file = text.pop(0)
link = response.find_all("a")
# Pair each remaining text line with the <a> tag at the same index.
# NOTE(review): assumes there are at least as many text lines as links;
# text[i] raises IndexError otherwise, and repeated lines overwrite each other.
arr = {file : {}}
for i in range(len(link)):
    arr[file][text[i]] = link[i]

# Print the label, then every "text line : link tag" pair.
print(file)
for i in arr[file]:
    print(i," : ", arr[file][i])

Everything from text = response.text.split("\n") onward is not important (it is unfinished and not where the issue is). The code above works fine, but it requires me to go to the site, look at the network requests/headers, and copy all of this information every time. I am trying to automate the process a bit, but the second I try to create my own JSESSIONID, so that I don't have to keep going to the site, it gives me a 500 status error. At first I thought it might be because the JSESSIONID and the headers didn't match, so I changed the headers to also equal session.headers, as you can see below, but that didn't fix anything. The code is at https://github.com/icarus612/spiderPY-magicTG/ if you want to look at the other commits. Any help would be great.

import requests
from bs4 import BeautifulSoup as soup

# Form fields for the topcardsform request, identical to the ones used by
# the working script; 't:formdata' is an opaque value copied from the browser.
data = {
  't:ac': '1',
  't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
  'setSelect': '10E',
  't:zoneid': 'topCardsZone'
}

# Open a session and load the home page so that any cookies the server
# sets are stored on the session.
session = requests.Session()
r = session.get('http://www.cubetutor.com')
cookies = session.cookies.get_dict()
# This is the session's own header mapping (not a copy), so the
# assignments below also change the session's default headers.
headers = session.headers
# Raises KeyError if the home page response did not set a JSESSIONID cookie.
headers['Referer'] =  f"http://www.cubetutor.com/topcardsbyset/1;jsessionid={cookies['JSESSIONID']}"
headers['Pragma'] = 'no-cache'
headers['Origin'] = 'http://www.cubetutor.com'
headers['X-Requested-With'] = 'XMLHttpRequest'

print(r)
# NOTE(review): this sends a GET carrying a form body, whereas the working
# script submits the same URL with requests.post; the Content-Type and
# User-Agent headers from the working script are also not set here.
response = session.get('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, data=data, cookies=cookies)
print(response)
#text = response.find_all(class_='compareCubeColumn').text
#link = response.find_all("a")


Solution

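  • This seems to work (I didn't include the bs4 parsing). Unlike your second attempt, which calls session.get, it submits the form with requests.post, as your original working script did, and it fetches a fresh session cookie first: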

    import requests
    
    def get_session_cookie(timeout=10):
        """Fetch the cubetutor home page and return the cookies it sets.

        Args:
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script indefinitely.

        Returns:
            A dict mapping cookie names to values, or None when the response
            status is not OK or the response set no cookies.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        resp = requests.get("https://www.cubetutor.com", timeout=timeout)
        if resp.ok and resp.cookies:
            return resp.cookies.get_dict()
        # Explicit so callers can see that "no cookie" is a possible result.
        return None
    
    
    def post_query(timeout=10):
        """Submit the top-cards form and return the decoded JSON reply.

        A fresh session cookie is obtained via get_session_cookie() and sent
        with the POST. The Referer header is built from that same session id
        rather than from an id pasted out of an old browser session.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The JSON-decoded response body (the HTML fragment is stored under
            its 'content' key).

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
            ValueError: If the response body is not valid JSON.
        """
        session_cookie = get_session_cookie()

        # Keep the Referer consistent with the cookie actually being sent;
        # fall back to the bare page URL when no session id was issued.
        referer = 'https://www.cubetutor.com/topcardsbyset/1'
        session_id = (session_cookie or {}).get('JSESSIONID')
        if session_id:
            referer = f'{referer};jsessionid={session_id}'

        headers = {
            'Pragma': 'no-cache',
            'Origin': 'https://www.cubetutor.com',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Cache-Control': 'no-cache',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Referer': referer,
        }
        # 't:formdata' is an opaque value copied from the browser request.
        data = {
            't:ac': '1',
            't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
            'setSelect': '10E',
            't:zoneid': 'topCardsZone',
        }
        resp = requests.post(
            'https://www.cubetutor.com/topcardsbyset.topcardsform',
            headers=headers,
            cookies=session_cookie,
            data=data,
            timeout=timeout,
        )
        return resp.json()
    
    # resp = post_query()
    # print(resp)
    # {u'content': u"<div class='centeredContainer'><div class='compa ...