I'm a novice Python user and am having trouble resolving a key error. I have checked similar questions but am still unable to resolve my issue. Below is the error, followed by the code. I'd appreciate any insights.
Error (in line 61):
KeyError: "['Major Office Locations', 'Major Office', '# of International Offices', '# of Attorneys', 'Major Departments', '# of U.S offices'] not in index"
Code:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import re
import concurrent.futures
import sys
URL = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'
pg = 0
info = []
rank = input('Enter Rank\n 2 -All Law Firms\n 20 -IP Law Firms\n 275 - IP boutique law firms\n')
year = input('Which Year? ')
filename = input('Filename? ')
while True:
pg += 1
params = {'rank': rank,
'year': year,
'category': 'LBACCompany',
'pg': pg}
response = requests.get(URL, params=params)
if len(json.loads(response.content)) == 0:
break
print(pg)
info = info + json.loads(response.content)
def run(i):
response = requests.get('http://vault.com' + info[i]['URL'])
print(i)
bs = BeautifulSoup(response.content, 'lxml')
US_OFFICES = bs.find('strong', {'class': 'inlineOnly'}, text='No. of U.S. Offices:').next_sibling.strip()
info[i]['# of U.S offices'] = US_OFFICES
INT_OFFICES = bs.find('strong', {'class': 'inlineOnly'}, text='No. of International Offices:').next_sibling.strip()
info[i]['# of International Offices'] = INT_OFFICES
MAJOR_OFFICE = bs.find('div', {'class': 'col-lg-12 col-md-4'}).p.text.strip()
info[i]['Major Office'] = MAJOR_OFFICE
MAJOR_OFFICE_LOC = bs.find('strong', text='Major Office Locations').parent.p.text.strip()
info[i]['Major Office Locations'] = MAJOR_OFFICE_LOC
MAJOR_DEP = bs.find('strong', text='Major Departments').parent.p.text.strip()
info[i]['Major Departments'] = MAJOR_DEP
ATT = bs.find('strong', text='Firm Stats').parent.p.text
ATT = re.search(r'Total No. Attorneys \d\d\d\d:\r\n.*', ATT)
if ATT is not None:
ATT = re.search(r'\r\n.*[0-9K+]', ATT.group()).group().strip()
info[i]['# of Attorneys'] = ATT
else:
info[i]['# of Attorneys'] = ''
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
executor.map(run, list(range(len(info))))
df = pd.DataFrame(data=info)
df = df[['Year', 'Title', 'Rank', '# of U.S offices',
'# of International Offices', 'Major Office', 'Major Office Locations',
'Major Departments', '# of Attorneys']]
df.to_excel(filename, index=False)
print('DONE!!!!!!!!!!!!!!!!!!')
It looks like you're trying to set the headers for the dataframe (if I'm guessing right). If that's the case, you can just pass the names in when creating the dataframe by passing in the columns
keyword argument, like so:
df = pd.DataFrame(data=info, columns=['Year', 'Title', 'Rank', '# of U.S offices', '# of International Offices', 'Major Office', 'Major Office Locations', 'Major Departments', '# of Attorneys'])