Search code examples
pythonpandaspycharmkeyerror

Key Error in Python, using Pycharm and Pandas


I'm a novice Python user and am having trouble resolving a key error. I have checked similar questions but am still unable to resolve my issue. Below is the error, followed by the code. I'd appreciate any insights.

Error (in line 61): 

KeyError: "['Major Office Locations', 'Major Office', '# of International Offices', '# of Attorneys', 'Major Departments', '# of U.S offices'] not in index"

Code:

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import re
import concurrent.futures
import sys

URL = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'
pg = 0
info = []
rank = input('Enter Rank\n 2 -All Law Firms\n 20 -IP Law Firms\n 275 - IP boutique law firms\n')
year = input('Which Year?  ')
filename = input('Filename? ')
while True:
    pg += 1
    params = {'rank': rank,
              'year': year,
              'category': 'LBACCompany',
              'pg': pg}
    response = requests.get(URL, params=params)
    if len(json.loads(response.content)) == 0:
        break
    print(pg)
    info = info + json.loads(response.content)


def run(i):
    response = requests.get('http://vault.com' + info[i]['URL'])
    print(i)
    bs = BeautifulSoup(response.content, 'lxml')

    US_OFFICES = bs.find('strong', {'class': 'inlineOnly'}, text='No. of U.S. Offices:').next_sibling.strip()
    info[i]['# of U.S offices'] = US_OFFICES

    INT_OFFICES = bs.find('strong', {'class': 'inlineOnly'}, text='No. of International Offices:').next_sibling.strip()
    info[i]['# of International Offices'] = INT_OFFICES

    MAJOR_OFFICE = bs.find('div', {'class': 'col-lg-12 col-md-4'}).p.text.strip()
    info[i]['Major Office'] = MAJOR_OFFICE

    MAJOR_OFFICE_LOC = bs.find('strong', text='Major Office Locations').parent.p.text.strip()
    info[i]['Major Office Locations'] = MAJOR_OFFICE_LOC

    MAJOR_DEP = bs.find('strong', text='Major Departments').parent.p.text.strip()
    info[i]['Major Departments'] = MAJOR_DEP

    ATT = bs.find('strong', text='Firm Stats').parent.p.text
    ATT = re.search(r'Total No. Attorneys \d\d\d\d:\r\n.*', ATT)
    if ATT is not None:
        ATT = re.search(r'\r\n.*[0-9K+]', ATT.group()).group().strip()
        info[i]['# of Attorneys'] = ATT
    else:
        info[i]['# of Attorneys'] = ''


with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    executor.map(run, list(range(len(info))))

df = pd.DataFrame(data=info)
df = df[['Year', 'Title', 'Rank', '# of U.S offices',
         '# of International Offices', 'Major Office', 'Major Office Locations',
         'Major Departments', '# of Attorneys']]
df.to_excel(filename, index=False)
print('DONE!!!!!!!!!!!!!!!!!!')

Solution

  • It looks like you're trying to set the headers for the dataframe (if I'm guessing right). If that's the case, you can just pass the names in when creating the dataframe by passing in the columns keyword argument, like so:

    df = pd.DataFrame(data=info, columns=['Year', 'Title', 'Rank', '# of U.S offices', '# of International Offices', 'Major Office', 'Major Office Locations', 'Major Departments', '# of Attorneys'])