I'm trying to use this code to scrape IPOs data from nasdaq webpage.
The code can scrap, but the result in my DataFrame is NaN
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
from datetime import datetime
# Define dates
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 5, 31)
dates = pd.period_range(start_date, end_date, freq='M')
# Create an empty DataFrame
df = pd.DataFrame(columns=['Company Name', 'Symbol', 'Market', 'Price', 'Shares'])
# Set the URL and headers
url = 'https://www.nasdaq.com/markets/ipos/activity.aspx?tab=pricings&month=%s'
headers = {'User-Agent': 'non-profit learning project'}
# Scrape IPO data for each date
for idx in dates:
print(f'Fetching data for {idx}')
result = requests.get(url % idx, headers=headers)
sleep(30)
content = result.content
if 'There is no data for this month' not in str(content):
table = pd.read_html(content)[0]
print(table)
df = pd.concat([df, table], ignore_index=True)
soup = BeautifulSoup(content, features="lxml")
links = soup.find_all('a', id=re.compile('two_column_main_content_rptPricing_company_\d'))
print(f"Length of table vs length of links: {table.shape[0] - len(links)}")
for link in links:
df['Link'].append(link['href'])
# Print the resulting DataFrame
print(df)
##This is the result:
Fetching data for 2023-01
Unnamed: 0 Unnamed: 1
0 NaN NaN
Length of table vs length of links: 1
Fetching data for 2023-02
Unnamed: 0 Unnamed: 1
0 NaN NaN
Length of table vs length of links: 1
Fetching data for 2023-03
Unnamed: 0 Unnamed: 1
0 NaN NaN
Length of table vs length of links: 1
Fetching data for 2023-04
Unnamed: 0 Unnamed: 1
0 NaN NaN
Length of table vs length of links: 1
Fetching data for 2023-05
Unnamed: 0 Unnamed: 1
0 NaN NaN
Length of table vs length of links: 1
Company Name Symbol Market Price Shares Unnamed: 0 Unnamed: 1
0 NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN
It seems that the code successfully fetched data for each month within the specified date range. However, there are some issues with the resulting DataFrame, as indicated by the presence of NaN values in the columns.
I want the data of the IPOs to make a model, any ideas how could that be achieved? Thanks
Don't parse HTML content but use the public API:
import pandas as pd
import requests
url = 'https://api.nasdaq.com/api/ipo/calendar'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'}
start_date = '2023-1-1'
end_date = '2023-5-31'
periods = pd.period_range(start_date, end_date, freq='M')
dfs = []
for period in periods:
data = requests.get(url, headers=headers, params={'date': period}).json()
df = pd.json_normalize(data['data']['priced'], 'rows')
dfs.append(df)
df = pd.concat(dfs, ignore_index=True)
Output:
>>> df
dealID proposedTickerSymbol companyName proposedExchange proposedSharePrice sharesOffered pricedDate dollarValueOfSharesOffered dealStatus
0 1225815-104715 BREA Brera Holdings PLC NASDAQ Capital 5.00 1,705,000 1/27/2023 $8,525,000 Priced
1 890697-104848 TXO TXO Energy Partners, L.P. NYSE 20.00 5,000,000 1/27/2023 $100,000,000 Priced
2 405880-103426 GNLX GENELUX CORP NASDAQ Capital 6.00 2,500,000 1/26/2023 $15,000,000 Priced
3 1241592-105143 QSG QuantaSing Group Ltd NASDAQ Global 12.50 3,250,000 1/25/2023 $40,625,000 Priced
4 1225290-104329 CVKD Cadrenal Therapeutics, Inc. NASDAQ Capital 5.00 1,400,000 1/20/2023 $7,000,000 Priced
.. ... ... ... ... ... ... ... ... ...
64 1210259-102635 SGE Strong Global Entertainment, Inc. NYSE MKT 4.00 1,000,000 5/16/2023 $4,000,000 Priced
65 1254469-106197 SLRN ACELYRIN, Inc. NASDAQ Global Select 18.00 30,000,000 5/05/2023 $540,000,000 Priced
66 1239799-104989 ALCYU Alchemy Investments Acquisition Corp 1 NASDAQ Global 10.00 10,000,000 5/05/2023 $100,000,000 Priced
67 1243360-105271 KVUE Kenvue Inc. NYSE 22.00 172,812,560 5/04/2023 $3,801,876,320 Priced
68 1190851-101486 GODNU Golden Star Acquisition Corp NASDAQ Global 10.00 6,000,000 5/02/2023 $60,000,000 Priced
[69 rows x 9 columns]