I tried the code below to get the anchor tag href values and the nested anchor tag text for the following URL: https://www.tradeindia.com/, but it does not generate the expected output. The code below only collects the links from the single start page. Can anyone please suggest a fix?
import concurrent
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
def get_page(url):
    response = requests.get(url)
    return response.content

def extract_links(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True)]
    return links

def process_page(url):
    html_content = get_page(url)
    links = extract_links(html_content)
    return links

def main():
    start_url = 'https://www.tradeindia.com/'

    # Fetch the initial page
    start_page_content = get_page(start_url)

    # Extract links from the initial page
    start_page_links = extract_links(start_page_content)
    all_links = set(start_page_links)

    # Use ThreadPoolExecutor to parallelize the process
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit tasks for processing each link concurrently
        future_to_url = {executor.submit(process_page, url): url for url in start_page_links}

        # Iterate through completed tasks and update the set of all links
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                links_on_page = future.result()
                all_links.update(links_on_page)
            except Exception as e:
                print(f"Error processing {url}: {e}")

    # Print all the extracted links
    print("All Links:")
    print(len(all_links))
    for link in all_links:
        print(link)

if __name__ == "__main__":
    main()
You get the URLs from the start page and add them to all_links, but the code never calls executor.submit() again with those new URLs, so only the first level of links is ever fetched. You need more complex code for this: a loop that submits new tasks for the URLs found in links_on_page and keeps track of the pending futures, plus logic that collects completed results and feeds any newly discovered links back into the executor as new submits. See the sketch below.
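Here is a minimal sketch of that idea (my own example, not a tested drop-in fix): it keeps a set of pending futures, waits for whichever finishes first, and submits new tasks for links it has not seen yet. The crawl and max_pages names, the timeout, and the same-domain filter are my additions.

import concurrent.futures
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

def extract_links(base_url, html):
    soup = BeautifulSoup(html, 'html.parser')
    # resolve relative hrefs against the page URL
    return {urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)}

def process_page(url):
    html = requests.get(url, timeout=10).content
    return extract_links(url, html)

def crawl(start_url, max_pages=50):
    seen = {start_url}
    results = set()
    with ThreadPoolExecutor(max_workers=5) as executor:
        pending = {executor.submit(process_page, start_url)}
        while pending and len(seen) < max_pages:
            # wait until at least one submitted page is finished
            done, pending = concurrent.futures.wait(
                pending, return_when=concurrent.futures.FIRST_COMPLETED)
            for future in done:
                try:
                    links = future.result()
                except Exception as e:
                    print('Error:', e)
                    continue
                results.update(links)
                for link in links:
                    # follow only unseen links on the same domain
                    if urlparse(link).netloc == urlparse(start_url).netloc and link not in seen:
                        seen.add(link)
                        pending.add(executor.submit(process_page, link))
    return results

if __name__ == '__main__':
    all_links = crawl('https://www.tradeindia.com/')
    print(len(all_links))

Note that when the loop stops at max_pages, the executor's context manager still waits for any futures that are already running before the function returns.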
And frankly, I would rather use Scrapy for this, because it already handles concurrent requests and the code is much simpler. Normally Scrapy requires generating a project with many files and folders, but the code below runs without creating a project: you can put everything in one file and run it like any other script with python script.py. It also automatically writes the results to a .csv file.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['www.tradeindia.com']
    start_urls = ['https://www.tradeindia.com/']

    def parse(self, response):
        print('\n>>> url:', response.url, '\n')

        links = response.css('a::attr(href)').extract()

        # yield items which Scrapy will save in the CSV file
        for url in links:
            yield {'url': url}

        # yield requests so Scrapy will process the next pages
        for url in links:
            yield response.follow(url)

# --- run without a project and save results in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'CONCURRENT_REQUESTS': 10,          # default: 16
    #'RANDOMIZE_DOWNLOAD_DELAY': True,  # default: True
    'DOWNLOAD_DELAY': 2,                # delay between requests to simulate a real human - from `0.5*delay` to `1.5*delay`
    #'LOG_LEVEL': 'INFO',               # less information on screen
    'FEEDS': {'output.csv': {'format': 'csv'}},  # save in a CSV, JSON or XML file
})
c.crawl(MySpider)
c.start()
Scrapy also has a LinkExtractor class for this, and even a dedicated CrawlSpider, as in the sketch below.
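A minimal sketch of that variant (my own example, not tested against this site; the MyCrawlSpider name and output_crawlspider.csv filename are placeholders):

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class MyCrawlSpider(CrawlSpider):
    name = 'mycrawlspider'
    allowed_domains = ['www.tradeindia.com']
    start_urls = ['https://www.tradeindia.com/']

    # follow every link inside allowed_domains and pass each response to parse_item
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # yield one item per visited page; CrawlSpider follows the links for you
        yield {'url': response.url}

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output_crawlspider.csv': {'format': 'csv'}},
})
c.crawl(MyCrawlSpider)
c.start()

With CrawlSpider you do not override parse (it is used internally to apply the rules); you put your own logic in a callback such as parse_item.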