Tags: python, text, beautifulsoup

Read URLs from a text file


Hello, I want to put multiple URLs in a text file and read them one by one with the code below. For example, I want to run the script to extract data from 10 links, not just from 1. How should the code be written? Thank you so much for your help.

import requests
from bs4 import BeautifulSoup
import csv

final_data = []
url = "https://denver.craigslist.org/search/cto?purveyor-input=owner&postedToday=1"
r = requests.get(url)
data = r.text

soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all(class_="result-row")

for details in get_details:
    getclass = details.find_all(class_="hdrlnk")
    for link in getclass:
        link1 = link.get("href")
        sublist = []
        sublist.append(link1)
        final_data.append(sublist)
print(final_data)

filename = "link.txt"
with open("./"+filename, "w") as csvfile:
    csvfile = csv.writer(csvfile, delimiter = ",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])

Solution

  • If you have every URL on a new line, then simply open the file, read all of the text, and split it on \n to get a list of lines (without the \n):

    with open('input.txt') as fh:
        text = fh.read()
        all_links = text.split('\n')
    

    Or shorter:

    with open('input.txt') as fh:
        all_links = fh.read().split('\n')
    
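    A small aside: split('\n') leaves an empty string at the end of the list if the file ends with a newline; the built-in str.splitlines() avoids that:

    with open('input.txt') as fh:
        all_links = fh.read().splitlines()  # no empty string for a trailing newline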

    Then you have to use a for-loop to run the code for all of the URLs:

    # - before loop -
    
    final_data = []
    
    # - loop -
    
    for url in all_links:
    
        # ... code ...
    
    # - after loop -
    
    print(final_data)
    
    # ... write in csv ...
    

    EDIT:

    import requests
    from bs4 import BeautifulSoup
    import csv
    
    # - before loop -
    
    #all_links = [
    #    "https://denver.craigslist.org/search/cto?purveyor-input=owner&postedToday=1",
    #]
    
    with open('input.txt') as fh:
        # skip blank lines so a trailing newline doesn't produce an empty URL
        all_links = [line.strip() for line in fh if line.strip()]
    
    final_data = []
    
    # - loop -
    
    for url in all_links:
        print('url:', url)
        
        response = requests.get(url)
        #print('[DEBUG] code:', response.status_code)
    
        soup = BeautifulSoup(response.text, "html.parser")
        all_rows = soup.find_all(class_="result-row")
    
        for row in all_rows:
        row_links = row.find_all(class_="hdrlnk")  # new name, so the outer `all_links` isn't overwritten
            for link in row_links:
                href = link.get("href")
                final_data.append( [href] )
                print('   >', href)
                
        print('----------')
        
    # - after loop -
    
    #print(final_data)
    
    filename = "output.csv"   # no need to add `./` 
    
    with open(filename, "w", newline="") as csv_file:  # newline="" avoids blank rows on Windows
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow( ["links"] )
        csv_writer.writerows( final_data )  # with `s` at the end
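
    For completeness, input.txt simply holds one search URL per line. For example (the second URL is only an illustrative placeholder for another city, not taken from the question):

    https://denver.craigslist.org/search/cto?purveyor-input=owner&postedToday=1
    https://boulder.craigslist.org/search/cto?purveyor-input=owner&postedToday=1

    And if some of the URLs might fail to load, a minimal sketch of a guard inside the loop (building on the status code that the commented-out debug line already prints) could look like:

    response = requests.get(url)
    if response.status_code != 200:   # skip pages that didn't load correctly
        print('skipped, status:', response.status_code)
        continue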