Search code examples
python, python-3.x, csv, export-to-csv, python-requests-html

Python CSV Writer only writing last scraped item processed


My scraper is only writing the last two items — those from the last page it processed — to the CSV file. I cannot figure out where I am going wrong; it prints the output perfectly fine. Maybe an experienced set of eyes will be able to help.

Code Below:

from requests_html import HTMLSession
import csv
import time


 def get_links(url):
    """Return the product-detail links found on one listing page.

    NOTE(review): as pasted, this snippet will not run -- the ``def`` line is
    indented one space and the loop body is over-indented.
    """
    # Relies on a module-level ``_session`` created later in the __main__ block.
    _request = _session.get(url)
    items = _request.html.find('li.product-grid-view.product.sale')
    links = []
    for item in items:
         # First <a> inside each on-sale product tile is the detail-page link.
         links.append(item.find('a', first=True).attrs['href'])

   # print(len(links))

    return links


 def get_product(link):
     """Scrape one product page into a dict (Title/Price/SKU/Categories/Brand).

     NOTE(review): as pasted, this snippet will not run -- the indentation
     drifts between 5, 6 and 7 spaces inside the body.
     """
     _request = _session.get(link)

      title = _request.html.find('h2', first=True).full_text
      # presumably [1] skips a struck-through "regular" price on sale items -- TODO confirm
      price = _request.html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text
      sku = _request.html.find('span.sku', first=True).full_text
      categories = _request.html.find('span.posted_in', first=True).full_text.replace('Categories:', "").strip()
      # Second span.posted_in holds the brand; the first holds the categories.
      brand = _request.html.find('span.posted_in')[1].full_text.replace('Brand:', "").strip()
      #print(brand)

       product = {
         'Title': title,
         'Price': price,
         'SKU': sku,
         'Categories': categories,
         'Brand': brand
       }

    #print(product)
     return product


if __name__ == '__main__':
    for page in range(1, 4):

        url = 'https://www.thebassplace.com/product-category/basses/4-string/'
    
        if page == 1:
           parse_url = url
        else:
            parse_url = f'https://www.thebassplace.com/product-category/basses/4-string/page/{page}/'

       _session = HTMLSession()

        links = get_links(parse_url)
        # BUG: ``results`` is re-created on every iteration of the page loop,
        # so only the items scraped from the final page survive it.
        results = []

        for link in links:
            results.append(get_product(link))
            time.sleep(1)
            #print(len(results))


# NOTE(review): this write runs once, after the loop -- but because of the
# reset above, ``results`` only holds the last page's items at this point.
with open('on_sale_bass.csv', 'w', newline='', encoding='utf-8') as csv_file:
    
    writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())
    writer.writeheader()

    for row in results:
        writer.writerow(row)

When I open the file in append mode instead, records from every page do get written to the CSV, but the header row is repeated for each page iteration.


Solution

  • The problem was in the statement results = [], inside the range loop. You emptied the results on each iteration of the range(1, 4) loop. Thus, you were getting only what the last iteration brought in.

    Note: I kept `_session` as a global here, but in this case it would be reasonable, in my opinion (feel free to correct me), to simply pass it between the functions. Now, try this out.

    from requests_html import HTMLSession
    import csv
    import time
    
    
    def get_links(url):
        """Collect product-detail URLs from one listing page.

        Args:
            url: URL of a listing page to fetch.

        Returns:
            list[str]: ``href`` of the first ``<a>`` inside each on-sale
            product tile found on the page.
        """
        # Reading the module-level ``_session`` needs no ``global`` statement
        # (that is only required for assignment), so it was dropped here.
        response = _session.get(url)
        tiles = response.html.find('li.product-grid-view.product.sale')
        return [tile.find('a', first=True).attrs['href'] for tile in tiles]
    
    
    def get_product(link):
        """Scrape one product-detail page into a flat record.

        Args:
            link: URL of the product page.

        Returns:
            dict: keys ``Title``, ``Price``, ``SKU``, ``Categories``, ``Brand``.

        Note:
            Assumes the page has at least two price ``bdi`` elements and two
            ``span.posted_in`` elements (categories first, brand second) --
            TODO confirm this holds for every product on the site.
        """
        # ``global`` dropped: the module-level ``_session`` is only read here.
        html = _session.get(link).html
        # Query span.posted_in once instead of twice; [0] == first=True result.
        posted = html.find('span.posted_in')
        return {
            'Title': html.find('h2', first=True).full_text,
            'Price': html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text,
            'SKU': html.find('span.sku', first=True).full_text,
            'Categories': posted[0].full_text.replace('Categories:', "").strip(),
            'Brand': posted[1].full_text.replace('Brand:', "").strip(),
        }
    
    
    if __name__ == '__main__':
        # Listing base URL; page 1 is the bare URL, later pages use /page/N/.
        base_url = 'https://www.thebassplace.com/product-category/basses/4-string/'

        # One session for the whole run: reuses connections instead of
        # constructing a fresh HTMLSession on every page iteration.
        _session = HTMLSession()

        # Accumulate across ALL pages -- resetting this inside the loop was
        # the original bug (only the last page's items survived).
        results = []
        for page in range(1, 4):
            parse_url = base_url if page == 1 else f'{base_url}page/{page}/'
            for link in get_links(parse_url):
                results.append(get_product(link))
                # time.sleep(1)  # uncomment to rate-limit product requests

        # Guard: results[0] below would raise IndexError on an empty scrape.
        if results:
            with open('on_sale_bass.csv', 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())
                writer.writeheader()
                writer.writerows(results)
    

    What I get as an example:

    (screenshot of the resulting CSV opened in a spreadsheet, showing all pages' rows under a single header)