Search code examples
go, web-scraping, html-parsing

How can I parse a value in a span from a web page?


I am trying to scrape a list of the names of the top products from an e-commerce site. However, the result is empty, and I want to know what is missing. The output is: "Visiting: https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/" followed by "End of scraping: https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/".

code:

package main

import (
    "encoding/csv"
    "fmt"
    "log"
    "os"

    "github.com/gocolly/colly"
)

func main() {
    fetchURL := "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/"
    fileName := "results.csv"
    file, err := os.Create(fileName)
    if err != nil {
        log.Fatal("ERROR: Could not create file %q: %s\n", fileName, err)
        return
    }
    defer file.Close()
    writer := csv.NewWriter(file)
    defer writer.Flush()


    writer.Write([]string{"Sl. No."})


    c := colly.NewCollector()


    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting: ", r.URL)
    })

    c.OnHTML(`.a-section a-spacing-none aok-relative`, func(e *colly.HTMLElement) {
        number := e.ChildText(".zg-badge-text")
        name := e.ChildText(".p13n-sc-truncated")

        writer.Write([]string{
            number,
            name,

    })


    c.Visit(fetchURL)
    fmt.Println("End of scraping: ", fetchURL)
}

Solution

  • You need to add the User-Agent header to the request in order for the site to return data. Also, it seems that p13n-sc-truncated is a generated class name, so it is not a reliable selector. You can use the following, for example:

    package main
    
    import (
        "log"
        "strings"
        "github.com/gocolly/colly"
    )
    
    // AmazonData holds the values recorded for a single scraped list item.
    type AmazonData struct {
        Index int    // position assigned to the item as it is collected
        Link  string // value of the item's anchor href attribute
        Title string // text associated with the item's anchor
    }
    
    // main visits the Amazon India electronics best-sellers page, collects the
    // link and title of every item under #zg-ordered-list, and logs the result.
    func main() {
        const pageURL = "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0/"

        c := colly.NewCollector()

        var data []AmazonData

        c.OnHTML(`#zg-ordered-list`, func(e *colly.HTMLElement) {
            e.ForEach("li .zg-item", func(_ int, elem *colly.HTMLElement) {
                link := elem.DOM.Find("a")
                // A missing href is tolerated: the item is still recorded,
                // just with an empty Link.
                linkHref, _ := link.Attr("href")
                data = append(data, AmazonData{
                    // Items are numbered from 1 in the order they are appended.
                    Index: len(data) + 1,
                    Link:  linkHref,
                    Title: strings.TrimSpace(link.Find("div").Text()),
                })
            })
            log.Println(data)
        })

        // Send a browser-like User-Agent on every request instead of the
        // collector's default one.
        c.OnRequest(func(r *colly.Request) {
            r.Headers.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
        })

        // Report a failed request instead of exiting silently with no output.
        if err := c.Visit(pageURL); err != nil {
            log.Fatalf("visiting %s: %v", pageURL, err)
        }
    }