I'm working on a web crawler which should work like this:
The code below seems to be somewhat working: when I try to crawl some sites, I do get some images to download.
(However, I don't understand the images I get, because I can't find them on the website itself; it seems the crawler does not start with the start page of the website.)
After a few images (~25-500), the crawler is done and stops, with no errors; it just stops. I tried this with multiple websites, and after a few images it always stops. I think the crawler somehow ignores step 3.
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "strconv"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

var (
    currWebsite  string = "https://www.youtube.com"
    imageCount   int    = 0
    crawlWebsite string
)

func processElement(index int, element *goquery.Selection) {
    href, exists := element.Attr("href")
    if exists && strings.HasPrefix(href, "http") {
        crawlWebsite = href
        response, err := http.Get(crawlWebsite)
        if err != nil {
            log.Fatalf("error on current website")
        }
        defer response.Body.Close()

        document, err := goquery.NewDocumentFromReader(response.Body)
        if err != nil {
            log.Fatal("Error loading HTTP response body.", err)
        }

        document.Find("img").Each(func(index int, element *goquery.Selection) {
            imgSrc, exists := element.Attr("src")
            if strings.HasPrefix(imgSrc, "http") && exists {
                fileName := fmt.Sprintf("./images/img" + strconv.Itoa(imageCount) + ".jpg")
                currWebsite := fmt.Sprint(imgSrc)
                fmt.Println("[+]", currWebsite)
                DownloadFile(fileName, currWebsite)
                imageCount++
            }
        })
    }
}

func main() {
    err := os.MkdirAll("./images/", 0777)
    if err != nil {
        log.Fatalln("error on creating directory")
    }

    response, err := http.Get(currWebsite)
    if err != nil {
        log.Fatalln("error on searching website")
    }
    defer response.Body.Close()

    document, err := goquery.NewDocumentFromReader(response.Body)
    if err != nil {
        log.Fatalln("Error loading HTTP response body. ", err)
    }

    document.Find("a").Each(processElement)
}

func DownloadFile(filepath string, url string) {
    response, err := http.Get(url)
    if err != nil {
        log.Fatalln("error getting the website infos")
    }
    defer response.Body.Close()

    if response.StatusCode != 200 {
        log.Fatalln("received non 200 response code")
    }

    file, err := os.Create(filepath)
    if err != nil {
        log.Fatalf("error creating file at %v\n", filepath)
    }
    defer file.Close()

    _, err = io.Copy(file, response.Body)
    if err != nil {
        log.Fatalln("error copy file from src to dst")
    }
}
"(However, I don't understand the images I get, because I can't find them on the website itself; it seems the crawler does not start with the start page of the website.)"
Yes, you are right. Your code will not download images from the start page, because the only thing it fetches from the start page is the set of anchor (a) elements; it then calls processElement() for each anchor element found on the start page -
response, err := http.Get(currWebsite)
if err != nil {
log.Fatalln("error on searching website")
}
defer response.Body.Close()
document, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatalln("Error loading HTTP response body. ", err)
}
document.Find("a").Each(processElement) // Here
To also download the images from the start page, you should define another function processUrl() that does the work of fetching the img elements and downloading the images. Then, in the processElement() function, you only need to get the href link and invoke processUrl() on that link -
func processElement(index int, element *goquery.Selection) {
    href, exists := element.Attr("href")
    if exists && strings.HasPrefix(href, "http") {
        crawlWebsite = href
        processUrl(crawlWebsite)
    }
}

func processUrl(crawlWebsite string) {
    response, err := http.Get(crawlWebsite)
    if err != nil {
        log.Fatalf("error on current website")
    }
    defer response.Body.Close()

    document, err := goquery.NewDocumentFromReader(response.Body)
    if err != nil {
        log.Fatal("Error loading HTTP response body.", err)
    }

    document.Find("img").Each(func(index int, element *goquery.Selection) {
        imgSrc, exists := element.Attr("src")
        if strings.HasPrefix(imgSrc, "http") && exists {
            fileName := fmt.Sprintf("./images/img" + strconv.Itoa(imageCount) + ".jpg")
            currWebsite := fmt.Sprint(imgSrc)
            fmt.Println("[+]", currWebsite)
            DownloadFile(fileName, currWebsite)
            imageCount++
        }
    })
}
Now just crawl the images from the start page before processing all the links -
func main() {
    ...
    document, err := goquery.NewDocumentFromReader(response.Body)
    if err != nil {
        log.Fatalln("Error loading HTTP response body. ", err)
    }

    // First crawl images from start page url
    processUrl(currWebsite)

    document.Find("a").Each(processElement)
}