I'm trying to build a web scraper in Go. I'm fairly new to the language and I'm not sure what I'm doing wrong with the HTML parser: I'm trying to parse the HTML to find anchor tags, but I keep getting html.TokenTypeEnd instead.
package main
import (
"fmt"
"golang.org/x/net/html"
"io/ioutil"
"net/http"
)
// GetHtml downloads the page at url and returns its body as a string.
//
// Only the Employee Console URL is recognised. For any other url a notice is
// printed and the zero values ("", nil, nil) are returned.
//
// The response body is read to EOF and closed before GetHtml returns, so the
// returned resp is useful for its status and headers only; parse the returned
// text rather than resp.Body. On failure text is empty and err is non-nil;
// resp is nil when the request itself failed.
func GetHtml(url string) (text string, resp *http.Response, err error) {
	if url != "https://www.coastal.edu/scs/employee" {
		fmt.Println("Issue with finding URL. Looking for: " + url)
		return "", nil, nil
	}

	resp, err = http.Get(url)
	if err != nil {
		fmt.Println("There seems to ben an error with the Employee Console.")
		// Stop here: resp is nil after a failed request, so touching
		// resp.Body would panic.
		return "", nil, err
	}
	// Always release the underlying connection, including on a read error.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Cannot read byte response from Employee Console.")
		return "", resp, err
	}
	return string(body), resp, nil
}
// main fetches the Employee Console page through GetHtml, then runs an HTML
// tokenizer over the response body, printing the Go type of every token and
// announcing each <a> start tag it sees.
// NOTE(review): the closing brace of main is not present in this excerpt.
func main() {
htmlSrc, response, err := GetHtml("https://www.coastal.edu/scs/employee")
if err != nil {
// The failure is only reported; execution continues and response is still
// used below.
fmt.Println("Cannot read HTML source code.")
}
// htmlSrc, the page text GetHtml already read, is deliberately discarded.
_ = htmlSrc
// The tokenizer reads from response.Body, which GetHtml has already read to
// the end with ioutil.ReadAll, so there is nothing left for it to consume.
htmlTokens := html.NewTokenizer(response.Body)
// i is a stop flag: the loop runs until the ErrorToken case increments it.
i := 0
for i < 1 {
tt := htmlTokens.Next()
// Prints the Go type of tt, not the token itself, and adds no newline, so the
// next Println output is appended directly to it.
fmt.Printf("%T", tt)
switch tt {
case html.ErrorToken:
// No further tokens can be produced; report it and let the loop end.
fmt.Println("End")
i++
case html.TextToken:
// Prints the token type value tt, not the text content of the token.
fmt.Println(tt)
case html.StartTagToken:
// Materialise the current token so its tag name (Data) can be inspected.
t := htmlTokens.Token()
isAnchor := t.Data == "a"
if isAnchor {
fmt.Println("We found an anchor!")
}
}
}
I'm getting html.TokenTypeEnd whenever I'm printing
fmt.Printf("%T", tt)
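The application reads to the end of the body in GetHtml.
The tokenizer therefore returns html.ErrorToken on its first call to Next, because the read on the already-drained body returns EOF.
The output html.TokenTypeEnd is the type name html.TokenType printed by fmt.Printf("%T", tt) (which adds no newline), immediately followed by the "End" printed in the ErrorToken case.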
Use this code:
htmlTokens := html.NewTokenizer(strings.NewReader(htmlSrc))
to create the tokenizer.
Also, close the response body in GetHtml
to prevent a connection leak.
The code can be simplified to:
// Fetch the page and tokenize the body directly as it is read, so the
// tokenizer is the only reader of response.Body.
response, err := http.Get("https://www.coastal.edu/scs/employee")
if err != nil {
	log.Fatal(err)
}
defer response.Body.Close()

htmlTokens := html.NewTokenizer(response.Body)

// Walk the token stream until the tokenizer reports ErrorToken, which is the
// point where it can produce no more tokens (for example, the body hit EOF).
for done := false; !done; {
	kind := htmlTokens.Next()
	// Print the Go type of the token kind (no trailing newline).
	fmt.Printf("%T", kind)

	if kind == html.ErrorToken {
		fmt.Println("End")
		done = true
	} else if kind == html.TextToken {
		fmt.Println(kind)
	} else if kind == html.StartTagToken {
		// Only start tags carry the element name being searched for.
		if tok := htmlTokens.Token(); tok.Data == "a" {
			fmt.Println("We found an anchor!")
		}
	}
}