[go-nuts] Simple web crawler question. How to avoid crawling visited links?

gdzack Sun, 24 Sep 2017 07:13:10 -0700

Hi I am learning Golang concurrency and trying to build a simple Website 
crawler. I managed to crawl all the links of the pages of any depth of 
website. But I still have one problem to tackle: how to avoid crawling 
visited links that are previously crawled?


Here is my code. Hope you guys can shed some light. Thank you in advance.

package main
import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html")

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://";) {
        url = "http://"; + url
    }

    for link := range newCrawl(url, 1) {
        fmt.Println(link)
    }}

func newCrawl(url string, num int) chan string {
    ch := make(chan string, 20)

    go func() {
        crawl(url, 1, ch)
        close(ch)
    }()

    return ch}

func crawl(url string, n int, ch chan string) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }

    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)

    nextN := n - 1
    for {
        token := z.Next()

        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }

            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                done := make(chan struct{})
                go func() {
                    crawl(result, nextN, ch)
                    close(done)
                }()
                <-done
                ch <- result
            }
        }
    }}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return}

-- 
You received this message because you are subscribed to the Google Groups 
"golang-nuts" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to golang-nuts+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

[go-nuts] Simple web crawler question. How to avoid crawling visited links?

Reply via email to