Hi, I am learning Go concurrency and trying to build a simple website crawler. I have managed to crawl the links on a site's pages down to any depth, but I still have one problem to tackle: how do I avoid re-crawling links that have already been visited?
Here is my code. Hope you guys can shed some light. Thank you in advance. package main import ( "fmt" "log" "net/http" "os" "strings" "golang.org/x/net/html") func main() { if len(os.Args) != 2 { fmt.Println("Usage: crawl [URL].") } url := os.Args[1] if !strings.HasPrefix(url, "http://") { url = "http://" + url } for link := range newCrawl(url, 1) { fmt.Println(link) }} func newCrawl(url string, num int) chan string { ch := make(chan string, 20) go func() { crawl(url, 1, ch) close(ch) }() return ch} func crawl(url string, n int, ch chan string) { if n < 1 { return } resp, err := http.Get(url) if err != nil { log.Fatalf("Can not reach the site. Error = %v\n", err) os.Exit(1) } b := resp.Body defer b.Close() z := html.NewTokenizer(b) nextN := n - 1 for { token := z.Next() switch token { case html.ErrorToken: return case html.StartTagToken: current := z.Token() if current.Data != "a" { continue } result, ok := getHrefTag(current) if !ok { continue } hasProto := strings.HasPrefix(result, "http") if hasProto { done := make(chan struct{}) go func() { crawl(result, nextN, ch) close(done) }() <-done ch <- result } } }} func getHrefTag(token html.Token) (result string, ok bool) { for _, a := range token.Attr { if a.Key == "href" { result = a.Val ok = true break } } return} -- You received this message because you are subscribed to the Google Groups "golang-nuts" group. To unsubscribe from this group and stop receiving emails from it, send an email to golang-nuts+unsubscr...@googlegroups.com. For more options, visit https://groups.google.com/d/optout.