You must remember where you've been. For example, you might: a. convert each candidate URL to a canonical form (absolute path)
b. look for the canonical URL in a map before visiting. If it is not there, insert it and visit; if it is there, do nothing. This is enough. On Sun, Sep 24, 2017 at 4:05 AM, <gdz...@gmail.com> wrote: > Hi I am learning Golang concurrency and trying to build a simple Website > crawler. I managed to crawl all the links of the pages of any depth of > website. But I still have one problem to tackle: how to avoid crawling > visited links that are previously crawled? > > Here is my code. Hope you guys can shed some light. Thank you in advance. > > package main > import ( > "fmt" > "log" > "net/http" > "os" > "strings" > > "golang.org/x/net/html") > > func main() { > if len(os.Args) != 2 { > fmt.Println("Usage: crawl [URL].") > } > > url := os.Args[1] > if !strings.HasPrefix(url, "http://") { > url = "http://" + url > } > > for link := range newCrawl(url, 1) { > fmt.Println(link) > }} > > func newCrawl(url string, num int) chan string { > ch := make(chan string, 20) > > go func() { > crawl(url, 1, ch) > close(ch) > }() > > return ch} > > func crawl(url string, n int, ch chan string) { > if n < 1 { > return > } > resp, err := http.Get(url) > if err != nil { > log.Fatalf("Can not reach the site. 
Error = %v\n", err) > os.Exit(1) > } > > b := resp.Body > defer b.Close() > > z := html.NewTokenizer(b) > > nextN := n - 1 > for { > token := z.Next() > > switch token { > case html.ErrorToken: > return > case html.StartTagToken: > current := z.Token() > if current.Data != "a" { > continue > } > result, ok := getHrefTag(current) > if !ok { > continue > } > > hasProto := strings.HasPrefix(result, "http") > if hasProto { > done := make(chan struct{}) > go func() { > crawl(result, nextN, ch) > close(done) > }() > <-done > ch <- result > } > } > }} > > func getHrefTag(token html.Token) (result string, ok bool) { > for _, a := range token.Attr { > if a.Key == "href" { > result = a.Val > ok = true > break > } > } > return} > > -- > You received this message because you are subscribed to the Google Groups > "golang-nuts" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to golang-nuts+unsubscr...@googlegroups.com. > For more options, visit https://groups.google.com/d/optout. > -- Michael T. Jones michael.jo...@gmail.com -- You received this message because you are subscribed to the Google Groups "golang-nuts" group. To unsubscribe from this group and stop receiving emails from it, send an email to golang-nuts+unsubscr...@googlegroups.com. For more options, visit https://groups.google.com/d/optout.