You must remember where you've been. For example, you might:

a. Convert each candidate URL to a canonical form (an absolute URL).

b. Look for the canonical URL in a map before visiting. If it is not there,
insert it and visit; if it is already there, do nothing.

This is enough.


On Sun, Sep 24, 2017 at 4:05 AM, <gdz...@gmail.com> wrote:

> Hi I am learning Golang concurrency and trying to build a simple Website
> crawler. I managed to crawl all the links of the pages of any depth of
> website. But I still have one problem to tackle: how to avoid crawling
> visited links that are previously crawled?
>
> Here is my code. Hope you guys can shed some light. Thank you in advance.
>
> package main
> import (
>     "fmt"
>     "log"
>     "net/http"
>     "os"
>     "strings"
>
>     "golang.org/x/net/html")
>
> func main() {
>     if len(os.Args) != 2 {
>         fmt.Println("Usage: crawl [URL].")
>     }
>
>     url := os.Args[1]
>     if !strings.HasPrefix(url, "http://") {
>         url = "http://" + url
>     }
>
>     for link := range newCrawl(url, 1) {
>         fmt.Println(link)
>     }}
>
> func newCrawl(url string, num int) chan string {
>     ch := make(chan string, 20)
>
>     go func() {
>         crawl(url, 1, ch)
>         close(ch)
>     }()
>
>     return ch}
>
> func crawl(url string, n int, ch chan string) {
>     if n < 1 {
>         return
>     }
>     resp, err := http.Get(url)
>     if err != nil {
>         log.Fatalf("Can not reach the site. Error = %v\n", err)
>         os.Exit(1)
>     }
>
>     b := resp.Body
>     defer b.Close()
>
>     z := html.NewTokenizer(b)
>
>     nextN := n - 1
>     for {
>         token := z.Next()
>
>         switch token {
>         case html.ErrorToken:
>             return
>         case html.StartTagToken:
>             current := z.Token()
>             if current.Data != "a" {
>                 continue
>             }
>             result, ok := getHrefTag(current)
>             if !ok {
>                 continue
>             }
>
>             hasProto := strings.HasPrefix(result, "http")
>             if hasProto {
>                 done := make(chan struct{})
>                 go func() {
>                     crawl(result, nextN, ch)
>                     close(done)
>                 }()
>                 <-done
>                 ch <- result
>             }
>         }
>     }}
>
> func getHrefTag(token html.Token) (result string, ok bool) {
>     for _, a := range token.Attr {
>         if a.Key == "href" {
>             result = a.Val
>             ok = true
>             break
>         }
>     }
>     return}
>
> --
> You received this message because you are subscribed to the Google Groups
> "golang-nuts" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to golang-nuts+unsubscr...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.
>



-- 
Michael T. Jones
michael.jo...@gmail.com

-- 
You received this message because you are subscribed to the Google Groups 
"golang-nuts" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to golang-nuts+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to