// Command crawl is a solution for the Go tour web-crawler exercise:
// crawl pages in parallel, fetching each URL at most once.
package main

import (
	"fmt"
	"strconv"
	"strings"
	"sync"
)

// Empty is a zero-width placeholder used as the set-value type of
// AlreadyFetched.
type Empty struct{}

// Fetcher fetches a page body and the URLs linked from that page.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// wg tracks every outstanding Crawl goroutine so main can wait for all
// of them to finish.
var wg sync.WaitGroup

// AlreadyFetched is the set of URLs already claimed for fetching.
// It must only be read or written while holding mu.
var AlreadyFetched map[string]*Empty

// mu guards AlreadyFetched.
var mu sync.Mutex

// index identifies a Crawl invocation for log output: its string form is
// the chain of child numbers from the root, padded with '_' so every id
// prints at roughly maxDepth characters.
type index struct {
	maxDepth   int
	parentPref string
	funcNum    int
}

// String renders the id as parent prefix + own number + '_' padding.
func (i index) String() string {
	s := i.parentPref + strconv.Itoa(i.funcNum)
	// Clamp the pad width: strings.Repeat panics on a negative count,
	// which the original (maxDepth - len(parentPref) - 1) could hit once
	// the prefix plus a multi-digit funcNum outgrows maxDepth.
	if pad := i.maxDepth - len(s); pad > 0 {
		return s + strings.Repeat("_", pad)
	}
	return s
}

// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth. Pages are fetched in parallel and each URL at most
// once. The caller must wg.Add(1) before each (go) Crawl call.
func Crawl(id index, url string, depth int, fetcher Fetcher) {
	defer wg.Done()

	if depth <= 0 {
		return
	}

	fmt.Printf("Crawl() %q: started for %q\n", id, url)

	// Check-and-claim must be one atomic step under mu: the original code
	// read the map before taking the lock, which is both a data race and
	// a TOCTOU window letting two goroutines fetch the same URL.
	mu.Lock()
	if _, ok := AlreadyFetched[url]; ok {
		mu.Unlock()
		fmt.Printf("Crawl() %q: Already fetched %q\n", id, url)
		return
	}
	fmt.Printf("Crawl() %q: FETCHING %q\n", id, url)
	AlreadyFetched[url] = &Empty{}
	mu.Unlock()

	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("Crawl() %q: fetched %s %q\n", id, url, body)

	childId := index{
		maxDepth:   id.maxDepth,
		parentPref: id.parentPref + strconv.Itoa(id.funcNum),
	}
	for _, u := range urls {
		wg.Add(1)
		// Arguments are evaluated at the go statement, so each child
		// captures its own funcNum before the increment below.
		go Crawl(childId, u, depth-1, fetcher)
		childId.funcNum++
	}
}

func main() {
	AlreadyFetched = make(map[string]*Empty)
	wg.Add(1)
	depth := 4
	id := index{maxDepth: depth}
	Crawl(id, "https://golang.org/", depth, fetcher)
	wg.Wait()
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body and outgoing links.
type fakeResult struct {
	body string
	urls []string
}

// Fetch returns the canned body and links for url, or an error when the
// URL is not in the map.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}