new(tour): Add solution for crawler exercise.
author sgf <sgf.dma@gmail.com>
Mon, 23 May 2022 15:24:37 +0000 (18:24 +0300)
committer sgf <sgf.dma@gmail.com>
Mon, 23 May 2022 15:24:37 +0000 (18:24 +0300)
crawl/crawl [new file with mode: 0755]
crawl/crawl.go [new file with mode: 0644]
crawl/go.mod [new file with mode: 0644]

diff --git a/crawl/crawl b/crawl/crawl
new file mode 100755 (executable)
index 0000000..e6caca3
Binary files /dev/null and b/crawl/crawl differ
diff --git a/crawl/crawl.go b/crawl/crawl.go
new file mode 100644 (file)
index 0000000..7b8512a
--- /dev/null
@@ -0,0 +1,136 @@
+package main
+
+import (
+        "fmt"
+        "strconv"
+        "strings"
+        "sync"
+)
+
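+// Empty carries no data; AlreadyFetched stores *Empty values purely to
+// use the map as a set of URLs.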
+type Empty struct{}
+
+type Fetcher interface {
+        // Fetch returns the body of URL and
+        // a slice of URLs found on that page.
+        Fetch(url string) (body string, urls []string, err error)
+}
+
+// wg tracks every outstanding Crawl invocation so main can wait for
+// all of them to finish.
+var wg sync.WaitGroup
+
+// AlreadyFetched records each URL that some Crawl call has claimed.
+var AlreadyFetched map[string]*Empty
+
+// mu guards AlreadyFetched against concurrent access.
+var mu sync.Mutex
+
+// index identifies a Crawl invocation for logging: parentPref is the
+// chain of ancestor IDs, funcNum is this call's number among its
+// siblings, and maxDepth fixes the printed width.
+type index struct {
+        maxDepth   int
+        parentPref string
+        funcNum    int
+}
+
+func (i index) String() string {
+        return i.parentPref +
+                strconv.Itoa(i.funcNum) +
+                strings.Repeat("_", i.maxDepth - len(i.parentPref) - 1)
+}
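+
+// With maxDepth 4, for example, an index with parentPref "0" and
+// funcNum 2 yields the ID "02__"; the underscores pad every ID to a
+// fixed width so the log lines align.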
+
+// Crawl uses fetcher to recursively crawl
+// pages starting with url, to a maximum of depth.
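+// The caller must wg.Add(1) before every invocation, including the
+// initial one in main; the deferred wg.Done below releases that slot.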
+func Crawl(id index, url string, depth int, fetcher Fetcher) {
+        // Fetch URLs in parallel, and never fetch the same URL twice.
+        defer wg.Done()
+
+        if depth <= 0 {
+                return
+        }
+
+        fmt.Printf("Crawl() %q: started for %q\n", id, url)
+        // Check and claim the URL under one lock: reading the map outside
+        // the mutex races with the write below, and two goroutines could
+        // otherwise both decide to fetch the same URL.
+        mu.Lock()
+        if _, ok := AlreadyFetched[url]; ok {
+                mu.Unlock()
+                fmt.Printf("Crawl() %q: already fetched %q\n", id, url)
+                return
+        }
+        AlreadyFetched[url] = &Empty{}
+        fmt.Printf("Crawl() %q: FETCHING %q\n", id, url)
+        mu.Unlock()
+        body, urls, err := fetcher.Fetch(url)
+        if err != nil {
+                fmt.Println(err)
+                return
+        }
+        fmt.Printf("Crawl() %q: fetched %s %q\n", id, url, body)
+
+        childId := index{
+                maxDepth:   id.maxDepth,
+                parentPref: id.parentPref + strconv.Itoa(id.funcNum),
+        }
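+        // Children are numbered via funcNum; childId is copied when each
+        // goroutine starts, and wg.Add runs before the spawn so wg.Wait
+        // in main cannot return early.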
+        for _, u := range urls {
+                wg.Add(1)
+                go Crawl(childId, u, depth-1, fetcher)
+                childId.funcNum++
+        }
+}
+
+func main() {
+        AlreadyFetched = make(map[string]*Empty)
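+        // Reserve a WaitGroup slot for the initial, synchronous Crawl
+        // call; its deferred wg.Done releases the slot.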
+        wg.Add(1)
+        depth := 4
+        id := index{maxDepth: depth}
+        Crawl(id, "https://golang.org/", depth, fetcher)
+        wg.Wait()
+}
+
+// fakeFetcher is a Fetcher that returns canned results.
+type fakeFetcher map[string]*fakeResult
+
+type fakeResult struct {
+        body string
+        urls []string
+}
+
+func (f fakeFetcher) Fetch(url string) (string, []string, error) {
+        if res, ok := f[url]; ok {
+                return res.body, res.urls, nil
+        }
+        return "", nil, fmt.Errorf("not found: %s", url)
+}
+
+// fetcher is a populated fakeFetcher.
+var fetcher = fakeFetcher{
+        "https://golang.org/": &fakeResult{
+                "The Go Programming Language",
+                []string{
+                        "https://golang.org/pkg/",
+                        "https://golang.org/cmd/",
+                },
+        },
+        "https://golang.org/pkg/": &fakeResult{
+                "Packages",
+                []string{
+                        "https://golang.org/",
+                        "https://golang.org/cmd/",
+                        "https://golang.org/pkg/fmt/",
+                        "https://golang.org/pkg/os/",
+                },
+        },
+        "https://golang.org/pkg/fmt/": &fakeResult{
+                "Package fmt",
+                []string{
+                        "https://golang.org/",
+                        "https://golang.org/pkg/",
+                },
+        },
+        "https://golang.org/pkg/os/": &fakeResult{
+                "Package os",
+                []string{
+                        "https://golang.org/",
+                        "https://golang.org/pkg/",
+                },
+        },
+}
diff --git a/crawl/go.mod b/crawl/go.mod
new file mode 100644 (file)
index 0000000..d85ba8f
--- /dev/null
@@ -0,0 +1,3 @@
+module crawl
+
+go 1.17