
I found a good broken-link checker for websites. But how can its complete sample be changed to use goroutines? The page is: How To Crawl A Website In Golang. The code dynamically appends the URLs to be searched to the pending slice, but I'm having trouble doing that with goroutines. How can the sample be changed to use them?

package main

import (
    "crypto/tls"
    "fmt"
    "golang.org/x/net/html"
    "io"
    "net/http"
    "net/url"
    "strings"
    "time"
)

var alreadyCrawledList []string
var pending []string
var brokenLinks []string

const localHostWithPort = "localhost:8080"

func IsLinkInPendingQueue(link string) bool {
    for _, x := range pending {
        if x == link {
            return true
        }
    }
    return false
}

func IsLinkAlreadyCrawled(link string) bool {
    for _, x := range alreadyCrawledList {
        if x == link {
            return true
        }
    }
    return false
}

func AddLinkInAlreadyCrawledList(link string) {
    alreadyCrawledList = append(alreadyCrawledList, link)
}

func AddLinkInPendingQueue(link string) {
    pending = append(pending, link)
}

func AddLinkInBrokenLinksQueue(link string) {
    brokenLinks = append(brokenLinks, link)
}

func main() {
    start := time.Now()
    AddLinkInPendingQueue("http://" + localHostWithPort)
    for len(pending) > 0 {
        x := pending[0]
        pending = pending[1:] // dynamically consume the next URL to search
        if err := crawlPage(x); err != nil { // how to run this with a goroutine?
            fmt.Println(err)
        }
    }
    duration := time.Since(start)
    fmt.Println("________________")
    count := 0
    for _, l := range brokenLinks {
        count++
        fmt.Println(count, "Broken. | ", l)
    }
    fmt.Println("Time taken:", duration)
}

func crawlPage(uri string) error {
    if IsLinkAlreadyCrawled(uri) {
        fmt.Println("Already visited: Ignoring uri | ", uri)
        return nil
    }
    transport := &http.Transport{
        TLSClientConfig: &tls.Config{
            InsecureSkipVerify: true,
        },
    }
    client := http.Client{Transport: transport}
    resp, err := client.Get(uri)
    if err != nil {
        fmt.Println("Got error: ", err.Error())
        return err
    }
    defer resp.Body.Close() // close the body on every path, not only on 200
    if resp.StatusCode != http.StatusOK {
        AddLinkInBrokenLinksQueue(uri)
        return fmt.Errorf("got %v instead of 200", resp.StatusCode)
    }
    links := ParseLinks(resp.Body)
    links = ConvertLinksToLocalHost(links)
    for _, link := range links {
        if !InOurDomain(link) {
            continue
        }
        absolute := FixURL(link, uri)
        if !IsLinkAlreadyCrawled(absolute) && !IsLinkInPendingQueue(absolute) && absolute != uri { // don't enqueue a page twice!
            AddLinkInPendingQueue(absolute)
        }
    }
    AddLinkInAlreadyCrawledList(uri)
    return nil
}

func InOurDomain(link string) bool {
    uri, err := url.Parse(link)
    if err != nil {
        return false
    }
    if uri.Scheme == "http" || uri.Scheme == "https" {
        return uri.Host == localHostWithPort
    }
    return true
}

func ConvertLinksToLocalHost(links []string) []string {
    var convertedLinks []string
    for _, link := range links {
        convertedLinks = append(convertedLinks, strings.Replace(link, "leantricks.com", localHostWithPort, 1))
    }
    return convertedLinks
}

func FixURL(href, base string) string {
    uri, err := url.Parse(href)
    if err != nil {
        return ""
    }
    baseURL, err := url.Parse(base)
    if err != nil {
        return ""
    }
    uri = baseURL.ResolveReference(uri)
    return uri.String()
}

func ParseLinks(httpBody io.Reader) []string {
    var links []string
    page := html.NewTokenizer(httpBody)
    for {
        tokenType := page.Next()
        if tokenType == html.ErrorToken {
            return links
        }

        token := page.Token()
        switch tokenType {
        case html.StartTagToken, html.SelfClosingTagToken:
            switch token.DataAtom.String() {
            case "a", "link", "script":
                for _, attr := range token.Attr {
                    if attr.Key == "href" || attr.Key == "src" { // script tags carry src, not href
                        links = append(links, attr.Val)
                    }
                }
            }
        }
    }
}

I suggest you take a look at the [example](https://github.com/adonovan/gopl.io/blob/master/ch8/crawl3/findlinks.go) of a concurrent web crawler from the well-known book. –

Answer


You could call crawlPage() concurrently and guard the alreadyCrawledList, pending, and brokenLinks variables with a mutex (not very performant, though). On the other hand, the code would need quite a few changes to get better performance.
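A minimal sketch of that idea, assuming the crawl logic from the question: each link check runs in its own goroutine tracked by a sync.WaitGroup, and a sync.Mutex guards the shared state. markVisited and checkLink are illustrative names, not functions from the original code.

package main

import (
    "fmt"
    "net/http"
    "sync"
)

var (
    mu      sync.Mutex // guards visited and broken
    visited = make(map[string]bool)
    broken  []string
)

// markVisited records uri and reports whether it was new, so two
// goroutines never crawl the same page twice.
func markVisited(uri string) bool {
    mu.Lock()
    defer mu.Unlock()
    if visited[uri] {
        return false
    }
    visited[uri] = true
    return true
}

// checkLink fetches uri in its own goroutine and records it as
// broken on an error or a non-200 response.
func checkLink(uri string, wg *sync.WaitGroup) {
    defer wg.Done()
    if !markVisited(uri) {
        return
    }
    resp, err := http.Get(uri)
    if err != nil {
        mu.Lock()
        broken = append(broken, uri)
        mu.Unlock()
        return
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        mu.Lock()
        broken = append(broken, uri)
        mu.Unlock()
    }
    // A real crawler would parse resp.Body here (like ParseLinks in
    // the question) and spawn a goroutine per discovered link:
    //   wg.Add(1)
    //   go checkLink(link, wg)
}

func main() {
    var wg sync.WaitGroup
    wg.Add(1)
    go checkLink("http://localhost:8080", &wg)
    wg.Wait()
    for i, l := range broken {
        fmt.Println(i+1, "Broken. |", l)
    }
}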

I did a quick check with 4 links and it seemed to take about half the time. I made a sample with a simple HTTP server; it's here: https://github.com/maniankara/stackoverflow/blob/master/golang-web-crawler-with-goroutines/crawl.go

Thanks, - Anoop


This is not what I want, because the broken links need to be found by searching; they are not results known in advance. @Maniankara – user3373877


@user3373877, can you explain? From the link above you can see that I did test broken links: https://github.com/maniankara/stackoverflow/blob/master/golang-web-crawler-with-goroutines/crawl.go#L210. Also, I have not modified the functions from here: http://leantricks.com/code/how-to-crawl-a-website-in-golang/ – Maniankara


https://github.com/megaserg/validation-crawler/blob/master/crawler.go is what I wanted. It uses goroutines and channels. – user3373877
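For reference, a minimal sketch of that goroutine-and-channel pattern (modeled on the gopl.io crawl examples rather than on validation-crawler itself): a worklist channel carries batches of URLs, a counter tracks outstanding batches, and extractLinks is a hypothetical stand-in for the question's ParseLinks/FixURL logic.

package main

import (
    "fmt"
    "net/http"
)

// extractLinks fetches uri, reports it if broken, and returns any
// further links found on the page.
func extractLinks(uri string) []string {
    resp, err := http.Get(uri)
    if err != nil {
        fmt.Println("Broken. |", uri)
        return nil
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        fmt.Println("Broken. |", uri)
        return nil
    }
    return nil // a real crawler would tokenize resp.Body here
}

func main() {
    worklist := make(chan []string) // batches of URLs to crawl
    n := 1                          // number of pending sends to worklist
    go func() { worklist <- []string{"http://localhost:8080"} }()

    // seen is only touched by this goroutine, so no mutex is needed;
    // all sharing happens through the channel.
    seen := make(map[string]bool)
    for ; n > 0; n-- {
        for _, link := range <-worklist {
            if seen[link] {
                continue
            }
            seen[link] = true
            n++
            go func(link string) {
                worklist <- extractLinks(link)
            }(link)
        }
    }
}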