// A small crawler built on gocolly/colly: it reads a CSV of ranked domains,
// visits each one over HTTP, and appends one JSON line per site recording the
// meta-generator tag and whether the Next.js markers (#__next, __NEXT_DATA__)
// are present.
package main

import (
    "bufio"
    "encoding/json"
    "flag"
    "fmt"
    "os"
    "strings"

    "github.com/gocolly/colly"
    "github.com/gocolly/colly/queue"
)

// Result is one output record, serialized as a JSON line.
type Result struct {
    URL           string `json:"url"`
    Generator     string `json:"generator"`
    NextIDFound   bool   `json:"nextIDFound"`
    NextDataFound bool   `json:"nextDataFound"`
    Error         bool   `json:"error"`
    Status        int    `json:"status"`
}
func main() {
    urlsFileName := flag.String("urls", "urls.csv", "CSV of the top-10M domain list (domain in the second comma-separated field)")
    resFileName := flag.String("result", "result.jl", "Name of the JSON-lines result file")
    flag.Parse()

    resFile, resFileErr := os.OpenFile(*resFileName, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600)
    if resFileErr != nil {
        panic(resFileErr)
    }
    defer resFile.Close()

    // Instantiate the default collector. Depth limiting and async mode are
    // left disabled; concurrency comes from the queue's consumer threads.
    c := colly.NewCollector(
        //colly.MaxDepth(2),
        //colly.Async(true),
    )
    //c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

    // Create a request queue with 20 consumer threads backed by a large
    // in-memory store.
    q, _ := queue.New(
        20, // number of consumer threads
        &queue.InMemoryQueueStorage{MaxSize: 100000000000},
    )
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("visiting", r.URL)
    })
    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with status:", r.StatusCode, "\nError:", err.Error())
        retVal := Result{URL: r.Request.URL.String(), Status: r.StatusCode, Error: true}
        jsonstr, _ := json.Marshal(retVal)
        if _, err := resFile.WriteString(string(jsonstr) + "\n"); err != nil {
            panic(err)
        }
    })
c.OnHTML("html", func(e *colly.HTMLElement) { | |
retVal := Result{URL: e.Request.URL.String(), Error: false, Status: 200} | |
generator := e.ChildAttr(`meta[name="generator"]`, "content") | |
if generator != "" { | |
retVal.Generator = generator | |
} | |
nextID := e.ChildAttr(`#__next`, `id`) | |
if nextID != "" { | |
retVal.NextIDFound = true | |
} | |
nextdata := e.ChildAttr(`#__NEXT_DATA__`, `id`) | |
if nextdata != "" { | |
retVal.NextDataFound = true | |
} | |
jsonstr, _ := json.Marshal(retVal) | |
if _, err := resFile.WriteString(string(jsonstr) + "\n"); err != nil { | |
panic(err) | |
} | |
//link := e.Attr("href") | |
// Print link | |
//fmt.Printf("Link found: %q -> %s\n", e.Text, link) | |
// Visit link found on page | |
// Only those links are visited which are in AllowedDomains | |
//c.Visit(e.Request.AbsoluteURL(link)) | |
}) | |
    crawlFile, crawlFileErr := os.OpenFile(*urlsFileName, os.O_RDONLY, 0600)
    if crawlFileErr != nil {
        panic(crawlFileErr)
    }
    defer crawlFile.Close()

    // Each input line is expected to carry the domain, possibly quoted, in
    // its second comma-separated field; skip lines that don't.
    scanner := bufio.NewScanner(crawlFile)
    for scanner.Scan() {
        fields := strings.Split(scanner.Text(), ",")
        if len(fields) < 2 {
            continue
        }
        q.AddURL(`http://` + strings.ReplaceAll(fields[1], `"`, ``))
    }

    fmt.Println(`ok, starting up`)
    q.Run(c)
    c.Wait()
}
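Usage sketch (illustrative: the file names and the exact CSV layout here are assumptions; the code only requires the domain to be in the second comma-separated field, optionally quoted):

    go run . -urls top10m.csv -result result.jl

    # assumed input line (rank,"domain" layout):
    1,"example.com"

    # one JSON line per crawled site in result.jl:
    {"url":"http://example.com","generator":"","nextIDFound":false,"nextDataFound":false,"error":false,"status":200}

Since the collector runs synchronously, q.Run(c) blocks until the queue's 20 consumers drain it; the trailing c.Wait() only matters if colly.Async(true) is re-enabled.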