Created
September 26, 2017 07:57
-
-
Save yogesh-desai/bf0d203afe2d151bbefbcc98b427a09c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"golang.org/x/net/html" | |
"net/http" | |
"net/url" | |
"runtime" | |
"strings" | |
"context" | |
"bytes" | |
"flag" | |
"sync" | |
"time" | |
"fmt" | |
"log" | |
"os" | |
// "github.com/PuerkitoBio/fetchbot" | |
// "github.com/PuerkitoBio/goquery" | |
cdp "github.com/knq/chromedp" | |
cdpr "github.com/knq/chromedp/runner" | |
) | |
var ( | |
baseurl string | |
// Command-line flags | |
seed = flag.String("seed", "https://jeremywho.com", "seed URL") | |
//seed = flag.String("seed", "https://www.tokopedia.com/", "seed URL") | |
cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time") | |
cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL") | |
stopAfter = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time") | |
stopAtURL = flag.String("stopat", "", "automatically stop the fetchbot at a given URL") | |
memStats = flag.Duration("memstats", 5 * time.Minute, "display memory statistics at a given interval") | |
) | |
func DoExtract(chanURL chan string){ | |
time.Sleep(2 * time.Second) | |
for{ | |
url := <- chanURL | |
//var u string | |
// Append visited urls | |
// u := fmt.Sprintf("%v", url) | |
// u = g(url) | |
//urls = append(urls, url) | |
DoCDP(url) | |
} | |
} | |
func main() { | |
flag.Parse() | |
u, err := url.Parse(*seed) | |
check(err, "Error in parsing the seed url") | |
log.Println("The URL: ", u) | |
baseurl = u.String() | |
urlProcessor := make(chan string) | |
done := make(chan bool) | |
go processURL(urlProcessor, done) | |
go DoExtract(urlProcessor) | |
urlProcessor <- u.String() //fmt.Sprint(u) //"https://jeremywho.com" | |
// First mem stat print must be right after creating the fetchbot | |
if *memStats > 0 { | |
// Print starting stats | |
printMemStats() | |
// Run at regular intervals | |
runMemStats(*memStats) | |
// On exit, print ending stats after a GC | |
defer func() { | |
runtime.GC() | |
printMemStats() | |
}() | |
} | |
// if a stop or cancel is requested after some duration, launch the goroutine | |
// that will stop or cancel. | |
if *stopAfter > 0 || *cancelAfter > 0 { | |
after := *stopAfter | |
stopFunc := true | |
if *cancelAfter != 0 { | |
after = *cancelAfter | |
stopFunc = true | |
} | |
go func() { | |
c := time.After(after) | |
<-c | |
fmt.Println("The given timeout has occured. Exiting...") | |
done <- stopFunc | |
}() | |
} | |
<-done | |
fmt.Println("Done") | |
/* if _, err := os.Stat(pwd() + "/TokoProductDetails.csv"); !os.IsNotExist(err) { | |
log.Println("The output TSV file location: ", pwd() + "/TokoProductDetails.csv") | |
} else { | |
log.Println("Required data is not present in any of processed URLs.") | |
} | |
// Write the processed URLs to a file | |
fmt.Println("Total no. of URLs processed: ", len(urls), "\nThe Processed URLs are in the file: ", WriteProcessedUrlsToFile(urls)) | |
*/ | |
} | |
func runMemStats(tick time.Duration) { | |
var mu sync.Mutex | |
go func() { | |
c := time.Tick(tick) | |
for _ = range c { | |
mu.Lock() | |
printMemStats() | |
mu.Unlock() | |
} | |
}() | |
} | |
func printMemStats() { | |
var mem runtime.MemStats | |
runtime.ReadMemStats(&mem) | |
buf := bytes.NewBuffer(nil) | |
buf.WriteString(strings.Repeat("=", 72) + "\n") | |
buf.WriteString("Memory Profile:\n") | |
buf.WriteString(fmt.Sprintf("\tAlloc: %d Kb\n", mem.Alloc/1024)) | |
buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d Kb\n", mem.TotalAlloc/1024)) | |
buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC)) | |
buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine())) | |
buf.WriteString(strings.Repeat("=", 72)) | |
log.Println(buf.String()) | |
} | |
// processURL checks the url is already visited or not. | |
//If not visited already, then set map = true and explore page for more links. | |
func processURL(urlProcessor chan string, done chan bool) { | |
visited := make(map[string]bool) | |
for { | |
select { | |
case url := <-urlProcessor: | |
if _, ok := visited[url]; ok { | |
continue | |
} else { | |
visited[url] = true | |
go exploreURL(url, urlProcessor) | |
} | |
case <-time.After(15 * time.Second): | |
fmt.Printf("Explored %d pages\n", len(visited)) | |
done <- truex | |
} | |
} | |
} | |
// exploreURL does HTTP GET and tokenize the response | |
func exploreURL(url string, urlProcessor chan string) { | |
fmt.Printf("Visiting %s.\n", url) | |
resp, err := http.Get(url) | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
defer resp.Body.Close() | |
z := html.NewTokenizer(resp.Body) | |
for { | |
tt := z.Next() | |
if tt == html.ErrorToken { | |
return | |
} | |
if tt == html.StartTagToken { | |
t := z.Token() | |
if t.Data == "a" { | |
for _, a := range t.Attr { | |
if a.Key == "href" { | |
// if link is within jeremywho.com | |
if strings.HasPrefix(a.Val, baseurl) { | |
urlProcessor <- a.Val | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
//================================================================================ | |
//================================================================================ | |
// getProductInfo extract the required information by using chromedp package | |
func getProductInfo(urlstr, sel string, res *[]byte, pId, pUrl, url *string) cdp.Tasks { | |
return cdp.Tasks{ | |
cdp.Navigate(urlstr), | |
cdp.Sleep(5 * time.Second), | |
cdp.WaitVisible(sel, cdp.ByID), | |
cdp.EvaluateAsDevTools("document.getElementById('product-id').value;", pId), | |
cdp.EvaluateAsDevTools("document.getElementById('product-url').value;", pUrl), | |
cdp.EvaluateAsDevTools("document.getElementById('webyclip-widget-3').contentWindow.document.body.outerHTML;", res), | |
} | |
} | |
// isPresent checks the existance of webyclip-widget-3 element. | |
func isPresent(url string, res *[]byte) cdp.Tasks { | |
return cdp.Tasks{ | |
cdp.Navigate(url), | |
cdp.Sleep(15 * time.Second), | |
// cdp.EvaluateAsDevTools("document.getElementById('webyclip-thumbnails').childElementCount;", res), | |
cdp.EvaluateAsDevTools("if (document.getElementById('webyclip-thumbnails')) {document.getElementById('webyclip-thumbnails').childElementCount;} else {console.log('0')}", res), | |
} | |
} | |
//================================================================================ | |
// getVideoLinks returns the Youtube viedo links present in the iframe webyclip-widget-3. | |
// returns all the links which are comma seperated. | |
func getVideoLinks(buf []byte) string { | |
var videoLinks string | |
//Convert byte buffer to String | |
innerDoc := string(buf[:]) | |
tmp := strings.TrimSpace(innerDoc) | |
//Find the videolinks and create one final string | |
tmpStr := strings.Fields(tmp) | |
matchStr := "i.ytimg.com/vi/" | |
yUrl := "https://www.youtube.com/watch?v=" | |
for _, v := range tmpStr { | |
//log.Println("Contains: ", strings.Contains(v, "i.ytimg.com")) | |
if strings.Contains(v, matchStr) { | |
vv := strings.TrimPrefix(v, "src=\\\"//i.ytimg.com/vi/") | |
id := strings.Split(vv, "/") | |
//log.Println("https://www.youtube.com/watch?v=" + id[0]) | |
//log.Println("id: \tlen:\n",len(id), id) | |
youtubeLink := yUrl + id[0] | |
videoLinks += youtubeLink + "," | |
} | |
} | |
// return the video links | |
return videoLinks[:len(videoLinks)-1] | |
} | |
//======================================================================================== | |
func WriteToFile(filePath, record string) { | |
f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND, 0644) | |
if err != nil { | |
// log.Println("File open failed for writing failure counts") | |
// return | |
log.Println("File doesn't exists. File will be created with the headers before adding data.") | |
// If file does not exists then create it with the header and write records. | |
file, err1 := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644) | |
if err1 != nil { | |
log.Println("File Open operation failed.") | |
return | |
} | |
defer file.Close() | |
header := fmt.Sprint("Product_ID" + "\t" + "Product_URL" + "\t" + "Youtube_Video_URLs") | |
file.WriteString(fmt.Sprintf("%s\n", header)) | |
file.WriteString(fmt.Sprintf("%s\n", record)) | |
return | |
} | |
defer f.Close() | |
log.Println("File exists Already. Adding the data for url.") | |
f.WriteString(fmt.Sprintf("%s\n", record)) | |
} | |
//================================================================================ | |
func WriteProcessedUrlsToFile(urls []string) string{ | |
filePath := pwd() + "/ProcessedURLs.csv" | |
f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644) | |
check(err, "Error in file Open operation") | |
defer f.Close() | |
for _, url := range urls { | |
f.WriteString(fmt.Sprintf("%s\n", url)) | |
} | |
return filePath | |
} | |
//================================================================================ | |
// check checks the error, panics if not nil | |
func check(err error, str string){ | |
if err != nil { log.Fatalln(err, str) } | |
} | |
// pwd returns the current working directory through which the binary is invoked. | |
// used to save the csv file. | |
func pwd() string { | |
pwd, err := os.Getwd() | |
check(err, "Error in getting current workig dir.") | |
return pwd | |
} | |
//================================================================================ | |
func DoCDP(url string) { | |
// create context | |
ctxt, cancel := context.WithCancel(context.Background()) | |
defer cancel() | |
// create chrome instancefunc(map[string]interface{}) error | |
// c, err := cdp.New(ctxt, cdp.WithLog(log.Printf), cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1"))) | |
c, err := cdp.New(ctxt, cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1"))) | |
check(err, "Error in creating new cdp instance") | |
// run task list | |
var buf, buf1 []byte | |
var pId, pUrl string | |
// Check for the existence of the webyclip-widget-3 on the page | |
err = c.Run(ctxt, isPresent(url, &buf1)) | |
check(err, "Error in Run method of cdp") | |
if (len(buf1) == 0) || (bytes.EqualFold([]byte("0"), buf1)){ | |
log.Println("No webyclip-widget-3 on page:\n ", url) | |
// shutdown chrome | |
err = c.Shutdown(ctxt) | |
check(err, "Error in shutting down chrome") | |
// wait for chrome to finish | |
err = c.Wait() | |
check(err, "Error in wait to shutdown chrome") | |
return | |
//os.Exit(0) | |
} else { | |
//fmt.Println("In ELSE The status is: \t Len: ", len(buf), "\t", string(buf), " \t", buf) | |
// Exit the code if "webyclip-widget-3" is not present. | |
err = c.Run(ctxt, getProductInfo(url, `#webyclip-widget-3`, &buf, &pId, &pUrl, &url)) | |
check(err, "Error in Run method of cdp") | |
// shutdown chrome | |
err = c.Shutdown(ctxt) | |
check(err, "Error in shutting down chrome") | |
// wait for chrome to finish | |
err = c.Wait() | |
check(err, "Error in wait to shutdown chrome") | |
pLinks := getVideoLinks(buf) | |
record := fmt.Sprint(pId + "\t" + pUrl + "\t" + pLinks) | |
filePath := pwd() + "/TokoProductDetails.csv" | |
WriteToFile(filePath, record) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment