Created
April 4, 2019 17:45
-
-
Save nehayward/f3619366898b212f4e30f3d4f64fce2d to your computer and use it in GitHub Desktop.
Photo Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"encoding/csv" | |
"fmt" | |
"io" | |
"log" | |
"os" | |
"strings" | |
"github.com/gocolly/colly" | |
) | |
// company is one record of the input CSV: a display name and the address of
// the site to scrape.
type company struct {
	name, url string // first and second CSV column, respectively
}
func main() { | |
if len(os.Args) < 2 { | |
fmt.Println("Please enter ./imageDownloader companies.csv") | |
return | |
} | |
companiesCSV, _ := os.Open(os.Args[1]) | |
reader := csv.NewReader(bufio.NewReader(companiesCSV)) | |
var companies []company | |
for { | |
line, error := reader.Read() | |
if error == io.EOF { | |
break | |
} else if error != nil { | |
log.Fatal(error) | |
} | |
companies = append(companies, company{ | |
name: line[0], | |
url: line[1], | |
}) | |
} | |
// fmt.Println(companies[8].url) | |
if !strings.HasPrefix(companies[8].url, "http") { | |
companies[8].url = "http://" + companies[8].url | |
} | |
os.MkdirAll("Companies", os.ModePerm) | |
findImage(companies[8].url) | |
// findImage("https://www.golove.org/staff") | |
// downloadImage(companies) | |
} | |
func downloadImage(companies []company) { | |
for i, company := range companies { | |
os.MkdirAll(fmt.Sprintf("Companies/%d", i+2), os.ModePerm) | |
findImage(company.url) | |
} | |
} | |
func findImage(url string) { | |
c := colly.NewCollector() | |
// Find and visit all links | |
c.OnHTML("a[href]", func(e *colly.HTMLElement) { | |
// fmt.Println("href") | |
// link := e.Attr("style") | |
// image2 := e.Text | |
// // Print link | |
// fmt.Println(image2) | |
// fmt.Printf("%s\n", link) | |
// Visit link found on page | |
// Only those links are visited which are in AllowedDomains | |
// c.Visit(e.Request.AbsoluteURL(link)) | |
}) | |
// Find and visit all links | |
c.OnHTML("img[src]", func(e *colly.HTMLElement) { | |
// fmt.Println("HERE") | |
fmt.Println(e.Attr("src")) | |
c.Visit(e.Attr("src")) | |
// Print link | |
// Visit link found on page | |
// Only those links are visited which are in AllowedDomains | |
// c.Visit(e.Request.AbsoluteURL(link)) | |
}) | |
// c.OnHTML("a[href]", func(e *colly.HTMLElement) { | |
// // fmt.Printf("%s\n", ) | |
// // Visit link found on page | |
// // Only those links are visited which are in AllowedDomains | |
// // c.Visit(e.Request.AbsoluteURL(link)) | |
// }) | |
// Set error handler | |
c.OnError(func(r *colly.Response, e error) { | |
log.Println("error:", e, r.Request.URL, string(r.Body)) | |
}) | |
c.OnRequest(func(r *colly.Request) { | |
fmt.Println("Visiting", r.URL) | |
}) | |
c.OnResponse(func(r *colly.Response) { | |
if strings.Index(r.Headers.Get("Content-Type"), "image") > -1 { | |
// r.Save(outputDir + r.FileName()) | |
// fmt.Println(r.FileName()) | |
// r.Save("Desktop/" + r.FileName()) | |
// e := r.Save("/Desktop/" + r.FileName) | |
// err := ioutil.WriteFile("testdata/hello", r.Body, 0644) | |
err := r.Save("Companies/" + r.FileName()) | |
if err != nil { | |
log.Fatal(err) | |
} | |
return | |
} | |
// handle further response types... | |
}) | |
c.Visit(url) | |
} | |
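// TODO: add a helper that prefixes scheme-less URLs with "http://"; the PHP snippet below is the reference to port.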
// function addhttp(url string) { | |
// if (!preg_match("~^(?:f|ht)tps?://~i", $url)) { | |
// $url = "http://" . $url; | |
// } | |
// return $url; | |
// } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment