Skip to content

Instantly share code, notes, and snippets.

@nehayward
Created April 4, 2019 17:45
Show Gist options
  • Save nehayward/f3619366898b212f4e30f3d4f64fce2d to your computer and use it in GitHub Desktop.
Save nehayward/f3619366898b212f4e30f3d4f64fce2d to your computer and use it in GitHub Desktop.
Photo Scraper
package main
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"log"
"os"
"strings"
"github.com/gocolly/colly"
)
// company is a single record from the input CSV: the first column is stored
// as name and the second column as url.
type company struct {
	name, url string
}
// main loads the company list from the CSV file named by the first
// command-line argument and scrapes images for one hard-coded row of it into
// the ./Companies directory.
func main() {
	if len(os.Args) < 2 {
		fmt.Println("Please enter ./imageDownloader companies.csv")
		return
	}

	companies, err := loadCompanies(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}

	// Only a single row is scraped for now. Guard the index so that a short
	// CSV produces a clear message instead of an index-out-of-range panic.
	const targetRow = 8
	if len(companies) <= targetRow {
		log.Fatalf("%s has %d rows, need at least %d", os.Args[1], len(companies), targetRow+1)
	}

	// The CSV may list bare host names; prefix a scheme so the URL can be fetched.
	target := companies[targetRow].url
	if !strings.HasPrefix(target, "http") {
		target = "http://" + target
	}

	if err := os.MkdirAll("Companies", os.ModePerm); err != nil {
		log.Fatal(err)
	}
	findImage(target)
	// findImage("https://www.golove.org/staff")
	// downloadImage(companies)
}

// loadCompanies parses the CSV file at path into a slice of company values,
// taking the name from the first column and the URL from the second. It
// returns an error if the file cannot be opened, cannot be parsed, or
// contains a row with fewer than two columns.
func loadCompanies(path string) ([]company, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	reader := csv.NewReader(bufio.NewReader(f))
	var companies []company
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("reading %s: %v", path, err)
		}
		if len(record) < 2 {
			return nil, fmt.Errorf("reading %s: row %d has %d column(s), want name and url", path, len(companies)+1, len(record))
		}
		companies = append(companies, company{
			name: record[0],
			url:  record[1],
		})
	}
	return companies, nil
}
// downloadImage scrapes images for every entry in companies. For each entry it
// creates the directory Companies/<i+2> (i being the slice index) and then
// runs findImage on the entry's URL.
//
// NOTE(review): the i+2 offset presumably maps the slice index to a
// spreadsheet row number (1-based, after a header row) — confirm.
// NOTE(review): findImage currently saves every file directly under
// "Companies/", so the per-company directory created here stays empty.
func downloadImage(companies []company) {
	for i, c := range companies {
		dir := fmt.Sprintf("Companies/%d", i+2)
		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
			// Keep going with the remaining companies; one bad directory
			// should not abort the whole batch.
			log.Printf("skipping %s: %v", c.url, err)
			continue
		}

		// Same normalization main applies: bare host names need a scheme
		// before they can be fetched.
		target := c.url
		if !strings.HasPrefix(target, "http") {
			target = "http://" + target
		}
		findImage(target)
	}
}
// findImage fetches the page at url, follows the src of every <img> element on
// it, and saves each response whose Content-Type mentions "image" into the
// Companies directory under the file name colly derives for the response.
//
// Request progress is printed to stdout and fetch errors are logged. A failure
// to write an image to disk terminates the program via log.Fatal.
func findImage(url string) {
	c := colly.NewCollector()

	// Queue every image referenced by the page.
	c.OnHTML("img[src]", func(e *colly.HTMLElement) {
		src := e.Attr("src")
		fmt.Println(src)

		// src is frequently relative ("/img/logo.png"); resolve it against
		// the URL of the page it was found on before visiting it.
		abs := e.Request.AbsoluteURL(src)
		if abs == "" {
			return
		}
		// The returned error is deliberately dropped: fetch failures are
		// expected to reach the OnError handler below, and the same image
		// appearing twice on a page is not worth reporting.
		_ = c.Visit(abs)
	})

	// Set error handler
	c.OnError(func(r *colly.Response, e error) {
		log.Println("error:", e, r.Request.URL, string(r.Body))
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		if !strings.Contains(r.Headers.Get("Content-Type"), "image") {
			// Non-image responses (the HTML page itself) need no handling
			// here; the OnHTML callback above processes them.
			return
		}
		if err := r.Save("Companies/" + r.FileName()); err != nil {
			log.Fatal(err)
		}
	})

	// Report failures that happen before any request is made (for example an
	// empty URL); without this they would pass silently.
	if err := c.Visit(url); err != nil {
		log.Println("visit failed:", url, err)
	}
}
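// TODO: add a helper that prefixes scheme-less URLs with "http://".
// PHP-style sketch of the intended logic (not valid Go):
// function addhttp(url string) {
// if (!preg_match("~^(?:f|ht)tps?://~i", $url)) {
// $url = "http://" . $url;
// }
// return $url;
// }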
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment