Created September 25, 2012 18:55
My first Go program
/*
This "script" scrapes a collection of images from Wikipedia.

It reads a file containing a list of URLs (that were previously scraped
from the index page of the collection). For each of those URLs, it
retrieves the pointed-to page, locates a particular image URL within that
page (using a regex), and then downloads the image.

The image is written in 1MB chunks; progress is reported as each chunk is
written. Downloaded images are named N.jpg, where N corresponds to the
line number in the URL file.

If an image has already been downloaded, it is skipped. A file is
considered already downloaded if it ends with a JPEG EOF marker (FF D9)
AND it has the same byte count as the source image; if only one of these
conditions is true, the program aborts (in that case, delete the
offending file and try again).

If an image has been partially downloaded, the next attempt at downloading
it will resume directly after the already-downloaded bytes by adding a
Range header to the request.

Command line args:

    -inputFile  The file containing the list of URLs, one per line
    -start      Image to start from (line # in list of URLs); if not
                present, start from the first incomplete or missing image
    -outputDir  The directory into which images will be downloaded
TODO:

    - Resume after suspend
    - Retry after network failure
    - Allow downloading of a single file
*/
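
/*
For illustration: resuming is an ordinary HTTP range request. With, say,
1048576 bytes already on disk, the exchange looks roughly like this (the
URL path and byte counts here are made up):

    GET /wikipedia/commons/a/a1/Example.jpg HTTP/1.1
    Range: bytes=1048576-

    HTTP/1.1 206 Partial Content
    Content-Range: bytes 1048576-5242879/5242880
    Content-Length: 4194304

Note that the Content-Length of a 206 response counts only the remaining
bytes, which is why downloadImg checks currentSize+contentLength against
targetSize rather than contentLength alone.
*/
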
package main

import (
	"errors"
	"flag"
	"fmt"
	"image/jpeg"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"
)

const (
	chunkSize    = 1 << 20 // 1MB
	statMsg      = "\rDownloading %d/%d %s (%d/%d %.2f%%)..." // index, total, name, bytes read, total bytes, percent read
	doneMsg      = "\nDownloaded %d/%d %s in %v." // index, total, name, time
	pauseTime    = 2 * time.Second
	imgUrlRegexp = `<div class="fullImageLink".*><a href="(//upload.wikimedia.org/wikipedia/commons/.+/.+/.+\.jpg)">`
)

var (
	start     int
	inputFile string
	outputDir string
)

func init() {
	flag.StringVar(&inputFile, "inputFile", "", "File containing URLs, one per line")
	flag.IntVar(&start, "start", 0, "Image to start at")
	flag.StringVar(&outputDir, "outputDir", "", "Output directory")
}

func main() {
	flag.Parse()
	// Get the list of preview page URLs from the specified file
	content, err := ioutil.ReadFile(inputFile)
	if err != nil {
		die(err, 1)
	}
	// Drop a trailing newline so it doesn't become an empty URL
	if len(content) > 0 && content[len(content)-1] == '\n' {
		content = content[:len(content)-1]
	}
	urls := strings.Split(string(content), "\n")
	numImgs := len(urls)
	start, err := findStart(start, outputDir, numImgs)
	if err != nil {
		die(err, 1)
	}
	urls = urls[start-1:]
	fmt.Printf("Starting from image %d.\n", start)
	for i, pageUrl := range urls {
		startTime := time.Now()
		ordinal := start + i
		fileName := fmt.Sprintf("%d.jpg", ordinal)
		path := filepath.Join(outputDir, fileName)
		page, err := getPreviewPage(pageUrl)
		if err != nil {
			die(err, 2)
		}
		imgUrl, err := extractImgUrl(page)
		if err != nil {
			die(err, 2)
		}
		// Get last part of image URL for stat message
		shortName := strings.Split(imgUrl, "wikipedia/commons")[1]
		// Make HEAD request to get total number of bytes
		headResp, err := http.Head(imgUrl)
		if err != nil {
			die(err, 2)
		}
		headResp.Body.Close()
		targetSize := headResp.ContentLength
		targetFile := NewTargetFile(path, ordinal, targetSize)
		stat := func() {
			bytesWritten := targetFile.currentSize
			percentage := float32(bytesWritten) / float32(targetSize) * 100
			fmt.Printf(statMsg, ordinal, numImgs, shortName, bytesWritten, targetSize, percentage)
		}
		stat()
		if targetFile.isDownloaded() {
			fmt.Printf("\n%s already downloaded; skipping.", path)
		} else if targetFile.endsWithJpegEofMarker() || targetFile.currentSizeIsTargetSize() {
			die(errors.New("file ends with a JPEG EOF marker OR its byte count matches, but not both; too confused to continue"), 3)
		} else {
			for {
				err := downloadImg(imgUrl, targetFile, stat)
				// io.EOF comes from the TCP socket: it can mean the image
				// transferred fully, or that the socket was closed
				// prematurely. A nil error means the byte count was
				// reached. Anything else is fatal.
				if err == nil || err == io.EOF {
					if targetFile.currentSize == targetSize {
						fmt.Printf(doneMsg, targetFile.ordinal, numImgs, shortName, time.Since(startTime))
						break
					}
					fmt.Printf("\nRetrying %s...\n", fileName)
				} else {
					die(err, 4)
				}
			}
		}
		targetFile.Close() // XXX: Do this here or somewhere else?
		fmt.Printf("\nPausing for %d seconds...", pauseTime/time.Second)
		time.Sleep(pauseTime)
	}
}
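
// TargetFile wraps an *os.File along with the image's ordinal, the byte
// count currently on disk, and the expected total size from the HEAD
// request.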
type TargetFile struct {
	*os.File
	ordinal     int
	currentSize int64
	targetSize  int64
}
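
// NewTargetFile opens path for reading and writing, creating it if it
// doesn't exist, and records how many bytes it already contains.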
func NewTargetFile(path string, ordinal int, targetSize int64) *TargetFile {
	var currentSize int64
	f, err := os.OpenFile(path, os.O_RDWR, 0664)
	if os.IsNotExist(err) {
		if f, err = os.Create(path); err != nil {
			die(err, 100)
		}
	} else if err != nil {
		die(err, 100)
	} else {
		fileInfo, err := f.Stat()
		if err != nil {
			die(err, 100)
		}
		currentSize = fileInfo.Size()
	}
	return &TargetFile{f, ordinal, currentSize, targetSize}
}
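
// Write passes through to the embedded *os.File while keeping a running
// count of bytes written, which the progress reporter reads.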
func (f *TargetFile) Write(b []byte) (n int, err error) {
	n, err = f.File.Write(b)
	f.currentSize += int64(n)
	return
}
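
// A file counts as fully downloaded only when its size matches the
// Content-Length reported by the server AND it ends with a JPEG EOF
// marker; exactly one of these holding is treated as an error in main.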
func (f *TargetFile) isDownloaded() bool {
	return f.currentSizeIsTargetSize() && f.endsWithJpegEofMarker()
}

func (f *TargetFile) currentSizeIsTargetSize() bool {
	return f.currentSize == f.targetSize
}

func (f *TargetFile) endsWithJpegEofMarker() bool {
	return endsWithJpegEofMarker(f.File)
}

// A valid JPEG file has JPEG info and ends with a JPEG EOF marker
func isValidJpegFile(f *os.File) bool {
	_, err := jpeg.DecodeConfig(f)
	return err == nil && endsWithJpegEofMarker(f)
}

func isJpegEofMarker(marker []byte) bool {
	return (len(marker) == 2) && (marker[0] == 0xff && marker[1] == 0xd9)
}

func endsWithJpegEofMarker(f *os.File) bool {
	info, err := f.Stat()
	if err != nil {
		return false
	}
	size := info.Size()
	if size < 2 {
		return false
	}
	a := make([]byte, 2)
	if n, _ := f.ReadAt(a, size-2); n != 2 {
		return false
	}
	return isJpegEofMarker(a)
}

// When no -start flag is present, figure out where to start downloading.
// This function returns the ordinal of the first image that is either not
// complete (no JPEG EOF marker) or not present.
func findStart(start int, outputDir string, numImgs int) (int, error) {
	if start > 0 {
		return start, nil
	}
	infoList, err := ioutil.ReadDir(outputDir)
	if err != nil {
		return 0, err
	}
	present := make([]int, numImgs+1)
	for _, info := range infoList {
		name := info.Name()
		if ext := filepath.Ext(name); ext == ".jpg" {
			nameMinusExt := name[:len(name)-len(ext)]
			i, err := strconv.Atoi(nameMinusExt)
			if err != nil {
				return 0, err
			}
			// Ignore stray .jpg files whose names fall outside 1..numImgs
			if i < 1 || i > numImgs {
				continue
			}
			f, err := os.Open(filepath.Join(outputDir, name))
			if err != nil {
				return 0, err
			}
			if endsWithJpegEofMarker(f) {
				present[i] = i
			}
			f.Close()
		}
	}
	for i, v := range present {
		if i != v {
			return i, nil
		}
	}
	return 0, errors.New("all images appear to be present")
}
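
// getPreviewPage fetches the image's preview page and returns its raw HTML.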
func getPreviewPage(pageUrl string) ([]byte, error) {
	resp, err := http.Get(pageUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
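
// extractImgUrl finds the protocol-relative image URL in the preview
// page's "fullImageLink" div and returns it with an "http:" prefix.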
func extractImgUrl(page []byte) (string, error) {
	re := regexp.MustCompile(imgUrlRegexp)
	m := re.FindSubmatch(page)
	if len(m) != 2 {
		return "", errors.New("could not find image URL")
	}
	return fmt.Sprintf("http:%s", m[1]), nil
}
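
// downloadImg fetches the image, sending a Range header when part of the
// file is already on disk, and appends the body to file in chunkSize
// pieces, calling stat after each chunk.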
func downloadImg(imgUrl string, file *TargetFile, stat func()) error {
	client := &http.Client{}
	req, err := http.NewRequest("GET", imgUrl, nil)
	if err != nil {
		return err
	}
	currentSize := file.currentSize
	targetSize := file.targetSize
	// Resume a partial download by requesting only the remaining bytes
	if currentSize > 0 && currentSize < targetSize {
		req.Header.Add("Range", fmt.Sprintf("bytes=%d-", currentSize))
	}
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		b, _ := ioutil.ReadAll(resp.Body)
		return fmt.Errorf("%s\n%s\n", resp.Status, b)
	}
	contentLength := resp.ContentLength
	if currentSize+contentLength != targetSize {
		return fmt.Errorf("byte counts don't match: %d (file size + Range) != %d (total)",
			currentSize+contentLength, targetSize)
	}
	// Append to whatever has already been downloaded, then fetch the
	// rest of the image in chunks
	file.Seek(0, os.SEEK_END)
	for file.currentSize < targetSize {
		if _, err := io.CopyN(file, resp.Body, chunkSize); err != nil {
			stat()
			return err
		}
		stat()
	}
	return nil
}

func die(err error, errCode int, a ...interface{}) {
	msg := fmt.Sprintf(err.Error(), a...)
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(errCode)
}
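
Usage: assuming the source is saved as scrape.go (the file and directory
names below are hypothetical; the flags are the ones defined in init above):

    go run scrape.go -inputFile urls.txt -outputDir images

To force a particular starting image instead of auto-detecting it:

    go run scrape.go -inputFile urls.txt -outputDir images -start 42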