Created September 25, 2012 18:55
My first Go program
/*
This "script" scrapes a collection of images from Wikipedia.

It reads a file containing a list of URLs (that were previously scraped
from the index page of the collection). For each of those URLs, it
retrieves the pointed-to page, locates a particular image URL within that
page (using a regex), and then downloads the image.

The image is written in 1MB chunks; progress is reported as each chunk is
written. Downloaded images are named N.jpg, where N corresponds to the
line number in the URL file.

If an image has already been downloaded, it is skipped. A file is
considered already downloaded if it ends with a JPEG EOF marker (FF D9)
AND it has the same byte count as the source image; if only one of these
conditions is true, the program aborts (in that case, delete the
offending file and try again).

If an image has been partially downloaded, the next attempt at downloading
it will resume directly after the already-downloaded bytes by adding a
Range header to the request.

Command line args:

    -inputFile  The file containing the list of URLs, one per line
    -start      Image to start from (line # in list of URLs); if not
                present, start from the first incomplete or missing image
    -outputDir  The directory into which images will be downloaded
TODO:

    - Resume after suspend
    - Retry after network failure
    - Allow downloading of a single file
*/
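
/*
For illustration: resuming is an ordinary HTTP range request. With, say,
1048576 bytes already on disk, the exchange looks roughly like this (the
URL path and byte counts here are made up):

    GET /wikipedia/commons/a/a1/Example.jpg HTTP/1.1
    Range: bytes=1048576-

    HTTP/1.1 206 Partial Content
    Content-Range: bytes 1048576-5242879/5242880
    Content-Length: 4194304

Note that the Content-Length of a 206 response counts only the remaining
bytes, which is why downloadImg checks currentSize+contentLength against
targetSize rather than contentLength alone.
*/
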
package main

import (
	"errors"
	"flag"
	"fmt"
	"image/jpeg"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"
)

const (
	chunkSize    = 1 << 20 // 1MB
	statMsg      = "\rDownloading %d/%d %s (%d/%d %.2f%%)..." // index, total, name, bytes read, total bytes, percent read
	doneMsg      = "\nDownloaded %d/%d %s in %v." // index, total, name, time
	pauseTime    = 2 * time.Second
	imgUrlRegexp = `<div class="fullImageLink".*><a href="(//upload.wikimedia.org/wikipedia/commons/.+/.+/.+\.jpg)">`
)

var (
	start     int
	inputFile string
	outputDir string
)

func init() {
	flag.StringVar(&inputFile, "inputFile", "", "File containing URLs, one per line")
	flag.IntVar(&start, "start", 0, "Image to start at")
	flag.StringVar(&outputDir, "outputDir", "", "Output directory")
}

func main() {
	flag.Parse()
	// Get the list of preview page URLs from the specified file
	content, err := ioutil.ReadFile(inputFile)
	if err != nil {
		die(err, 1)
	}
	// Drop a trailing newline so it doesn't become an empty URL
	if len(content) > 0 && content[len(content)-1] == '\n' {
		content = content[:len(content)-1]
	}
	urls := strings.Split(string(content), "\n")
	numImgs := len(urls)
	start, err := findStart(start, outputDir, numImgs)
	if err != nil {
		die(err, 1)
	}
	urls = urls[start-1:]
	fmt.Printf("Starting from image %d.\n", start)
	for i, pageUrl := range urls {
		startTime := time.Now()
		ordinal := start + i
		fileName := fmt.Sprintf("%d.jpg", ordinal)
		path := filepath.Join(outputDir, fileName)
		page, err := getPreviewPage(pageUrl)
		if err != nil {
			die(err, 2)
		}
		imgUrl, err := extractImgUrl(page)
		if err != nil {
			die(err, 2)
		}
		// Get last part of image URL for stat message
		shortName := strings.Split(imgUrl, "wikipedia/commons")[1]
		// Make HEAD request to get total number of bytes
		headResp, err := http.Head(imgUrl)
		if err != nil {
			die(err, 2)
		}
		headResp.Body.Close()
		targetSize := headResp.ContentLength
		targetFile := NewTargetFile(path, ordinal, targetSize)
		stat := func() {
			bytesWritten := targetFile.currentSize
			percentage := float32(bytesWritten) / float32(targetSize) * 100
			fmt.Printf(statMsg, ordinal, numImgs, shortName, bytesWritten, targetSize, percentage)
		}
		stat()
		if targetFile.isDownloaded() {
			fmt.Printf("\n%s already downloaded; skipping.", path)
		} else if targetFile.endsWithJpegEofMarker() || targetFile.currentSizeIsTargetSize() {
			die(errors.New("file ends with a JPEG EOF marker OR its byte count matches, but not both; too confused to continue"), 3)
		} else {
			for {
				err := downloadImg(imgUrl, targetFile, stat)
				// io.EOF comes from the TCP socket: it can mean the image
				// transferred fully, or that the socket was closed
				// prematurely. A nil error means the byte count was
				// reached. Anything else is fatal.
				if err == nil || err == io.EOF {
					if targetFile.currentSize == targetSize {
						fmt.Printf(doneMsg, targetFile.ordinal, numImgs, shortName, time.Since(startTime))
						break
					}
					fmt.Printf("\nRetrying %s...\n", fileName)
				} else {
					die(err, 4)
				}
			}
		}
		targetFile.Close() // XXX: Do this here or somewhere else?
		fmt.Printf("\nPausing for %d seconds...", pauseTime/time.Second)
		time.Sleep(pauseTime)
	}
}
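
// TargetFile wraps an *os.File along with the image's ordinal, the byte
// count currently on disk, and the expected total size from the HEAD
// request.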
type TargetFile struct {
	*os.File
	ordinal     int
	currentSize int64
	targetSize  int64
}
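
// NewTargetFile opens path for reading and writing, creating it if it
// doesn't exist, and records how many bytes it already contains.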
func NewTargetFile(path string, ordinal int, targetSize int64) *TargetFile {
	var currentSize int64
	f, err := os.OpenFile(path, os.O_RDWR, 0664)
	if os.IsNotExist(err) {
		if f, err = os.Create(path); err != nil {
			die(err, 100)
		}
	} else if err != nil {
		die(err, 100)
	} else {
		fileInfo, err := f.Stat()
		if err != nil {
			die(err, 100)
		}
		currentSize = fileInfo.Size()
	}
	return &TargetFile{f, ordinal, currentSize, targetSize}
}
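
// Write passes through to the embedded *os.File while keeping a running
// count of bytes written, which the progress reporter reads.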
func (f *TargetFile) Write(b []byte) (n int, err error) {
	n, err = f.File.Write(b)
	f.currentSize += int64(n)
	return
}
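
// A file counts as fully downloaded only when its size matches the
// Content-Length reported by the server AND it ends with a JPEG EOF
// marker; exactly one of these holding is treated as an error in main.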
func (f *TargetFile) isDownloaded() bool {
	return f.currentSizeIsTargetSize() && f.endsWithJpegEofMarker()
}

func (f *TargetFile) currentSizeIsTargetSize() bool {
	return f.currentSize == f.targetSize
}

func (f *TargetFile) endsWithJpegEofMarker() bool {
	return endsWithJpegEofMarker(f.File)
}

// A valid JPEG file has JPEG info and ends with a JPEG EOF marker
func isValidJpegFile(f *os.File) bool {
	_, err := jpeg.DecodeConfig(f)
	return err == nil && endsWithJpegEofMarker(f)
}

func isJpegEofMarker(marker []byte) bool {
	return (len(marker) == 2) && (marker[0] == 0xff && marker[1] == 0xd9)
}

func endsWithJpegEofMarker(f *os.File) bool {
	info, err := f.Stat()
	if err != nil {
		return false
	}
	size := info.Size()
	if size < 2 {
		return false
	}
	a := make([]byte, 2)
	if n, _ := f.ReadAt(a, size-2); n != 2 {
		return false
	}
	return isJpegEofMarker(a)
}

// When no -start flag is present, figure out where to start downloading.
// This function returns the ordinal of the first image that is either not
// complete (no JPEG EOF marker) or not present.
func findStart(start int, outputDir string, numImgs int) (int, error) {
	if start > 0 {
		return start, nil
	}
	infoList, err := ioutil.ReadDir(outputDir)
	if err != nil {
		return 0, err
	}
	present := make([]int, numImgs+1)
	for _, info := range infoList {
		name := info.Name()
		if ext := filepath.Ext(name); ext == ".jpg" {
			nameMinusExt := name[:len(name)-len(ext)]
			i, err := strconv.Atoi(nameMinusExt)
			if err != nil {
				return 0, err
			}
			// Ignore stray .jpg files whose names fall outside 1..numImgs
			if i < 1 || i > numImgs {
				continue
			}
			f, err := os.Open(filepath.Join(outputDir, name))
			if err != nil {
				return 0, err
			}
			if endsWithJpegEofMarker(f) {
				present[i] = i
			}
			f.Close()
		}
	}
	for i, v := range present {
		if i != v {
			return i, nil
		}
	}
	return 0, errors.New("all images appear to be present")
}
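
// getPreviewPage fetches the image's preview page and returns its raw HTML.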
func getPreviewPage(pageUrl string) ([]byte, error) {
	resp, err := http.Get(pageUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
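
// extractImgUrl finds the protocol-relative image URL in the preview
// page's "fullImageLink" div and returns it with an "http:" prefix.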
func extractImgUrl(page []byte) (string, error) {
	re := regexp.MustCompile(imgUrlRegexp)
	m := re.FindSubmatch(page)
	if len(m) != 2 {
		return "", errors.New("could not find image URL")
	}
	return fmt.Sprintf("http:%s", m[1]), nil
}
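
// downloadImg fetches the image, sending a Range header when part of the
// file is already on disk, and appends the body to file in chunkSize
// pieces, calling stat after each chunk.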
func downloadImg(imgUrl string, file *TargetFile, stat func()) error {
	client := &http.Client{}
	req, err := http.NewRequest("GET", imgUrl, nil)
	if err != nil {
		return err
	}
	currentSize := file.currentSize
	targetSize := file.targetSize
	// Resume a partial download by requesting only the remaining bytes
	if currentSize > 0 && currentSize < targetSize {
		req.Header.Add("Range", fmt.Sprintf("bytes=%d-", currentSize))
	}
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		b, _ := ioutil.ReadAll(resp.Body)
		return fmt.Errorf("%s\n%s\n", resp.Status, b)
	}
	contentLength := resp.ContentLength
	if currentSize+contentLength != targetSize {
		return fmt.Errorf("byte counts don't match: %d (file size + Range) != %d (total)",
			currentSize+contentLength, targetSize)
	}
	// Append to whatever has already been downloaded, then fetch the
	// rest of the image in chunks
	file.Seek(0, os.SEEK_END)
	for file.currentSize < targetSize {
		if _, err := io.CopyN(file, resp.Body, chunkSize); err != nil {
			stat()
			return err
		}
		stat()
	}
	return nil
}

func die(err error, errCode int, a ...interface{}) {
	msg := fmt.Sprintf(err.Error(), a...)
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(errCode)
}
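
Usage: assuming the source is saved as scrape.go (the file and directory
names below are hypothetical; the flags are the ones defined in init above):

    go run scrape.go -inputFile urls.txt -outputDir images

To force a particular starting image instead of auto-detecting it:

    go run scrape.go -inputFile urls.txt -outputDir images -start 42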