Last active
November 6, 2017 04:47
-
-
Save SilverCory/339d11a49f9bb699271ca53aae67cfec to your computer and use it in GitHub Desktop.
Scrape the information for every video on a YouTube channel and store it in JSON files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"flag" | |
"fmt" | |
"io/ioutil" | |
"net/http" | |
"os" | |
"strconv" | |
) | |
type Page struct { | |
Token string `json:"nextPageToken"` | |
Videos []Video `json:"items"` | |
} | |
type Video struct { | |
ID ID `json:"id"` | |
} | |
// ID is the "id" object of a search entry. VideoID stays empty when the
// object carries no "videoId" key.
type ID struct {
	VideoID string `json:"videoId"`
}
func main() { | |
apikey := flag.String("apikey", "", "Your youtube data enabled API key") | |
channelId := flag.String("channelId", "", "The channel id of the person you're scraping.") | |
flag.Parse() | |
if err := os.Mkdir(*channelId, 0644); err != nil { | |
fmt.Println("Error making channel directory!") | |
return | |
} | |
pageNumber := 1 | |
var nextPage *Page | |
request := "https://www.googleapis.com/youtube/v3/search?key=" + *apikey + "&channelId=" + *channelId + "&part=snippet,id&order=date&maxResults=50" | |
for { | |
req := request | |
if nextPage == nil || nextPage.Token == "" { | |
if pageNumber != 1 { | |
fmt.Println("Unexpected end!") | |
return | |
} | |
} else { | |
req += "&pageToken=" + nextPage.Token | |
} | |
fmt.Println("Fetching page: ", strconv.Itoa(pageNumber)) | |
data, page, err := getPage(req) | |
if err != nil { | |
return | |
} | |
if err := writeData(*channelId, data, pageNumber); err != nil { | |
fmt.Println("Error writing file!", err) | |
return | |
} | |
traverseVideos(*channelId, *apikey, page.Videos) | |
if page.Token == "" { | |
return | |
} | |
nextPage = page | |
pageNumber += 1 | |
} | |
} | |
func traverseVideos(channelId, apikey string, videos []Video) { | |
for _, v := range videos { | |
fmt.Println(" - Fetching video id: " + v.ID.VideoID) | |
fmt.Println(" - URL: https://www.googleapis.com/youtube/v3/videos?part=id%2C+snippet&id=" + v.ID.VideoID + "&key=" + apikey) | |
req, err := http.Get("https://www.googleapis.com/youtube/v3/videos?part=id%2C+snippet&id=" + v.ID.VideoID + "&key=" + apikey) | |
if err != nil { | |
fmt.Println(" - Error getting video page!", err) | |
continue | |
} else if req.StatusCode < 200 || req.StatusCode > 209 { | |
fmt.Println(" - Non 20x response status!", req.Status) | |
continue | |
} | |
data, err := ioutil.ReadAll(req.Body) | |
if err != nil { | |
fmt.Println(" - Error reading video page!", err) | |
continue | |
} | |
err = ioutil.WriteFile(channelId+"/"+v.ID.VideoID+".json", data, 0644) | |
if err != nil { | |
fmt.Println(" - Error reading writing video page!", err) | |
continue | |
} | |
} | |
} | |
// writeData saves the raw bytes of search page number as
// "<channelId>_page<number>.json" with mode 0644 and returns the error from
// the write, if any. The file name is built by plain concatenation, so the
// file is created next to the channel directory, not inside it.
func writeData(channelId string, data []byte, number int) error {
	name := fmt.Sprintf("%s_page%d.json", channelId, number)
	return ioutil.WriteFile(name, data, 0644)
}
func getPage(url string) ([]byte, *Page, error) { | |
fmt.Println("URL: ", url) | |
req, err := http.Get(url) | |
if err != nil { | |
fmt.Println("Error getting page!", err) | |
return []byte{}, nil, err | |
} else if req.StatusCode < 200 || req.StatusCode > 209 { | |
fmt.Println("Non 20x response status!", req.Status) | |
return []byte{}, nil, fmt.Errorf("non 20x response status %q", req.Status) | |
} | |
defer req.Body.Close() | |
pageContents, err := ioutil.ReadAll(req.Body) | |
if err != nil { | |
fmt.Println("Error reading page!", err) | |
return []byte{}, nil, err | |
} | |
page := &Page{} | |
err = json.Unmarshal(pageContents, page) | |
if err != nil { | |
fmt.Println("Error decoding page!", err) | |
return []byte{}, nil, err | |
} | |
return pageContents, page, nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment