Last active
November 3, 2019 20:55
-
-
Save sug0/0d9380f6b3931c37619ed64af19f3d1f to your computer and use it in GitHub Desktop.
YouTube search result scraper that doesn't rely on an html parser. Runs in O(len(ResponseBody)).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| //======================================================================= | |
| // NOTICE -- this program has stopped working, please refer to this gist: | |
| // https://gist.github.com/sug0/79680851924800d24f393978a08e9bc7 | |
| //======================================================================= | |
| // | |
| package main | |
| import ( | |
| "log" | |
| "fmt" | |
| "strings" | |
| "unicode/utf8" | |
| "io/ioutil" | |
| "net/http" | |
| "net/url" | |
| "compress/gzip" | |
| "reflect" | |
| "unsafe" | |
| flag "github.com/ogier/pflag" | |
| ) | |
// ytBaseURL is the prefix of the search request URL; ytQuery appends the URL-escaped query and a page parameter to it.
| const ytBaseURL = "https://www.youtube.com/results?search_query=" | |
| func main() { | |
| var delim string | |
| var page int | |
| flag.StringVarP(&delim, "delim", "d", " -- ", "Delim character sequence.") | |
| flag.IntVarP(&page, "page", "p", 1, "The page to search.") | |
| flag.Parse() | |
| args := flag.Args() | |
| err := ytQuery(page-1, strings.Join(args, " "), func(title, uri string) { | |
| fmt.Printf("%s%shttps://www.youtube.com%s\n", title, delim, uri) | |
| }) | |
| if err != nil { | |
| log.Fatalln(err) | |
| } | |
| } | |
| func ytQuery(page int, query string, f func(title, uri string)) error { | |
| query = fmt.Sprintf("%s%s&page=%d", ytBaseURL, url.QueryEscape(query), page) | |
| req, err := http.NewRequest("GET", query, nil) | |
| if err != nil { | |
| return err | |
| } | |
| req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.32 Safari/537.36") | |
| req.Header.Set("Host", "www.youtube.com") | |
| req.Header.Set("Referer", query) | |
| req.Header.Set("Accept", "*/*") | |
| req.Header.Set("Accept-Encoding", "gzip") | |
| req.Header.Set("Accept-Language", "en-US,en;q=0.5") | |
| req.Header.Set("Cache-Control", "max-age=0") | |
| req.Header.Set("DNT", "1") | |
| req.Header.Set("Connection", "close") | |
| var cli http.Client | |
| rsp, err := cli.Do(req) | |
| if err != nil { | |
| return err | |
| } | |
| r, err := gzip.NewReader(rsp.Body) | |
| if err != nil { | |
| return err | |
| } | |
| sliceBody, err := ioutil.ReadAll(r) | |
| if err != nil { | |
| return err | |
| } | |
| body := toString(sliceBody) | |
| // is this even worth it..? | |
| r = nil | |
| var a, b int | |
| for { | |
| a, b = locateTitle(b, body) | |
| if b < 0 { | |
| break | |
| } | |
| title := body[a:b] | |
| a, b = locateURI(b, body) | |
| if b < 0 { | |
| break | |
| } | |
| if title != "TUNE" { | |
| title = decodeString(title) | |
| uri := decodeString(body[a:b]) | |
| f(title, uri) | |
| } | |
| body = body[b:] | |
| } | |
| return nil | |
| } | |
// locateTitle looks for the first `"},"de` marker in body and returns the
// half-open byte range [start, end) of the text between the closest double
// quote before the marker and the marker itself. It returns (-1, -1) when the
// marker is absent or no double quote precedes it.
//
// The i argument is accepted but not consulted; the search always starts at
// the beginning of body.
func locateTitle(i int, body string) (int, int) {
	const marker = `"},"de`

	end := strings.Index(body, marker)
	if end < 0 {
		return -1, -1
	}

	// The title starts right after the last quote that precedes the marker.
	quote := strings.LastIndexByte(body[:end], '"')
	if quote < 0 {
		return -1, -1
	}
	return quote + 1, end
}
// locateURI looks for the first `/wat` marker in body and returns the
// half-open byte range [start, end) running from the marker up to, but not
// including, the next double quote. It returns (-1, -1) when the marker is
// absent or no double quote follows it.
//
// The i argument is accepted but not consulted; the search always starts at
// the beginning of body.
func locateURI(i int, body string) (int, int) {
	const marker = `/wat`

	start := strings.Index(body, marker)
	if start < 0 {
		return -1, -1
	}

	// The URI ends at the first quote at or after the marker.
	rel := strings.IndexByte(body[start:], '"')
	if rel < 0 {
		return -1, -1
	}
	return start, start + rel
}
// toString reinterprets body as a string without copying its bytes: the
// slice header is read back through a *string. The result shares memory with
// body, so later writes to body are visible through the returned string.
func toString(body []byte) string {
	hdr := unsafe.Pointer(&body)
	view := (*string)(hdr)
	return *view
}
// fromString returns a byte slice that views the bytes of body without
// copying them. Length and capacity both equal len(body).
//
// The slice aliases the string's storage, so it must be treated as
// read-only: strings are immutable and may live in read-only memory.
//
// The header of a real []byte variable is filled in rather than building a
// reflect.SliceHeader value. A free-standing SliceHeader stores the data
// address as a plain uintptr that the garbage collector does not treat as a
// reference, which the unsafe.Pointer rules explicitly forbid.
func fromString(body string) []byte {
	var b []byte
	src := (*reflect.StringHeader)(unsafe.Pointer(&body))
	dst := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	dst.Data = src.Data
	dst.Len = src.Len
	dst.Cap = src.Len
	return b
}
// decodeString returns str with every JSON-style \uXXXX escape replaced by
// the UTF-8 encoding of the code point it names. A high surrogate that is
// immediately followed by a low surrogate escape is combined into a single
// code point. Anything that is not a well-formed escape (too short, or with
// non-hex digits) is copied through unchanged, so no errors should occur even
// when the scraped JSON is malformed.
//
// The input is never modified: decoding happens in a private copy. Writing
// through the string's own storage is undefined behaviour and faults on
// read-only strings.
func decodeString(str string) string {
	// Fast path: without a backslash-u there is nothing to rewrite.
	if !strings.Contains(str, `\u`) {
		return str
	}

	// Decode in place inside the copy. The write cursor k never overtakes the
	// read cursor i, because an escape consumes 6 (or 12) bytes and produces
	// at most 3 (or 4).
	p := []byte(str)
	k := 0
	for i := 0; i < len(p); {
		r, ok := isEscaped(p, i)
		if !ok {
			p[k] = p[i]
			i++
			k++
			continue
		}
		i += 6

		// Code points above U+FFFF arrive as a UTF-16 surrogate pair.
		if r >= 0xD800 && r <= 0xDBFF {
			if lo, ok := isEscaped(p, i); ok && lo >= 0xDC00 && lo <= 0xDFFF {
				r = 0x10000 + ((r - 0xD800) << 10) + (lo - 0xDC00)
				i += 6
			}
		}

		// Write at the output cursor, not at the escape's own position.
		k += utf8.EncodeRune(p[k:], r)
	}
	return string(p[:k])
}

// isEscaped reports whether p holds a complete \uXXXX escape starting at
// index i and, if so, returns the 16-bit value spelled by its four hex
// digits. It returns (0, false) when i is out of range, fewer than six bytes
// remain, the prefix is not `\u`, or any digit is not hexadecimal.
func isEscaped(p []byte, i int) (rune, bool) {
	// Bounds are checked before any indexing so a bad i cannot panic.
	if i < 0 || i+6 > len(p) || p[i] != '\\' || p[i+1] != 'u' {
		return 0, false
	}
	var r rune
	for _, c := range p[i+2 : i+6] {
		d := hexcode(c)
		if d < 0 {
			return 0, false
		}
		// Most significant digit first.
		r = r<<4 | d
	}
	return r, true
}

// hexcode returns the numeric value (0-15) of the ASCII hexadecimal digit b,
// accepting both upper and lower case, or -1 when b is not a hex digit.
func hexcode(b byte) rune {
	switch {
	case b >= '0' && b <= '9':
		return rune(b - '0')
	case b >= 'a' && b <= 'f':
		return rune(b-'a') + 10
	case b >= 'A' && b <= 'F':
		return rune(b-'A') + 10
	default:
		return -1
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment