Skip to content

Instantly share code, notes, and snippets.

@sug0
Last active November 3, 2019 20:55
Show Gist options
  • Select an option

  • Save sug0/0d9380f6b3931c37619ed64af19f3d1f to your computer and use it in GitHub Desktop.

Select an option

Save sug0/0d9380f6b3931c37619ed64af19f3d1f to your computer and use it in GitHub Desktop.
YouTube search result scraper that doesn't rely on an html parser. Runs in O(len(ResponseBody)).
//
//=======================================================================
// NOTICE -- this program has stopped working, please refer to this gist:
// https://gist.github.com/sug0/79680851924800d24f393978a08e9bc7
//=======================================================================
//
package main
import (
"log"
"fmt"
"strings"
"unicode/utf8"
"io/ioutil"
"net/http"
"net/url"
"compress/gzip"
"reflect"
"unsafe"
flag "github.com/ogier/pflag"
)
const ytBaseURL = "https://www.youtube.com/results?search_query="
func main() {
var delim string
var page int
flag.StringVarP(&delim, "delim", "d", " -- ", "Delim character sequence.")
flag.IntVarP(&page, "page", "p", 1, "The page to search.")
flag.Parse()
args := flag.Args()
err := ytQuery(page-1, strings.Join(args, " "), func(title, uri string) {
fmt.Printf("%s%shttps://www.youtube.com%s\n", title, delim, uri)
})
if err != nil {
log.Fatalln(err)
}
}
func ytQuery(page int, query string, f func(title, uri string)) error {
query = fmt.Sprintf("%s%s&page=%d", ytBaseURL, url.QueryEscape(query), page)
req, err := http.NewRequest("GET", query, nil)
if err != nil {
return err
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.32 Safari/537.36")
req.Header.Set("Host", "www.youtube.com")
req.Header.Set("Referer", query)
req.Header.Set("Accept", "*/*")
req.Header.Set("Accept-Encoding", "gzip")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Cache-Control", "max-age=0")
req.Header.Set("DNT", "1")
req.Header.Set("Connection", "close")
var cli http.Client
rsp, err := cli.Do(req)
if err != nil {
return err
}
r, err := gzip.NewReader(rsp.Body)
if err != nil {
return err
}
sliceBody, err := ioutil.ReadAll(r)
if err != nil {
return err
}
body := toString(sliceBody)
// is this even worth it..?
r = nil
var a, b int
for {
a, b = locateTitle(b, body)
if b < 0 {
break
}
title := body[a:b]
a, b = locateURI(b, body)
if b < 0 {
break
}
if title != "TUNE" {
title = decodeString(title)
uri := decodeString(body[a:b])
f(title, uri)
}
body = body[b:]
}
return nil
}
func locateTitle(i int, body string) (int, int) {
const pattern = `"},"de`
j := strings.Index(body, pattern)
if j < 0 {
return -1, -1
}
i = j - 1
for i >= 0 {
if body[i] == '"' {
return i+1, j
}
i--
}
return -1, -1
}
func locateURI(i int, body string) (int, int) {
const pattern = `/wat`
i = strings.Index(body, pattern)
if i < 0 {
return -1, -1
}
j := i
for j < len(body) {
if body[j] == '"' {
return i, j
}
j++
}
return -1, -1
}
func toString(body []byte) string {
return *(*string)(unsafe.Pointer(&body))
}
func fromString(body string) []byte {
sh := (*reflect.StringHeader)(unsafe.Pointer(&body))
return *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{
Data: sh.Data,
Len: sh.Len,
Cap: sh.Len,
}))
}
// no errors should occur assuming the scraped
// json has none either
func decodeString(str string) string {
if str == "" {
return ""
}
k := 0
p := fromString(str)
for i := 0; i < len(p); {
if r, ok := isEscaped(p, i); ok {
k += utf8.EncodeRune(p[i:], r)
i += 6
continue
}
p[k] = p[i]
i++
k++
}
//return toString(p[:k])
return str[:k]
}
func isEscaped(p []byte, i int) (rune, bool) {
if p[i] == '\\' && i+5 < len(p) && p[i+1] == 'u' {
p = p[i+2:i+6]
// i : 0 1 2 3 <--
// \ u A B C D
r := hexcode(p[3]) << 0
r |= hexcode(p[2]) << 4
r |= hexcode(p[1]) << 8
r |= hexcode(p[0]) << 12
return r, true
}
return 0, false
}
func hexcode(b byte) rune {
switch {
default:
return 0
case b >= 'a' && b <= 'z':
return rune(b - 'a' + 10)
case b >= 'A' && b <= 'Z':
return rune(b - 'A' + 10)
case b >= '0' && b <= '9':
return rune(b - '0')
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment