Created
April 5, 2010 05:16
-
-
Save rkumar/356056 to your computer and use it in GitHub Desktop.
youtube downloader using gawk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/gawk -f | |
| # | |
| # 2007.07.10 v1.0 - initial release | |
| # 2007.10.21 v1.1 - youtube changed the way it displays vids | |
| # 2008.03.01 v1.2 - youtube changed the way it displays vids | |
| # 2008.08.28 v1.3 - added a progress bar and removed need for --re-interval | |
| # 2009.08.25 v1.4 - youtube changed the way it displays vids | |
| # 2010.04.05 - youtube changes video_url format | |
| # | |
| # Peteris Krumins (peter@catonmat.net) | |
| # http://www.catonmat.net -- good coders code, great reuse | |
| # | |
| # Usage: gawk -f get_youtube_vids.awk <http://youtube.com/watch?v=ID1 | ID1> ... | |
| # or just ./get_youtube_vids.awk <http://youtube.com/watch?v=ID1 | ID1> | |
| # | |
BEGIN {
    # Entry point: every command line argument is either a full YouTube URL
    # or a bare video ID.  First pass validates/normalizes each argument,
    # second pass fetches video info and downloads each video.
    if (ARGC == 1) usage();
    BINMODE = 3          # gawk: binary-safe input AND output (matters on Windows)
    delete ARGV[0]       # drop the program name so only video arguments remain
    print "Parsing YouTube video urls/IDs..."
    for (i in ARGV) {
        vid_id = parse_url(ARGV[i])
        if (length(vid_id) < 6) { # havent seen youtube vids with IDs < 6 chars
            print "Invalid YouTube video specified: " ARGV[i] ", not downloading!"
            continue
        }
        VIDS[i] = vid_id          # keep original ARGV index as the key
        print "ARGV: " VIDS[i] "..."
    }
    print " ........... "
    for (i in VIDS) {
        print "Getting video information for video: " VIDS[i] "..."
        get_vid_info(VIDS[i], INFO)   # fills INFO in place (arrays pass by reference)
        if (INFO["_redirected"]) {
            # server redirected the watch page -- treat as "video not found"
            print "Could not get video info for video: " VIDS[i]
            continue
        }
        if (!INFO["video_url"]) {
            print "Could not get video_url for video: " VIDS[i]
            print "Please goto my website, and submit a comment with an URL to this video, so that I can fix it!"
            print "Url: http://www.catonmat.net/blog/downloading-youtube-videos-with-gawk/"
            continue
        }
        if ("title" in INFO) {
            print "Downloading: " INFO["title"] "..."
            title = INFO["title"] "_" VIDS[i]   # append the ID to keep names unique
        }
        else {
            print "Could not get title for video: " VIDS[i]
            print "Trying to download " VIDS[i] " anyway"
            title = VIDS[i]
        }
        download_video(INFO["video_url"], title)
    }
}
function usage(    msg, n, k) {
    # Emit the banner and invocation examples, then quit with a failing
    # exit status.  Output is byte-identical to printing each line directly.
    n = 0
    msg[++n] = "Downloading YouTube Videos with GNU Awk"
    msg[++n] = "Peteris Krumins (peter@catonmat.net)"
    msg[++n] = "http://www.catonmat.net -- good coders code, great reuse"
    msg[++n] = "Usage: gawk -f get_youtube_vids.awk <http://youtube.com/watch?v=ID1 | ID1> ..."
    msg[++n] = "or just ./get_youtube_vids.awk <http://youtube.com/watch?v=ID1 | ID1> ..."
    for (k = 1; k <= n; k++)
        print msg[k]
    exit 1
}
| # | |
| # function parse_url | |
| # | |
| # takes a url or an ID of a youtube video and returns just the ID | |
| # for example the url could be the full url: http://www.youtube.com/watch?v=ID | |
| # or it could be www.youtube.com/watch?v=ID | |
| # or just youtube.com/watch?v=ID or http://youtube.com/watch?v=ID | |
| # or just the ID | |
| # | |
#
# function parse_url
#
# Takes a URL or a bare ID of a YouTube video and returns just the ID.
# Accepts any of:
#   http://www.youtube.com/watch?v=ID   https://youtube.com/watch?v=ID
#   www.youtube.com/watch?v=ID          youtu.be/ID (and https variant)
#   or just the ID itself.
# Anything after the first '&' (extra query parameters) is discarded.
#
function parse_url(url,    p) {
    # 'p' is now a declared local -- previously it leaked as a global.
    # Anchored sub() instead of gsub(): strip each prefix once, only at
    # the start, and also accept https:// and the youtu.be short form.
    sub(/^https?:\/\//, "", url)                # get rid of the scheme
    sub(/^www\./, "", url)                      # get rid of www. part
    sub(/^youtube\.com\/watch\?v=/, "", url)    # long watch-page form
    sub(/^youtu\.be\//, "", url)                # short-link form
    if ((p = index(url, "&")) > 0)              # drop &foo=bar&... after the ID
        url = substr(url, 1, p-1)
    return url
}
| # | |
| # function get_vid_info | |
| # | |
| # function takes the youtube video ID and gets the title of the video | |
| # and the url to .flv file | |
| # | |
function get_vid_info(vid_id, INFO, InetFile, Request, HEADERS, matches, escaped_urls, fmt_urls, fmt) {
    # Fetch the watch page for vid_id over a raw gawk TCP coprocess and fill
    # INFO (cleared first; arrays pass by reference):
    #   INFO["_redirected"] - 1 if the server answered with a Location header
    #   INFO["video_url"]   - unescaped URL of the video stream, when found
    #   INFO["title"]       - video title, when one of the title patterns hits
    # Everything after vid_id and INFO in the parameter list is a local.
    delete INFO
    InetFile = "/inet/tcp/0/www.youtube.com/80"   # gawk special networking file
    Request = "GET /watch?v=" vid_id " HTTP/1.1\r\n"
    Request = Request "Host: www.youtube.com\r\n\r\n"
    get_headers(InetFile, Request, HEADERS)
    if ("Location" in HEADERS) {
        # a redirect on the watch page means a bad/moved video ID -- bail out
        INFO["_redirected"] = 1
        close(InetFile)
        return
    }
    # scan the page body line by line for the stream map and the title
    while ((InetFile |& getline) > 0) {
        # changed 2010-04-05 10:50 due to change in fmt_url_map format (by rahul kumar)
        # NOTE(review): the author notes the class should be [^&]+ rather than
        # [^"]+ -- left as-is because it reportedly worked against the live page.
        if (match($0, /fmt_url_map=([^"]+)&/, matches)) {
            escaped_urls = url_unescape(matches[1])
            # fmt_url_map is a list of "<fmt-number>|<url>" pairs; split on the
            # "<digits>|" markers (optionally preceded by the separating comma)
            split(escaped_urls, fmt_urls, /,?[0-9]+\|/)
            for (fmt in fmt_urls) {
                if (fmt_urls[fmt] ~ /itag=5/) {
                    # fmt number 5 is the best video
                    INFO["video_url"] = fmt_urls[fmt]
                    close(InetFile)
                    return
                }
            }
            # no itag=5 entry: leave INFO["video_url"] unset and stop reading
            close(InetFile)
            return
        }
        else if (match($0, /<title>YouTube - ([^<]+)</, matches)) {
            # lets try to get the title of the video from html tag which is
            # less likely a subject to future html design changes
            # THIS does not work since the above pattern has been split into multiple lines.
            INFO["title"] = matches[1]
        }
        else if (match($0, /<h1 >([^<]+)</, matches)) {
            # fallback: title from the page's <h1> element in newer layouts
            INFO["title"] = matches[1]
            printf " ----> Got title: %s\n\n", INFO["title"]
        }
        else if (match($0, /VIDEO_TITLE': '([^']+)'/, matches)) {
            # fallback: title from the page's inline JS config
            # block added by rahul kumar since format has changed RK
            INFO["title"] = matches[1]
            printf " ----> GOT title: %s\n\n", INFO["title"]
        }
    }
    close(InetFile)
}
| # | |
| # function url_unescape | |
| # | |
| # given a string, it url-unescapes it. | |
| # charactes such as %20 get converted to their ascii counterparts. | |
| # | |
#
# function url_unescape
#
# Given a string, url-unescapes it: sequences such as %20 are converted to
# their ASCII counterparts.  Percent signs not followed by two hex digits
# are left untouched.
#
function url_unescape(str, nmatches, entity, repl, entities, seen, i) {
    nmatches = find_all_matches(str, "%[0-9A-Fa-f][0-9A-Fa-f]", entities)
    for (i = 1; i <= nmatches; i++) {
        entity = entities[i]
        if (!seen[entity]) {           # each distinct entity is replaced once, globally
            repl = url_entity_unescape(entity)
            # gsub() gives both '&' and '\' special meaning in the replacement
            # text.  The old code only special-cased %26 ('&'); decoding %5C
            # injected a lone backslash and produced undefined behaviour.
            if (repl == "&")
                repl = "\\&"           # literal ampersand
            else if (repl == "\\")
                repl = "\\\\"          # literal backslash
            gsub(entity, repl, str)
            seen[entity] = 1
        }
    }
    return str
}
| # | |
| # function find_all_matches | |
| # | |
| # http://awk.freeshell.org/FindAllMatches | |
| # | |
#
# function find_all_matches
#
# Collects every non-overlapping match of regex re in str into arr[1..n]
# and returns n.  RSTART/RLENGTH are restored so callers see no side effects.
# (Based on http://awk.freeshell.org/FindAllMatches)
#
function find_all_matches(str, re, arr, count, saved_start, saved_length) {
    saved_start = RSTART           # match() clobbers these globals;
    saved_length = RLENGTH         # remember them and restore on exit
    count = 0
    while (match(str, re) > 0) {
        count++
        arr[count] = substr(str, RSTART, RLENGTH)
        str = substr(str, RSTART + RLENGTH)   # continue after this match
    }
    RSTART = saved_start
    RLENGTH = saved_length
    return count
}
| # | |
| # function url_entity_unescape | |
| # | |
| # given an url-escaped entity, such as %20, return its ascii counterpart. | |
| # | |
#
# function url_entity_unescape
#
# Given a url-escaped entity such as "%20", returns its ASCII counterpart.
#
function url_entity_unescape(entity) {
    # skip the leading "%" and interpret the remaining two hex digits
    return sprintf("%c", strtonum("0x" substr(entity, 2)))
}
| # | |
| # function download_video | |
| # | |
| # takes the url to video and saves the movie to current directory using | |
| # santized video title as filename | |
| # | |
#
# function download_video
#
# Takes the url to a video and saves the movie to the current directory,
# using the sanitized video title as the filename.  Follows up to 5 HTTP
# redirects before giving up.  Everything after url and title is a local.
#
function download_video(url, title, filename, InetFile, Request, Loop, HEADERS, FOO) {
    title = sanitize_title(title)
    filename = create_filename(title)
    parse_location(url, FOO)
    InetFile = FOO["InetFile"]
    Request = "GET " FOO["Request"] " HTTP/1.1\r\n"
    Request = Request "Host: " FOO["Host"] "\r\n\r\n"
    Loop = 0 # make sure we do not get caught in Location: loop
    do { # we can get more than one redirect, follow them all
        get_headers(InetFile, Request, HEADERS)
        if ("Location" in HEADERS) { # we got redirected, let's follow the link
            close(InetFile)
            parse_location(HEADERS["Location"], FOO)
            InetFile = FOO["InetFile"]
            Request = "GET " FOO["Request"] " HTTP/1.1\r\n"
            Request = Request "Host: " FOO["Host"] "\r\n\r\n"
            if (InetFile == "") {
                print "Downloading '" title "' failed, couldn't parse Location header!"
                return
            }
        }
        Loop++
    } while (("Location" in HEADERS) && Loop < 5)
    # Bug fix: the old test (Loop == 5) reported a redirect loop even when
    # the 5th request SUCCEEDED.  Only a still-pending redirect after the
    # loop exits is a real failure.
    if ("Location" in HEADERS) {
        print "Downloading '" title "' failed, got caught in Location loop!"
        return
    }
    print "Saving video to file '" filename "' (size: " bytes_to_human(HEADERS["Content-Length"]) ")..."
    save_file(InetFile, filename, HEADERS)
    close(InetFile)
    print "Successfully downloaded '" title "'!"
}
| # | |
| # function sanitize_title | |
| # | |
| # sanitizes the video title, by removing ()'s, replacing spaces with _, etc. | |
| # | |
#
# function sanitize_title
#
# Sanitizes the video title for use as a filename: quotes and parentheses
# are removed, any other non-alphanumeric run becomes a single '_', and
# stray separators around '-' and at the end of the name are cleaned up.
#
function sanitize_title(title) {
    # Bug fix: quotes must be stripped BEFORE the catch-all substitution
    # below -- previously the [^[:alnum:]-] pass had already turned them
    # into '_', so the RK quote-removal gsub was dead code.
    gsub(/'/, "", title)
    gsub(/\(|\)/, "", title)
    gsub(/[^[:alnum:]-]/, "_", title)
    gsub(/_-/, "-", title)
    gsub(/-_/, "-", title)
    # Bug fix: collapse runs first, THEN strip a trailing separator --
    # the old order left "foo__" as "foo_" instead of "foo".
    gsub(/_{2,}/, "_", title)
    gsub(/-{2,}/, "-", title)
    gsub(/_$/, "", title)
    gsub(/-$/, "", title)
    return title
}
| # | |
| # function create_filename | |
| # | |
| # given a sanitized video title, creates a nonexisting filename | |
| # | |
#
# function create_filename
#
# Given a sanitized video title, returns a filename that does not yet
# exist: "<title>.flv", then "<title>-1.flv", "<title>-2.flv", ...
#
function create_filename(title, filename, i) {
    filename = title ".flv"
    # bump a numeric suffix until we land on an unused name
    for (i = 1; file_exists(filename); i++) {
        print "file exists: " filename "!"
        filename = title "-" i ".flv"
    }
    return filename
}
| # | |
| # function save_file | |
| # | |
| # given a special network file and filename reads from network until eof | |
| # and saves the read contents into a file named filename | |
| # | |
#
# function save_file
#
# Given a network coprocess file and a filename, reads from the network
# until EOF and appends the bytes to filename, printing a progress line.
# HEADERS["Content-Length"] (may be absent) drives the percentage display.
#
function save_file(Inet, filename, HEADERS, done, cl, perc, hd, hcl) {
    # save the record/output separators we are about to abuse
    OLD_RS = RS
    OLD_ORS = ORS
    ORS = ""                      # print must not append anything to chunks
    # truncate/create the output file
    print "" > filename
    # Trick: set the record separator to a byte that occurs frequently in
    # video data; each getline then delivers "$0 RT" as one raw chunk, so
    # the download is written incrementally instead of slurped into memory.
    # RS = ".{1,512}" would read fixed blocks but used to require
    # --re-interval; 0x40 ('@') shows up every ~200 bytes in FLV data.
    RS = "@"
    cl = HEADERS["Content-Length"]
    hcl = bytes_to_human(cl)      # human-readable total, computed once
    done = 0
    perc = 0
    while ((Inet |& getline) > 0) {
        done += length($0 RT)
        # Bug fix: guard the percentage against a missing/zero
        # Content-Length header (old code divided by zero).
        if (cl > 0)
            perc = done*100/cl
        hd = bytes_to_human(done)
        # reuse hd/hcl -- the old code computed them and then called
        # bytes_to_human() again anyway
        printf "Done: %d/%d bytes (%d%%, %s/%s) \r", done, cl, perc, hd, hcl
        print $0 RT >> filename
    }
    printf "Done: %d/%d bytes (%d%%, %s/%s) \n", done, cl, perc, bytes_to_human(done), hcl
    # restore the separators for the rest of the program
    RS = OLD_RS
    ORS = OLD_ORS
}
| # | |
| # function get_headers | |
| # | |
| # given a special inet file and the request saves headers in HEADERS array | |
| # special key "_status" can be used to find HTTP response code | |
| # issuing another getline() on inet file would start returning the contents | |
| # | |
#
# function get_headers
#
# Given a network coprocess file and a request, sends the request and
# stores the response headers in HEADERS (cleared implicitly); the special
# key "_status" holds the HTTP response code.  A further getline on Inet
# returns the start of the body.  Exits the program on a dead connection.
#
function get_headers(Inet, Request, HEADERS, matches, OLD_RS) {
    delete HEADERS
    # save global vars
    OLD_RS = RS
    print Request |& Inet
    # Bug fix / consistency: parenthesize the getline.  The old
    # "Inet |& getline > 0" is ambiguous under awk's getline grammar;
    # every other call site in this file already uses the safe form.
    if ((Inet |& getline) > 0) {
        HEADERS["_status"] = $2   # "HTTP/1.1 200 OK" -> $2 is the code
    }
    else {
        print "Failed reading from the net. Quitting!"
        exit 1
    }
    RS = "\r\n"                   # header lines are CRLF terminated
    while ((Inet |& getline) > 0) {
        # we could have used FS=": " to split, but header values may
        # themselves contain ": ", so match the first colon explicitly
        if (match($0, /([^:]+): (.+)/, matches)) {
            HEADERS[matches[1]] = matches[2]
        }
        else { break }            # blank line = end of headers
    }
    RS = OLD_RS
}
| # | |
| # function parse_location | |
| # | |
| # given a Location HTTP header value the function constructs a special | |
| # inet file and the request storing them in FOO | |
| # | |
#
# function parse_location
#
# Given a Location HTTP header value, constructs the gawk special inet
# filename, the Host header value and the request path, storing them in
# FOO["InetFile"], FOO["Host"] and FOO["Request"].  All three are set to
# "" when the location cannot be parsed.
#
function parse_location(location, FOO,    matches) {
    # Bug fix: 'matches' is now a declared local; previously it leaked
    # into the global namespace.
    # location might look like http://cache.googlevideo.com/get_video?video_id=ID
    if (match(location, /http:\/\/([^\/]+)(\/.+)/, matches)) {
        FOO["InetFile"] = "/inet/tcp/0/" matches[1] "/80"
        FOO["Host"] = matches[1]
        FOO["Request"] = matches[2]
    }
    else {
        FOO["InetFile"] = ""
        FOO["Host"] = ""
        FOO["Request"] = ""
    }
}
| # function bytes_to_human | |
| # | |
| # given bytes, converts them to human readable format like 13.2mb | |
| # | |
# function bytes_to_human
#
# Given a byte count, converts it to a human readable string like "13.20mb".
# Values beyond the tb range fall back to a raw "<n> bytes" form.
#
function bytes_to_human(bytes, MAP, map_idx, bytes_copy) {
    MAP[0] = "b"
    MAP[1] = "kb"
    MAP[2] = "mb"
    MAP[3] = "gb"
    MAP[4] = "tb"
    map_idx = 0
    bytes_copy = int(bytes)
    # Bug fix: ">=" so that exactly 1024 reports "1.00kb", not "1024.00b"
    while (bytes_copy >= 1024) {
        bytes_copy /= 1024
        map_idx++
    }
    if (map_idx > 4)
        # past tb we have no unit name; report the raw count
        # (also fixed: the old sprintf passed a stray third argument here)
        return sprintf("%d bytes", bytes)
    else
        return sprintf("%.02f%s", bytes_copy, MAP[map_idx])
}
| # | |
| # function file_exists | |
| # | |
| # given a path to file, returns 1 if the file exists, or 0 if it doesn't | |
| # | |
#
# function file_exists
#
# Given a path, returns 1 if the file exists, 0 otherwise.  getline
# returns -1 when the file cannot be opened; 0 (empty file) and 1 both
# mean it is there.
#
function file_exists(file, probe) {
    if ((getline probe <file) < 0)
        return 0          # could not open -- nothing to close
    close(file)
    return 1
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment