Last active
April 25, 2021 23:21
-
-
Save rhardih/278a3cc6f5534785068819159f2c2919 to your computer and use it in GitHub Desktop.
Listing the filenames of a remote ZIP archive without downloading the entire file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'httparty' | |
require 'uri' | |
# Lists the names of the entries in a remote ZIP archive without downloading
# the whole file. Only the End of central directory record (EOCD) and the
# central directory itself are fetched, using HTTP Range requests.
#
# Names are returned exactly as the bytes stored in the archive, in central
# directory order; no character-set conversion is applied.
#
# @param zip_url [String] URL of the ZIP archive
# @return [Array<String>] entry names (empty for an archive with no entries)
# @raise [RuntimeError] if the content length is unknown, the EOCD is not
#   inside the last 100 bytes of the file, or the archive uses ZIP64
def get_file_names(zip_url)
  # ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
  eocd_signature = "PK\x05\x06".b # end of central directory signature
  cd_signature = "PK\x01\x02".b   # central directory file header signature
  eocd_min_size = 22              # fixed-size part of the EOCD record
  cd_header_size = 46             # fixed-size part of a central directory header
  tail_size = 100                 # bytes fetched from the end of the file

  # 1. Do an initial head request to figure out how big the file is from the
  #    content size
  response = HTTParty.head(zip_url)
  content_length = response.headers["content-length"].to_i
  raise "Could not determine content length of #{zip_url}" unless content_length.positive?

  # 2. Request just enough bytes from the end of the file, to get the EOCD.
  #    HTTP byte ranges are inclusive, so the last valid index is
  #    content_length - 1, and the start must not go negative for small files.
  tail_start = [content_length - tail_size, 0].max
  response = HTTParty.get(zip_url, {
    headers: {
      'Range' => "bytes=#{tail_start}-#{content_length - 1}"
    }
  })

  # 3. Extract the central directory size and byte offset. The EOCD is the
  #    last record of the file, so search backwards for its signature.
  tail = response.body.to_s.b
  eocd_pos = tail.rindex(eocd_signature)
  if eocd_pos.nil? || tail.bytesize - eocd_pos < eocd_min_size
    raise "Not enough bytes requested for EOCD"
  end
  # Size and offset live at bytes 12..19 of the EOCD as little-endian uint32.
  cd_size, cd_offset = tail.byteslice(eocd_pos + 12, 8).unpack("V2")
  raise "ZIP64 archives are not supported" if [cd_size, cd_offset].include?(0xFFFFFFFF)
  return [] if cd_size.zero? # empty archive, nothing to request

  # 4. Use the offset and size to request just the bytes that contain the
  #    central directory (inclusive range, hence the - 1)
  response = HTTParty.get(zip_url, {
    headers: {
      'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
    }
  })
  directory = response.body.to_s
  # A body larger than requested means the server ignored the Range header
  # and sent the whole archive; cut the central directory out of it.
  directory = directory.byteslice(cd_offset, cd_size).to_s if directory.bytesize > cd_size

  # 5. Walk the headers one by one and collect the file names. Each header
  #    declares the length of its name, extra field and comment, so step by
  #    those lengths instead of searching for the signature, which could
  #    also occur inside an extra field or comment.
  file_names = []
  pos = 0
  while directory.byteslice(pos, 4) == cd_signature
    lengths = directory.byteslice(pos + 28, 6)
    break if lengths.nil? || lengths.bytesize < 6 # truncated header

    name_length, extra_length, comment_length = lengths.unpack("v3")
    file_name = directory.byteslice(pos + cd_header_size, name_length)
    break if file_name.nil? || file_name.bytesize < name_length # truncated name

    file_names << file_name
    pos += cd_header_size + name_length + extra_length + comment_length
  end
  file_names
end
# Demo: print the entry names of a sample archive, but only when this file is
# executed directly, so loading it from other code does not hit the network.
pp get_file_names("https://rhardih.io/wp-content/uploads/2021/04/test.zip") if __FILE__ == $PROGRAM_NAME
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'httparty' | |
require 'uri' | |
# Lists the names of the entries in a remote ZIP archive without downloading
# the whole file. Only the End of central directory record (EOCD) and the
# central directory itself are fetched, using HTTP Range requests; the file
# size does not need to be known up front because a suffix range is used.
#
# Names are returned exactly as the bytes stored in the archive, in central
# directory order; no character-set conversion is applied.
#
# @param zip_url [String] URL of the ZIP archive
# @return [Array<String>] entry names (empty for an archive with no entries)
# @raise [RuntimeError] if the EOCD is not inside the last 100 bytes of the
#   file, or the archive uses ZIP64
def get_file_names(zip_url)
  # ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
  eocd_signature = "PK\x05\x06".b # end of central directory signature
  cd_signature = "PK\x01\x02".b   # central directory file header signature
  eocd_min_size = 22              # fixed-size part of the EOCD record
  cd_header_size = 46             # fixed-size part of a central directory header

  # 1. Request just enough bytes from the end of the file, to get the EOCD.
  #    "bytes=-100" is a suffix range: the last 100 bytes of the resource.
  response = HTTParty.get(zip_url, { headers: { 'Range' => "bytes=-100" } })

  # 2. Extract the central directory size and byte offset. The EOCD is the
  #    last record of the file, so search backwards for its signature.
  tail = response.body.to_s.b
  eocd_pos = tail.rindex(eocd_signature)
  if eocd_pos.nil? || tail.bytesize - eocd_pos < eocd_min_size
    raise "Not enough bytes requested for EOCD"
  end
  # Size and offset live at bytes 12..19 of the EOCD as little-endian uint32.
  cd_size, cd_offset = tail.byteslice(eocd_pos + 12, 8).unpack("V2")
  raise "ZIP64 archives are not supported" if [cd_size, cd_offset].include?(0xFFFFFFFF)
  return [] if cd_size.zero? # empty archive, nothing to request

  # 3. Use the offset and size to request just the bytes that contain the
  #    central directory. HTTP byte ranges are inclusive, hence the - 1.
  response = HTTParty.get(zip_url, {
    headers: {
      'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
    }
  })
  directory = response.body.to_s
  # A body larger than requested means the server ignored the Range header
  # and sent the whole archive; cut the central directory out of it.
  directory = directory.byteslice(cd_offset, cd_size).to_s if directory.bytesize > cd_size

  # 4. Walk the headers one by one and collect the file names. Each header
  #    declares the length of its name, extra field and comment, so step by
  #    those lengths instead of searching for the signature, which could
  #    also occur inside an extra field or comment.
  file_names = []
  pos = 0
  while directory.byteslice(pos, 4) == cd_signature
    lengths = directory.byteslice(pos + 28, 6)
    break if lengths.nil? || lengths.bytesize < 6 # truncated header

    name_length, extra_length, comment_length = lengths.unpack("v3")
    file_name = directory.byteslice(pos + cd_header_size, name_length)
    break if file_name.nil? || file_name.bytesize < name_length # truncated name

    file_names << file_name
    pos += cd_header_size + name_length + extra_length + comment_length
  end
  file_names
end
# Demo: print the entry names of a sample archive, but only when this file is
# executed directly, so loading it from other code does not hit the network.
pp get_file_names("https://rhardih.io/wp-content/uploads/2021/04/test.zip") if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Install the required gem with:
$ gem install httparty