rhardih · April 25, 2021 23:21 · rhardih · Apr 18, 2021 · rhardih · Apr 18, 2021
diff --git a/zip_file_names.rb b/zip_file_names.rb
 require 'httparty'
 require 'strscan'
 require 'uri'

 # Lists the file names stored in a remote ZIP archive without downloading
 # the whole file. Only the End of central directory record (EOCD) and the
 # central directory itself are fetched, using HTTP Range requests.
 #
 # ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
 #
 # zip_url - URL of the archive; the server must answer HEAD requests with
 #           a content-length header and honour Range requests.
 #
 # Returns an Array of file name Strings in central directory order (empty
 # when the archive has no entries).
 # Raises ArgumentError when the size is unknown, when no EOCD record is
 # found in the last 100 bytes, or when the archive comment is too long.
 # ArgumentError is used so that handlers which rescued the previous
 # uncaught `throw` (an ArgumentError subclass) keep working.
 def get_file_names(zip_url)
   # 1. Do an initial head request to figure out how big the file is from the
   # content size
   response = HTTParty.head(zip_url)
   content_length = response.headers["content-length"].to_i
   raise ArgumentError, "Missing or zero content-length for #{zip_url}" unless content_length.positive?

   # 2. Request just enough bytes from the end of the file, to get the EOCD.
   # Byte ranges are inclusive on both ends, so the last valid index is
   # content_length - 1; the start is clamped for files shorter than 100 bytes.
   tail_start = [content_length - 100, 0].max
   response = HTTParty.get(zip_url, headers: {
     'Range' => "bytes=#{tail_start}-#{content_length - 1}"
   })

   # 3. Extract the central directory byte offset, size and comment length.
   # The body is handled as raw bytes regardless of the response encoding.
   ss = StringScanner.new(response.body.to_s.b)
   unless ss.scan_until(/\x50\x4b\x05\x06/) && ss.rest_size >= 18 # EOCD signature
     raise ArgumentError, "End of central directory record not found in the last 100 bytes"
   end

   ss.pos += 12 - 4 # pos is index of first byte after a match
   # ZIP integers are little-endian: two 32 bit values followed by a 16 bit one
   cd_size, cd_offset, comment_length = ss.peek(10).unpack("V2v")

   raise ArgumentError, "Not enough bytes requested for EOCD" if comment_length > 80

   # An archive without entries has an empty central directory; there is
   # nothing to request (and an empty byte range would be invalid).
   return [] if cd_size.zero?

   # 4. Use the offset and size to request exactly the bytes that contain the
   # central directory file headers (inclusive range, hence the - 1)
   response = HTTParty.get(zip_url, headers: {
     'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
   })

   # 5. Go through each entry in the file and collect the filenames
   file_names = []
   ss = StringScanner.new(response.body.to_s.b)

   while ss.scan_until(/\x50\x4b\x01\x02/) # central directory signature
     break if ss.rest_size < 42 # truncated header, nothing more to read

     ss.pos += 28 - 4 # file name length lives at offset 28 of the header
     name_length, extra_length, entry_comment_length = ss.peek(6).unpack("v3")

     ss.pos += 18 # the file name starts at offset 46
     file_names << ss.peek(name_length)

     # Skip the variable-length fields so that a signature-like byte sequence
     # inside a name, extra field or comment is never mistaken for an entry.
     ss.pos += [name_length + extra_length + entry_comment_length, ss.rest_size].min
   end

   file_names
 end

 # Example run: pretty-print the entry names of the sample archive.
 sample_zip_url = "https://rhardih.io/wp-content/uploads/2021/04/test.zip"
 pp get_file_names(sample_zip_url)
diff --git a/zip_file_names_v2.rb b/zip_file_names_v2.rb
 require 'httparty'
 require 'uri'

 # Lists the file names stored in a remote ZIP archive without downloading
 # the whole file. Only the End of central directory record (EOCD) and the
 # central directory itself are fetched, using HTTP Range requests.
 #
 # ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
 #
 # zip_url - URL of the archive; the server must honour Range requests,
 #           including suffix ranges ("bytes=-N").
 #
 # Returns an Array of file name Strings in central directory order (empty
 # when the archive has no entries).
 # Raises ArgumentError when no EOCD record is found in the last 100 bytes
 # or when the archive comment is too long. ArgumentError is used so that
 # handlers which rescued the previous uncaught `throw` (an ArgumentError
 # subclass) keep working.
 def get_file_names(zip_url)
   # 1. Request just enough bytes from the end of the file, to get the EOCD.
   # A suffix range asks for the last 100 bytes without knowing the size.
   response = HTTParty.get(zip_url, headers: { 'Range' => "bytes=-100" })

   # 2. Extract the central directory byte offset, size and comment length.
   # The body is handled as raw bytes regardless of the response encoding.
   ss = StringScanner.new(response.body.to_s.b)
   unless ss.scan_until(/\x50\x4b\x05\x06/) && ss.rest_size >= 18 # EOCD signature
     raise ArgumentError, "End of central directory record not found in the last 100 bytes"
   end

   ss.pos += 12 - 4 # pos is index of first byte after a match
   # ZIP integers are little-endian: two 32 bit values followed by a 16 bit one
   cd_size, cd_offset, comment_length = ss.peek(10).unpack("V2v")

   raise ArgumentError, "Not enough bytes requested for EOCD" if comment_length > 80

   # An archive without entries has an empty central directory; there is
   # nothing to request (and an empty byte range would be invalid).
   return [] if cd_size.zero?

   # 3. Use the offset and size to request exactly the bytes that contain the
   # central directory file headers. Byte ranges are inclusive, hence the - 1.
   response = HTTParty.get(zip_url, headers: {
     'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
   })

   # 4. Go through each entry in the file and collect the filenames
   file_names = []
   ss = StringScanner.new(response.body.to_s.b)

   while ss.scan_until(/\x50\x4b\x01\x02/) # central directory signature
     break if ss.rest_size < 42 # truncated header, nothing more to read

     ss.pos += 28 - 4 # file name length lives at offset 28 of the header
     name_length, extra_length, entry_comment_length = ss.peek(6).unpack("v3")

     ss.pos += 18 # the file name starts at offset 46
     file_names << ss.peek(name_length)

     # Skip the variable-length fields so that a signature-like byte sequence
     # inside a name, extra field or comment is never mistaken for an entry.
     ss.pos += [name_length + extra_length + entry_comment_length, ss.rest_size].min
   end

   file_names
 end

 # Example run: pretty-print the entry names of the sample archive.
 sample_zip_url = "https://rhardih.io/wp-content/uploads/2021/04/test.zip"
 pp get_file_names(sample_zip_url)
	require 'httparty'
	require 'uri'

	# Lists the file names stored in a remote ZIP archive without downloading
	# the whole file. Only the End of central directory record (EOCD) and the
	# central directory itself are fetched, using HTTP Range requests.
	#
	# ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
	#
	# zip_url - URL of the archive; the server must answer HEAD requests with
	#           a content-length header and honour Range requests.
	#
	# Returns an Array of file name Strings in central directory order (empty
	# when the archive has no entries).
	# Raises ArgumentError when the size is unknown, when no EOCD record is
	# found in the last 100 bytes, or when the archive comment is too long.
	# ArgumentError is used so that handlers which rescued the previous
	# uncaught `throw` (an ArgumentError subclass) keep working.
	def get_file_names(zip_url)
	  # 1. Do an initial head request to figure out how big the file is from the
	  # content size
	  response = HTTParty.head(zip_url)
	  content_length = response.headers["content-length"].to_i
	  raise ArgumentError, "Missing or zero content-length for #{zip_url}" unless content_length.positive?

	  # 2. Request just enough bytes from the end of the file, to get the EOCD.
	  # Byte ranges are inclusive on both ends, so the last valid index is
	  # content_length - 1; the start is clamped for files shorter than 100 bytes.
	  tail_start = [content_length - 100, 0].max
	  response = HTTParty.get(zip_url, headers: {
	    'Range' => "bytes=#{tail_start}-#{content_length - 1}"
	  })

	  # 3. Extract the central directory byte offset, size and comment length.
	  # The body is handled as raw bytes regardless of the response encoding.
	  ss = StringScanner.new(response.body.to_s.b)
	  unless ss.scan_until(/\x50\x4b\x05\x06/) && ss.rest_size >= 18 # EOCD signature
	    raise ArgumentError, "End of central directory record not found in the last 100 bytes"
	  end

	  ss.pos += 12 - 4 # pos is index of first byte after a match
	  # ZIP integers are little-endian: two 32 bit values followed by a 16 bit one
	  cd_size, cd_offset, comment_length = ss.peek(10).unpack("V2v")

	  raise ArgumentError, "Not enough bytes requested for EOCD" if comment_length > 80

	  # An archive without entries has an empty central directory; there is
	  # nothing to request (and an empty byte range would be invalid).
	  return [] if cd_size.zero?

	  # 4. Use the offset and size to request exactly the bytes that contain the
	  # central directory file headers (inclusive range, hence the - 1)
	  response = HTTParty.get(zip_url, headers: {
	    'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
	  })

	  # 5. Go through each entry in the file and collect the filenames
	  file_names = []
	  ss = StringScanner.new(response.body.to_s.b)

	  while ss.scan_until(/\x50\x4b\x01\x02/) # central directory signature
	    break if ss.rest_size < 42 # truncated header, nothing more to read

	    ss.pos += 28 - 4 # file name length lives at offset 28 of the header
	    name_length, extra_length, entry_comment_length = ss.peek(6).unpack("v3")

	    ss.pos += 18 # the file name starts at offset 46
	    file_names << ss.peek(name_length)

	    # Skip the variable-length fields so that a signature-like byte sequence
	    # inside a name, extra field or comment is never mistaken for an entry.
	    ss.pos += [name_length + extra_length + entry_comment_length, ss.rest_size].min
	  end

	  file_names
	end

	# Example run: pretty-print the entry names of the sample archive.
	sample_zip_url = "https://rhardih.io/wp-content/uploads/2021/04/test.zip"
	pp get_file_names(sample_zip_url)