petermueller · May 6, 2023 05:56 · petermueller · May 6, 2023
diff --git a/http_utils_download.ex b/http_utils_download.ex
 defmodule HttpUtils.Download do
  @moduledoc """
  A module for interacting with the `MyApp.Downloadable` protocol in a web context.

  This module contains a collection of functions for common use-cases around the
  `Content-Disposition` header: building a standards-compliant header value for a downloadable
  and parsing a received header value back into its disposition type and filenames.
  """

  alias MyApp.Downloadable

  # RFC 2616 Section 2.2
  # Clarified by RFC 6266
  # ====
  # OCTET          = <any 8-bit sequence of data>
  # CHAR           = <any US-ASCII character (octets 0 - 127)>
  # UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
  # LOALPHA        = <any US-ASCII lowercase letter "a".."z">
  # ALPHA          = UPALPHA | LOALPHA
  # DIGIT          = <any US-ASCII digit "0".."9">
  # CTL            = <any US-ASCII control character
  #                 (octets 0 - 31) and DEL (127)>
  # CR             = <US-ASCII CR, carriage return (13)>
  # LF             = <US-ASCII LF, linefeed (10)>
  # SP             = <US-ASCII SP, space (32)>
  # HT             = <US-ASCII HT, horizontal-tab (9)>
  # <">            = <US-ASCII double-quote mark (34)>

  # LWS            = [CRLF] 1*( SP | HT )

  # TEXT           = <any OCTET except CTLs, but including LWS>

  # quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
  # qdtext         = <any TEXT except <">>
  # quoted-pair    = "\" CHAR
  # ====

  # Intentionally not including CRLF-prefixed LWS, or quoted-pair, as they require multi-char
  # matching, which would be better served by using an actual parser, a la NimbleParsec.

  # The spec also states TEXT as based off OCTET, but RFC 6266 explicitly suggests substituting
  # letters like "ä" (Latin Small Letter A With Diaeresis) with US-ASCII "ae", even though it is
  # a valid single-octet character (octet 228) outside of US-ASCII. Based on this we're ignoring
  # the part of the spec that says "TEXT = <any OCTET ..." and assuming they meant
  # "<any CHAR ...".
  @text_chars Enum.to_list(32..126)

  # The backslash is excluded along with the double quote: since quoted-pairs are never emitted,
  # a verbatim "\" would be read by a parser as the start of a quoted-pair and escape whatever
  # follows it (including the closing quote of the quoted-string).
  @qdtext_chars @text_chars -- [?", ?\\]
  @quoted_string_chars @qdtext_chars
  @mapset_quoted_string_strings MapSet.new(@quoted_string_chars, &List.to_string([&1]))

  # RFCs 5987 & 8187, Sections 3.2.1
  # ====
  # ext-value     = charset  "'" [ language ] "'" value-chars
  #                  ; like RFC 2231's <extended-initial-value>
  #                  ; (see [RFC2231], Section 7)

  # Parameter extension value charset
  # charset       = "UTF-8" / mime-charset

  # mime-charset  = 1*mime-charsetc
  # mime-charsetc = ALPHA / DIGIT
  #               / "!" / "#" / "$" / "%" / "&"
  #               / "+" / "-" / "^" / "_" / "`"
  #               / "{" / "}" / "~"
  #               ; as <mime-charset> in Section 2.3 of [RFC2978]
  #               ; except that the single quote is not included
  #               ; SHOULD be registered in the IANA charset registry
  # @rfc_5987_parameter_extension_custom_charset_chars '!#$%&+-^_`{}~'

  # value-chars   = *( pct-encoded / attr-char )

  # pct-encoded   = "%" HEXDIG HEXDIG
  #               ; see [RFC3986], Section 2.1

  # attr-char     = ALPHA / DIGIT
  #               / "!" / "#" / "$" / "&" / "+" / "-" / "."
  #               / "^" / "_" / "`" / "|" / "~"
  #               ; token except ( "*" / "'" / "%" )
  # ====
  @rfc_5987_parameter_extension_value_chars Enum.flat_map(
                                              [?A..?Z, ?a..?z, ?0..?9, ~c"!#$&+-.^_`|~"],
                                              &Enum.to_list/1
                                            )
  @mapset_param_ext_value_chars MapSet.new(@rfc_5987_parameter_extension_value_chars)

  # Types
  @typep disposition_atom() :: :inline | :attachment
  @type disposition() :: disposition_atom() | String.t()

  # Public Functions

  @doc false
  @spec content_disposition(disposition()) :: String.t()
  def content_disposition(disposition) do
    disposition
    |> disposition_type()
    |> to_string()
  end

  @doc ~S"""
  Formats the given options to a standards-compliant `Content-Disposition` string.

  Raises `ArgumentError` if given a string that is not (case-insensitively) `"inline"` or
  `"attachment"`. Atoms other than `:inline` and `:attachment` match no clause of
  `disposition_type/1`.

  See the implementation comments for more context into the RFCs, and specific characters left
  unescaped.

  ## Arguments
    * `downloadable` - A `t:MyApp.Downloadable.t/0` that is used to gather the base filename and
      extension, which are joined with a `.`. The result is percent-encoded for the
      "filename*=" header parameter. For the "filename=" legacy header parameter, every
      codepoint that is not a printable US-ASCII character, as well as `"` and `\`, is replaced
      with `_` to support older browsers. Omit this argument (arity 1) to get only the
      disposition type.
    * `disposition` - The disposition type to use


  ## Examples

      iex> Download.content_disposition(:inline)
      "inline"

      iex> Download.content_disposition(:attachment)
      "attachment"

      iex> Download.content_disposition(%MyApp.ZipPdfDownload{filename: "kittens"}, "inline")
      "inline; filename=\"kittens.zip\"; filename*=UTF-8''kittens.zip"

      iex> Download.content_disposition(%MyApp.ZipPdfDownload{filename: "kïttéñs"}, :attachment)
      "attachment; filename=\"k_tt__s.zip\"; filename*=UTF-8''k%C3%AFtt%C3%A9%C3%B1s.zip"

      iex> Download.content_disposition("form-data")
      ** (ArgumentError) form-data unsupported, use `Plug.Parsers.MULTIPART`

      iex> Download.content_disposition("filename=\"myfile.txt\"")
      ** (ArgumentError) invalid disposition type: `"filename=\"myfile.txt\""`, use `:inline` or `:attachment`

  """
  @spec content_disposition(Downloadable.t(), disposition()) :: String.t()
  def content_disposition(downloadable, disposition) do
    disposition = disposition_type(disposition)
    filename = Downloadable.filename(downloadable) <> "." <> Downloadable.extension(downloadable)

    # `Enum.join/2` stringifies the disposition atom along with the two filename parameters.
    Enum.join([disposition, ascii_filename(filename), utf8_filename(filename)], "; ")
  end

  @doc false
  @spec disposition_type(disposition()) :: disposition_atom() | no_return()
  def disposition_type(:inline), do: :inline
  def disposition_type(:attachment), do: :attachment

  def disposition_type(disposition) when is_binary(disposition) do
    case String.downcase(disposition) do
      "attachment" ->
        :attachment

      "inline" ->
        :inline

      "form-data" ->
        raise(ArgumentError, "form-data unsupported, use `Plug.Parsers.MULTIPART`")

      _ ->
        raise(
          ArgumentError,
          "invalid disposition type: `#{inspect(disposition)}`, use `:inline` or `:attachment`"
        )
    end
  end

  @doc false
  @spec utf8_filename(String.t()) :: String.t()
  def utf8_filename(filename) do
    # Every byte that is not an attr-char gets percent-encoded.
    encoded = URI.encode(filename, &MapSet.member?(@mapset_param_ext_value_chars, &1))

    "filename*=UTF-8''#{encoded}"
  end

  @doc false
  @spec ascii_filename(String.t()) :: String.t()
  def ascii_filename(filename), do: "filename=\"#{to_ascii(filename)}\""

  @doc false
  @spec to_ascii(String.t()) :: String.t()
  def to_ascii(utf8) do
    # Works per codepoint, so a multi-byte character collapses to a single "_".
    for codepoint <- String.codepoints(utf8), into: "" do
      if MapSet.member?(@mapset_quoted_string_strings, codepoint), do: codepoint, else: "_"
    end
  end

  @typedoc """
  Result of a parsed Content-Disposition header
  """
  @type parsed_content_disposition() :: %{
          optional(:filename_utf8) => String.t(),
          optional(:legacy_filename) => String.t(),
          required(:disposition) => disposition_atom()
        }

  @doc ~S"""
  Parses a [Content-Disposition](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition)
  based upon the IETF RFCs, with some room for incorrect encoding from the sender.

  Returns a `t:parsed_content_disposition/0` map.
  Raises if given an unsupported disposition type.

  ## Important Notes on the `t:parsed_content_disposition/0` result
  - only the `:disposition` key will always be present.
  - when consuming, it is HIGHLY suggested to use the `:filename_utf8` field if present
  - `:filename_utf8` is only set when the `filename*` parameter declares the UTF-8 charset and
    percent-decodes to a valid UTF-8 string.
  - `:legacy_filename` is intentionally NOT decoded.
  - No path cleanup is done. Treat the values as unsafe, like any other external input

  See the implementation comments for more context into the RFCs, and specific characters.

  ## Examples

      iex> Download.parse_content_disposition("inline")
      %{disposition: :inline}

      iex> Download.parse_content_disposition("attachment")
      %{disposition: :attachment}

      iex> Download.parse_content_disposition("attachment; filename*=UTF-8''k%C3%AFtt%C3%A9%C3%B1.jpg")
      %{filename_utf8: "kïttéñ.jpg", disposition: :attachment}

      iex> Download.parse_content_disposition("inline; filename=\"kitten.jpg\"; filename*=UTF-8''kitten.jpg")
      %{filename_utf8: "kitten.jpg", legacy_filename: "kitten.jpg", disposition: :inline}

      # Doesn't try to decode legacy filenames
      iex> Download.parse_content_disposition("attachment; filename=\"k%3Ftt%3F%3F.jpg\"")
      %{legacy_filename: "k%3Ftt%3F%3F.jpg", disposition: :attachment}

  """
  @spec parse_content_disposition(String.t()) :: parsed_content_disposition()
  def parse_content_disposition(header_value) do
    # Split once: the head is the disposition type, the (optional) tail holds all parameters.
    [raw_disposition | rest] = :binary.split(header_value, ";")

    # The type may be surrounded by optional whitespace ("attachment ; filename=..."), so trim
    # before validating it.
    disposition = raw_disposition |> String.trim() |> disposition_type()

    params = rest |> List.first("") |> Plug.Conn.Utils.params()

    %{disposition: disposition}
    |> put_present(:legacy_filename, params["filename"])
    |> put_present(:filename_utf8, decode_filename_star(params["filename*"]))
  end

  # Private functions

  # Adds `key` to `map` only when a value was actually found.
  defp put_present(map, _key, nil), do: map
  defp put_present(map, key, value), do: Map.put(map, key, value)

  # Percent-decodes a raw `filename*` value. Returns `nil` when the parameter is absent, is not
  # in the `UTF-8'lang'value` form, or does not decode to a valid UTF-8 string.
  defp decode_filename_star(nil), do: nil

  defp decode_filename_star(raw_filename_star) do
    with {:ok, encoded} <-
           strip_utf8_str_and_language_tag_from_parameter_extension(raw_filename_star),
         decoded = URI.decode(encoded),
         true <- String.valid?(decoded) do
      decoded
    else
      _ -> nil
    end
  end

  # Extracts the value-chars part of an `ext-value` whose charset is UTF-8 (case-insensitive).
  #
  # The language tag is matched as "anything but a single quote" so that the value starts right
  # after the SECOND quote. This stays forgiving of senders that leave a literal "'" unencoded
  # in the filename, without swallowing the start of that filename into the language tag.
  defp strip_utf8_str_and_language_tag_from_parameter_extension(raw_string) do
    pattern = ~r/^utf-8'(?<lang_tag>[^']*)'(?<filename>.*)/i

    case Regex.named_captures(pattern, String.trim(raw_string)) do
      nil -> {:error, "failed to extract the filename* parameter"}
      %{"filename" => ""} -> {:error, "filename* parameter empty"}
      %{"filename" => filename} -> {:ok, filename}
    end
  end
 end
	defmodule HttpUtils.Download do
  @moduledoc """
  A module for interacting with the `MyApp.Downloadable` protocol in a web context.

  This module contains a collection of functions for common use-cases around the
  `Content-Disposition` header: building a standards-compliant header value for a downloadable
  and parsing a received header value back into its disposition type and filenames.
  """

  alias MyApp.Downloadable

  # RFC 2616 Section 2.2
  # Clarified by RFC 6266
  # ====
  # OCTET          = <any 8-bit sequence of data>
  # CHAR           = <any US-ASCII character (octets 0 - 127)>
  # UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
  # LOALPHA        = <any US-ASCII lowercase letter "a".."z">
  # ALPHA          = UPALPHA | LOALPHA
  # DIGIT          = <any US-ASCII digit "0".."9">
  # CTL            = <any US-ASCII control character
  #                 (octets 0 - 31) and DEL (127)>
  # CR             = <US-ASCII CR, carriage return (13)>
  # LF             = <US-ASCII LF, linefeed (10)>
  # SP             = <US-ASCII SP, space (32)>
  # HT             = <US-ASCII HT, horizontal-tab (9)>
  # <">            = <US-ASCII double-quote mark (34)>

  # LWS            = [CRLF] 1*( SP | HT )

  # TEXT           = <any OCTET except CTLs, but including LWS>

  # quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
  # qdtext         = <any TEXT except <">>
  # quoted-pair    = "\" CHAR
  # ====

  # Intentionally not including CRLF-prefixed LWS, or quoted-pair, as they require multi-char
  # matching, which would be better served by using an actual parser, a la NimbleParsec.

  # The spec also states TEXT as based off OCTET, but RFC 6266 explicitly suggests substituting
  # letters like "ä" (Latin Small Letter A With Diaeresis) with US-ASCII "ae", even though it is
  # a valid single-octet character (octet 228) outside of US-ASCII. Based on this we're ignoring
  # the part of the spec that says "TEXT = <any OCTET ..." and assuming they meant
  # "<any CHAR ...".
  @text_chars Enum.to_list(32..126)

  # The backslash is excluded along with the double quote: since quoted-pairs are never emitted,
  # a verbatim "\" would be read by a parser as the start of a quoted-pair and escape whatever
  # follows it (including the closing quote of the quoted-string).
  @qdtext_chars @text_chars -- [?", ?\\]
  @quoted_string_chars @qdtext_chars
  @mapset_quoted_string_strings MapSet.new(@quoted_string_chars, &List.to_string([&1]))

  # RFCs 5987 & 8187, Sections 3.2.1
  # ====
  # ext-value     = charset  "'" [ language ] "'" value-chars
  #                  ; like RFC 2231's <extended-initial-value>
  #                  ; (see [RFC2231], Section 7)

  # Parameter extension value charset
  # charset       = "UTF-8" / mime-charset

  # mime-charset  = 1*mime-charsetc
  # mime-charsetc = ALPHA / DIGIT
  #               / "!" / "#" / "$" / "%" / "&"
  #               / "+" / "-" / "^" / "_" / "`"
  #               / "{" / "}" / "~"
  #               ; as <mime-charset> in Section 2.3 of [RFC2978]
  #               ; except that the single quote is not included
  #               ; SHOULD be registered in the IANA charset registry
  # @rfc_5987_parameter_extension_custom_charset_chars '!#$%&+-^_`{}~'

  # value-chars   = *( pct-encoded / attr-char )

  # pct-encoded   = "%" HEXDIG HEXDIG
  #               ; see [RFC3986], Section 2.1

  # attr-char     = ALPHA / DIGIT
  #               / "!" / "#" / "$" / "&" / "+" / "-" / "."
  #               / "^" / "_" / "`" / "|" / "~"
  #               ; token except ( "*" / "'" / "%" )
  # ====
  @rfc_5987_parameter_extension_value_chars Enum.flat_map(
                                              [?A..?Z, ?a..?z, ?0..?9, ~c"!#$&+-.^_`|~"],
                                              &Enum.to_list/1
                                            )
  @mapset_param_ext_value_chars MapSet.new(@rfc_5987_parameter_extension_value_chars)

  # Types
  @typep disposition_atom() :: :inline | :attachment
  @type disposition() :: disposition_atom() | String.t()

  # Public Functions

  @doc false
  @spec content_disposition(disposition()) :: String.t()
  def content_disposition(disposition) do
    disposition
    |> disposition_type()
    |> to_string()
  end

  @doc ~S"""
  Formats the given options to a standards-compliant `Content-Disposition` string.

  Raises `ArgumentError` if given a string that is not (case-insensitively) `"inline"` or
  `"attachment"`. Atoms other than `:inline` and `:attachment` match no clause of
  `disposition_type/1`.

  See the implementation comments for more context into the RFCs, and specific characters left
  unescaped.

  ## Arguments
    * `downloadable` - A `t:MyApp.Downloadable.t/0` that is used to gather the base filename and
      extension, which are joined with a `.`. The result is percent-encoded for the
      "filename*=" header parameter. For the "filename=" legacy header parameter, every
      codepoint that is not a printable US-ASCII character, as well as `"` and `\`, is replaced
      with `_` to support older browsers. Omit this argument (arity 1) to get only the
      disposition type.
    * `disposition` - The disposition type to use


  ## Examples

      iex> Download.content_disposition(:inline)
      "inline"

      iex> Download.content_disposition(:attachment)
      "attachment"

      iex> Download.content_disposition(%MyApp.ZipPdfDownload{filename: "kittens"}, "inline")
      "inline; filename=\"kittens.zip\"; filename*=UTF-8''kittens.zip"

      iex> Download.content_disposition(%MyApp.ZipPdfDownload{filename: "kïttéñs"}, :attachment)
      "attachment; filename=\"k_tt__s.zip\"; filename*=UTF-8''k%C3%AFtt%C3%A9%C3%B1s.zip"

      iex> Download.content_disposition("form-data")
      ** (ArgumentError) form-data unsupported, use `Plug.Parsers.MULTIPART`

      iex> Download.content_disposition("filename=\"myfile.txt\"")
      ** (ArgumentError) invalid disposition type: `"filename=\"myfile.txt\""`, use `:inline` or `:attachment`

  """
  @spec content_disposition(Downloadable.t(), disposition()) :: String.t()
  def content_disposition(downloadable, disposition) do
    disposition = disposition_type(disposition)
    filename = Downloadable.filename(downloadable) <> "." <> Downloadable.extension(downloadable)

    # `Enum.join/2` stringifies the disposition atom along with the two filename parameters.
    Enum.join([disposition, ascii_filename(filename), utf8_filename(filename)], "; ")
  end

  @doc false
  @spec disposition_type(disposition()) :: disposition_atom() | no_return()
  def disposition_type(:inline), do: :inline
  def disposition_type(:attachment), do: :attachment

  def disposition_type(disposition) when is_binary(disposition) do
    case String.downcase(disposition) do
      "attachment" ->
        :attachment

      "inline" ->
        :inline

      "form-data" ->
        raise(ArgumentError, "form-data unsupported, use `Plug.Parsers.MULTIPART`")

      _ ->
        raise(
          ArgumentError,
          "invalid disposition type: `#{inspect(disposition)}`, use `:inline` or `:attachment`"
        )
    end
  end

  @doc false
  @spec utf8_filename(String.t()) :: String.t()
  def utf8_filename(filename) do
    # Every byte that is not an attr-char gets percent-encoded.
    encoded = URI.encode(filename, &MapSet.member?(@mapset_param_ext_value_chars, &1))

    "filename*=UTF-8''#{encoded}"
  end

  @doc false
  @spec ascii_filename(String.t()) :: String.t()
  def ascii_filename(filename), do: "filename=\"#{to_ascii(filename)}\""

  @doc false
  @spec to_ascii(String.t()) :: String.t()
  def to_ascii(utf8) do
    # Works per codepoint, so a multi-byte character collapses to a single "_".
    for codepoint <- String.codepoints(utf8), into: "" do
      if MapSet.member?(@mapset_quoted_string_strings, codepoint), do: codepoint, else: "_"
    end
  end

  @typedoc """
  Result of a parsed Content-Disposition header
  """
  @type parsed_content_disposition() :: %{
          optional(:filename_utf8) => String.t(),
          optional(:legacy_filename) => String.t(),
          required(:disposition) => disposition_atom()
        }

  @doc ~S"""
  Parses a [Content-Disposition](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition)
  based upon the IETF RFCs, with some room for incorrect encoding from the sender.

  Returns a `t:parsed_content_disposition/0` map.
  Raises if given an unsupported disposition type.

  ## Important Notes on the `t:parsed_content_disposition/0` result
  - only the `:disposition` key will always be present.
  - when consuming, it is HIGHLY suggested to use the `:filename_utf8` field if present
  - `:filename_utf8` is only set when the `filename*` parameter declares the UTF-8 charset and
    percent-decodes to a valid UTF-8 string.
  - `:legacy_filename` is intentionally NOT decoded.
  - No path cleanup is done. Treat the values as unsafe, like any other external input

  See the implementation comments for more context into the RFCs, and specific characters.

  ## Examples

      iex> Download.parse_content_disposition("inline")
      %{disposition: :inline}

      iex> Download.parse_content_disposition("attachment")
      %{disposition: :attachment}

      iex> Download.parse_content_disposition("attachment; filename*=UTF-8''k%C3%AFtt%C3%A9%C3%B1.jpg")
      %{filename_utf8: "kïttéñ.jpg", disposition: :attachment}

      iex> Download.parse_content_disposition("inline; filename=\"kitten.jpg\"; filename*=UTF-8''kitten.jpg")
      %{filename_utf8: "kitten.jpg", legacy_filename: "kitten.jpg", disposition: :inline}

      # Doesn't try to decode legacy filenames
      iex> Download.parse_content_disposition("attachment; filename=\"k%3Ftt%3F%3F.jpg\"")
      %{legacy_filename: "k%3Ftt%3F%3F.jpg", disposition: :attachment}

  """
  @spec parse_content_disposition(String.t()) :: parsed_content_disposition()
  def parse_content_disposition(header_value) do
    # Split once: the head is the disposition type, the (optional) tail holds all parameters.
    [raw_disposition | rest] = :binary.split(header_value, ";")

    # The type may be surrounded by optional whitespace ("attachment ; filename=..."), so trim
    # before validating it.
    disposition = raw_disposition |> String.trim() |> disposition_type()

    params = rest |> List.first("") |> Plug.Conn.Utils.params()

    %{disposition: disposition}
    |> put_present(:legacy_filename, params["filename"])
    |> put_present(:filename_utf8, decode_filename_star(params["filename*"]))
  end

  # Private functions

  # Adds `key` to `map` only when a value was actually found.
  defp put_present(map, _key, nil), do: map
  defp put_present(map, key, value), do: Map.put(map, key, value)

  # Percent-decodes a raw `filename*` value. Returns `nil` when the parameter is absent, is not
  # in the `UTF-8'lang'value` form, or does not decode to a valid UTF-8 string.
  defp decode_filename_star(nil), do: nil

  defp decode_filename_star(raw_filename_star) do
    with {:ok, encoded} <-
           strip_utf8_str_and_language_tag_from_parameter_extension(raw_filename_star),
         decoded = URI.decode(encoded),
         true <- String.valid?(decoded) do
      decoded
    else
      _ -> nil
    end
  end

  # Extracts the value-chars part of an `ext-value` whose charset is UTF-8 (case-insensitive).
  #
  # The language tag is matched as "anything but a single quote" so that the value starts right
  # after the SECOND quote. This stays forgiving of senders that leave a literal "'" unencoded
  # in the filename, without swallowing the start of that filename into the language tag.
  defp strip_utf8_str_and_language_tag_from_parameter_extension(raw_string) do
    pattern = ~r/^utf-8'(?<lang_tag>[^']*)'(?<filename>.*)/i

    case Regex.named_captures(pattern, String.trim(raw_string)) do
      nil -> {:error, "failed to extract the filename* parameter"}
      %{"filename" => ""} -> {:error, "filename* parameter empty"}
      %{"filename" => filename} -> {:ok, filename}
    end
  end
	end