olebedev · May 22, 2023 05:57
diff --git a/parse-yarn-lock.nix b/parse-yarn-lock.nix
 # Parse a yarn.lock file using pure Nix
 # yarn.lock v1 files are basically YAML with support for having multiple keys for a single value in a map and without array support.
 # Inspired by https://github.com/yarnpkg/yarn/blob/158d96dce95313d9a00218302631cd263877d164/src/lockfile/parse.js

 with builtins;
 let
  # Pair every element of `list` with its zero-based position:
  #   [ a b ] -> [ { i = 0; e = a; } { i = 1; e = b; } ]
  enumerate = list:
    let
      # Build the { i, e } record for position `i`.
      withIndex = i: { inherit i; e = elemAt list i; };
    in
      genList withIndex (length list);
  # Build a token record: `type` is the token kind (a string), `value` its payload.
  mkToken = type: value: { type = type; value = value; };
  # Parse the text of a yarn.lock v1 file into a nested attribute set.
  #
  # str: the complete lockfile contents as one string.
  #
  # Evaluates to an attrset with one attribute per key found in the file.
  # A key followed by a colon maps to a nested attrset, any other key maps to
  # the string/number/boolean that follows it, and a value introduced by
  # several comma-separated keys is stored under each of those keys.
  #
  # Aborts when the input contains text no token pattern matches, when a token
  # appears where it is not allowed, or when a "yarn lockfile vN" header
  # comment names a version other than 1.
  parseLockfile = str: let
    # A Regex that tokenizes a yarn lockfile
    # I've split up the regex in the various token types
    newlineRe = "(\r?\n)";
    commentRe = "#([^\n]+)";
    # Used for any kind of whitespace and also indentation in an object
    indentRe = "( +)";
    # Note that this contains a group for repetition, so the next group is offset.
    # This is a regex that matches JSON strings, which is the format used.
    stringRe = "(\"([^\"\\\\]|\\\\[\\\"\\\\/bfnrt]|\\\\u[0-9a-f]{4})+\")";
    numberRe = "([0-9]+)";
    booleanRe = "(true|false)";
    colonRe = "(:)";
    commaRe = "(,)";
    # A symbol is a string without quotes.
    # The tail uses `*` rather than `+` so that a one-character symbol (for
    # example a dependency that is simply called `q`) is still a valid token.
    symbolRe = "([a-zA-Z\\/.-][^: \n\r,]*)";
    tokenizeRe = "${newlineRe}|${commentRe}|${indentRe}|${stringRe}|${numberRe}|${booleanRe}|${colonRe}|${commaRe}|${symbolRe}";

    tokenize = split tokenizeRe;

    # Turn one element of the `split` result into a token.
    # `split` yields a plain string for text that no alternative matched and a
    # list of capture groups (null for groups that did not take part) for each
    # match, so a string reaching this function is unrecognised input.
    # Group indices follow the order of the alternatives in tokenizeRe; index 4
    # is the inner repetition group of stringRe and is therefore skipped.
    convert = token: if isString token then abort "Invalid token ${token}"
    else if (elemAt token 0) != null then
      mkToken "newline" null
    else if (elemAt token 1) != null then
      mkToken "comment" (elemAt token 1)
    else if (elemAt token 2) != null then
      # The value of an indent token is its width in spaces
      mkToken "indent" (stringLength (elemAt token 2))
    else if (elemAt token 3) != null then
      # Quoted strings are JSON strings, fromJSON handles the escapes
      mkToken "string" (fromJSON (elemAt token 3))
    else if (elemAt token 5) != null then
      mkToken "number" (fromJSON (elemAt token 5))
    else if (elemAt token 6) != null then
      # The comparison must be parenthesised: function application binds
      # tighter than `==`, otherwise the token itself is compared to "true".
      mkToken "boolean" ((elemAt token 6) == "true")
    else if (elemAt token 7) != null then
      mkToken "colon" null
    else if (elemAt token 8) != null then
      mkToken "comma" null
    else if (elemAt token 9) != null then
      mkToken "string" (elemAt token 9)
    else abort "unreachable";
    # `split` puts an empty string between two adjacent matches; drop those.
    unprocessedTokens = map convert (filter (e: e != "") (tokenize str));

    # Decide whether the token `e` at position `i` is kept for the parser.
    keepToken = { i, e }:
      if e.type == "comment" then
        # Check if this is the right version lockfile
        if (match "[[:space:]]*yarn lockfile v[0-9]+[[:space:]]*" e.value) != null && (match "[[:space:]]*yarn lockfile v1[[:space:]]*" e.value) == null
        then abort "Unsupported lockfile: ${e.value}"
        else false
      else
        # Spaces only count as indentation directly after a newline. The
        # `i == 0` test keeps us from looking at index -1 when the input
        # starts with spaces; such leading spaces are dropped as well.
        !(e.type == "indent" && (i == 0 || (elemAt unprocessedTokens (i - 1)).type != "newline"));

    # Filter out comments, and spaces that don't follow a newline
    tokens = map ({ i, e }: e) (filter keepToken (enumerate unprocessedTokens));

    # Token at `index`, or an "eof" marker (without a value) past the end
    get = index: if index < length tokens then elemAt tokens index else { type = "eof"; };

    # Take one or more keys interspersed with commas
    takeKeys = index: [(get index).value] ++ (if (get (index + 1)).type == "comma" && (get (index + 2)).type == "string" then takeKeys (index + 2) else []);

    # Consume tokens for a single object
    # `start` is the index of the first token, `indent` the width in spaces
    # that lines of this object are indented by.
    # Returns 'value' for the object and 'index' for how far we iterated
    parse = start: indent:
    let
      # genericClosure is used here to iterate over the tokens in a non-recursive way,
      # which would be too slow for the Nix language.
      # We can't use fold because we need to recurse into nested maps and skip over
      # the tokens that were consumed.
      # Every item is a position in the token list (`key`) plus the name/value
      # pairs produced by the step that led to it (`values`). The operator
      # returns at most one successor, so the closure is a linear walk.
      result = genericClosure {
        startSet = [ { key = start; values = []; } ];
        operator = { key, ... }:
        let
          token = get key;
          nextToken = get (key + 1);
          # No successor: this object ends here
          done = [];
          # Step over the current token without producing anything
          next = [{ key = key + 1; values = []; }];
        in
          if token.type == "eof" then done
          else if token.type == "newline" then
            if indent == 0 then
              next
            # In a nested object a line that is not indented by exactly
            # `indent` spaces belongs to an enclosing object
            else if nextToken.type != "indent" || nextToken.value != indent then
              done
            else next
          else if token.type == "indent" then
            if token.value == indent then next else done
          # String means this is a key value pair
          else if token.type == "string" then
            let
              keys = takeKeys key;
              # Number of tokens taken up by the keys and the commas between them
              skip = 1 + ((length keys) - 1) * 2;
              valueToken = get (key + skip);
            in
              # If the key is followed by a colon then this is a nested object
              if valueToken.type == "colon" then
                let
                  # Parse the nested object
                  res = parse (key + skip + 1) (indent + 2);
                  inherit (res) value index;
                in
                  [{
                    # Continue where the nested object stopped
                    key = index;
                    values = map (name: { inherit name value; }) keys;
                  }]
              # The only valid values
              else if (valueToken.type == "string" || valueToken.type == "number" || valueToken.type == "boolean") then
                [{
                  key = (key + skip + 1);
                  values = map (name: { inherit name; value = valueToken.value; }) keys;
                }]
              else abort "Invalid token ${valueToken.type}"
          else abort "Invalid token ${token.type}"
        ;
      };
      results = concatLists (map (el: el.values) result);
    in
      {
        value = listToAttrs results;
        # The last item is the position at which the walk stopped; this relies
        # on genericClosure returning the items in the order they were reached.
        index = (elemAt result ((length result) - 1)).key;
      };
  in
    (parse 0 0).value;
 in
  # Result of the whole expression: the parsed form of the yarn.lock file
  # referenced by the relative path ./yarn.lock.
  parseLockfile (readFile ./yarn.lock)
	# Parse a yarn.lock file using pure Nix
	# yarn.lock v1 files are basically YAML with support for having multiple keys for a single value in a map and without array support.
	# Inspired by https://github.com/yarnpkg/yarn/blob/158d96dce95313d9a00218302631cd263877d164/src/lockfile/parse.js

	with builtins;
	let
	# Pair every element of `list` with its zero-based position:
	#   [ a b ] -> [ { i = 0; e = a; } { i = 1; e = b; } ]
	enumerate = list:
	  let
	    # Build the { i, e } record for position `i`.
	    withIndex = i: { inherit i; e = elemAt list i; };
	  in
	    genList withIndex (length list);
	# Build a token record: `type` is the token kind (a string), `value` its payload.
	mkToken = type: value: { type = type; value = value; };
	# Parse the text of a yarn.lock v1 file into a nested attribute set.
	#
	# str: the complete lockfile contents as one string.
	#
	# Evaluates to an attrset with one attribute per key found in the file.
	# A key followed by a colon maps to a nested attrset, any other key maps to
	# the string/number/boolean that follows it, and a value introduced by
	# several comma-separated keys is stored under each of those keys.
	#
	# Aborts when the input contains text no token pattern matches, when a token
	# appears where it is not allowed, or when a "yarn lockfile vN" header
	# comment names a version other than 1.
	parseLockfile = str: let
	  # A Regex that tokenizes a yarn lockfile
	  # I've split up the regex in the various token types
	  newlineRe = "(\r?\n)";
	  commentRe = "#([^\n]+)";
	  # Used for any kind of whitespace and also indentation in an object
	  indentRe = "( +)";
	  # Note that this contains a group for repetition, so the next group is offset.
	  # This is a regex that matches JSON strings, which is the format used.
	  stringRe = "(\"([^\"\\\\]|\\\\[\\\"\\\\/bfnrt]|\\\\u[0-9a-f]{4})+\")";
	  numberRe = "([0-9]+)";
	  booleanRe = "(true|false)";
	  colonRe = "(:)";
	  commaRe = "(,)";
	  # A symbol is a string without quotes.
	  # The tail uses `*` rather than `+` so that a one-character symbol (for
	  # example a dependency that is simply called `q`) is still a valid token.
	  symbolRe = "([a-zA-Z\\/.-][^: \n\r,]*)";
	  tokenizeRe = "${newlineRe}|${commentRe}|${indentRe}|${stringRe}|${numberRe}|${booleanRe}|${colonRe}|${commaRe}|${symbolRe}";

	  tokenize = split tokenizeRe;

	  # Turn one element of the `split` result into a token.
	  # `split` yields a plain string for text that no alternative matched and a
	  # list of capture groups (null for groups that did not take part) for each
	  # match, so a string reaching this function is unrecognised input.
	  # Group indices follow the order of the alternatives in tokenizeRe; index 4
	  # is the inner repetition group of stringRe and is therefore skipped.
	  convert = token: if isString token then abort "Invalid token ${token}"
	  else if (elemAt token 0) != null then
	    mkToken "newline" null
	  else if (elemAt token 1) != null then
	    mkToken "comment" (elemAt token 1)
	  else if (elemAt token 2) != null then
	    # The value of an indent token is its width in spaces
	    mkToken "indent" (stringLength (elemAt token 2))
	  else if (elemAt token 3) != null then
	    # Quoted strings are JSON strings, fromJSON handles the escapes
	    mkToken "string" (fromJSON (elemAt token 3))
	  else if (elemAt token 5) != null then
	    mkToken "number" (fromJSON (elemAt token 5))
	  else if (elemAt token 6) != null then
	    # The comparison must be parenthesised: function application binds
	    # tighter than `==`, otherwise the token itself is compared to "true".
	    mkToken "boolean" ((elemAt token 6) == "true")
	  else if (elemAt token 7) != null then
	    mkToken "colon" null
	  else if (elemAt token 8) != null then
	    mkToken "comma" null
	  else if (elemAt token 9) != null then
	    mkToken "string" (elemAt token 9)
	  else abort "unreachable";
	  # `split` puts an empty string between two adjacent matches; drop those.
	  unprocessedTokens = map convert (filter (e: e != "") (tokenize str));

	  # Decide whether the token `e` at position `i` is kept for the parser.
	  keepToken = { i, e }:
	    if e.type == "comment" then
	      # Check if this is the right version lockfile.
	      # The `*` quantifiers let the header match with any amount of
	      # surrounding whitespace, including none.
	      if (match "[[:space:]]*yarn lockfile v[0-9]+[[:space:]]*" e.value) != null && (match "[[:space:]]*yarn lockfile v1[[:space:]]*" e.value) == null
	      then abort "Unsupported lockfile: ${e.value}"
	      else false
	    else
	      # Spaces only count as indentation directly after a newline. The
	      # `i == 0` test keeps us from looking at index -1 when the input
	      # starts with spaces; such leading spaces are dropped as well.
	      !(e.type == "indent" && (i == 0 || (elemAt unprocessedTokens (i - 1)).type != "newline"));

	  # Filter out comments, and spaces that don't follow a newline
	  tokens = map ({ i, e }: e) (filter keepToken (enumerate unprocessedTokens));

	  # Token at `index`, or an "eof" marker (without a value) past the end
	  get = index: if index < length tokens then elemAt tokens index else { type = "eof"; };

	  # Take one or more keys interspersed with commas
	  takeKeys = index: [(get index).value] ++ (if (get (index + 1)).type == "comma" && (get (index + 2)).type == "string" then takeKeys (index + 2) else []);

	  # Consume tokens for a single object
	  # `start` is the index of the first token, `indent` the width in spaces
	  # that lines of this object are indented by.
	  # Returns 'value' for the object and 'index' for how far we iterated
	  parse = start: indent:
	  let
	    # genericClosure is used here to iterate over the tokens in a non-recursive way,
	    # which would be too slow for the Nix language.
	    # We can't use fold because we need to recurse into nested maps and skip over
	    # the tokens that were consumed.
	    # Every item is a position in the token list (`key`) plus the name/value
	    # pairs produced by the step that led to it (`values`). The operator
	    # returns at most one successor, so the closure is a linear walk.
	    result = genericClosure {
	      startSet = [ { key = start; values = []; } ];
	      operator = { key, ... }:
	      let
	        token = get key;
	        nextToken = get (key + 1);
	        # No successor: this object ends here
	        done = [];
	        # Step over the current token without producing anything
	        next = [{ key = key + 1; values = []; }];
	      in
	        if token.type == "eof" then done
	        else if token.type == "newline" then
	          if indent == 0 then
	            next
	          # In a nested object a line that is not indented by exactly
	          # `indent` spaces belongs to an enclosing object
	          else if nextToken.type != "indent" || nextToken.value != indent then
	            done
	          else next
	        else if token.type == "indent" then
	          if token.value == indent then next else done
	        # String means this is a key value pair
	        else if token.type == "string" then
	          let
	            keys = takeKeys key;
	            # Number of tokens taken up by the keys and the commas between them
	            skip = 1 + ((length keys) - 1) * 2;
	            valueToken = get (key + skip);
	          in
	            # If the key is followed by a colon then this is a nested object
	            if valueToken.type == "colon" then
	              let
	                # Parse the nested object
	                res = parse (key + skip + 1) (indent + 2);
	                inherit (res) value index;
	              in
	                [{
	                  # Continue where the nested object stopped
	                  key = index;
	                  values = map (name: { inherit name value; }) keys;
	                }]
	            # The only valid values
	            else if (valueToken.type == "string" || valueToken.type == "number" || valueToken.type == "boolean") then
	              [{
	                key = (key + skip + 1);
	                values = map (name: { inherit name; value = valueToken.value; }) keys;
	              }]
	            else abort "Invalid token ${valueToken.type}"
	        else abort "Invalid token ${token.type}"
	      ;
	    };
	    results = concatLists (map (el: el.values) result);
	  in
	    {
	      value = listToAttrs results;
	      # The last item is the position at which the walk stopped; this relies
	      # on genericClosure returning the items in the order they were reached.
	      index = (elemAt result ((length result) - 1)).key;
	    };
	in
	  (parse 0 0).value;
	in
	# Result of the whole expression: the parsed form of the yarn.lock file
	# referenced by the relative path ./yarn.lock.
	parseLockfile (readFile ./yarn.lock)