// Migrate front matter so that keys Zola does not allow at the top level
// end up under `taxonomies` or `extra`.
import {
  extract,
  test as containsFrontmatter,
} from "https://deno.land/[email protected]/encoding/front_matter/any.ts";
import { walk } from "https://deno.land/[email protected]/fs/mod.ts";
import { stringify } from "npm:[email protected]";

// Write the post back out with YAML front matter between --- delimiters.
async function writeFile(path: string, attrs: { [key: string]: any }, body: string) {
  await Deno.writeTextFile(path, `---\n${stringify(attrs)}\n---\n\n${body}`);
}

// Keys Zola accepts at the top level of page front matter.
const permittedTopLevelKeys = new Set(["title", "description", "updated", "weight", "draft", "slug", "path", "aliases", "in_search_index", "template", "taxonomies", "extra", "date"]);
const taxonomies = new Set(["tags"]);

// Set difference: elements of setA that are not in setB.
function difference<T>(setA: Set<T>, setB: Set<T>): Set<T> {
  const _difference = new Set(setA);
  for (const elem of setB) {
    _difference.delete(elem);
  }
  return _difference;
}

for await (const entry of walk("./content/articles", { includeDirs: false })) {
  if (!entry.path.includes("_index")) {
    console.log(entry.path);
    const str = await Deno.readTextFile(entry.path);
    let post;
    if (containsFrontmatter(str)) {
      post = extract(str);
    } else {
      post = { body: str, attrs: {} };
    }
    if (!post.attrs.extra) {
      post.attrs.extra = {};
    }
    if (!post.attrs.taxonomies) {
      post.attrs.taxonomies = {};
    }
    // Move every non-permitted top-level key into `taxonomies` (if it is a
    // known taxonomy) or `extra`.
    const diff = difference(new Set(Object.keys(post.attrs)), permittedTopLevelKeys);
    if (diff.size > 0) {
      for (const elem of diff) {
        if (taxonomies.has(elem)) {
          post.attrs.taxonomies[elem] = post.attrs[elem];
        } else {
          post.attrs.extra[elem] = post.attrs[elem];
        }
        delete post.attrs[elem];
      }
    }
    await writeFile(entry.path, post.attrs, post.body);
  }
}
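For anyone running this: the script needs Deno's file-system permissions, so something like deno run --allow-read --allow-write migrate_taxonomies.ts should work (the filename is just an example; the npm:yaml import also requires a Deno version that supports npm: specifiers).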
It looks like there is a different path to import from for TOML front matter: https://deno.land/[email protected]/encoding/front_matter/toml.ts
Ah, it still uses --- as the delimiter though 😅
Do you mean changing the import on L4 (https://gist.github.com/dustinknopoff/0913e25d059f111f57045c904de25980#file-migratetotaxonomies-ts-L4)? I had the same result. (I am trying to understand Deno, as I am not very versed in JS.)
And Zola behaves like Hugo here: it expects what sits between the --- delimiters to be YAML:
Error: Failed to serve the site
Error: Error when parsing front matter of section `xxxxcontent/posts/2020/09/08/assassin-royal/index.md`
Error: Reason: YAML deserialize error: Error("invalid type: string \"date = 2020-09-08T16:18:51+02:00 title = \\\"Lire le cycle de l'Assassin Royal, c'est compliqué\\\"\\n[taxonomies] tags = [\\\"livre\\\", \\\"un\\\", \\\"deux\\\", \\\"trois\\\", \\\"quatre\\\", \\\"cing\\\", \\\"six\\\", \\\"sept\\\", \\\"huit\\\", \\\"neuf\\\", \\\"dix\\\", \\\"etc...\\\"] categories = [\\\"test\\\", \\\"autre catégorie\\\", \\\"un\\\", \\\"deux\\\", \\\"trois\\\", \\\"quatre\\\", \\\"cing\\\", \\\"six\\\", \\\"sept\\\", \\\"huit\\\", \\\"neuf\\\", \\\"dix\\\", \\\"etc...\\\"]\\n[extra] twitter = \\\"https://twitter.com/jpcaruana/status/1303356472705921026\\\"\", expected struct PageFrontMatter", line: 2, column: 1)
looks like a bug to me
Here's an alternate version which overrides the std lib front matter module to use +++ as the delimiter instead.
NOTE: This will convert your front matter into YAML (which is still valid for Zola).
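To make that concrete, here is roughly what the conversion does to a hypothetical post (key names and values are made up; the exact YAML layout depends on the yaml package):

+++
title = "My post"
tags = ["livre", "avis"]
twitter = "https://twitter.com/example"
+++

becomes

---
title: My post
extra:
  twitter: https://twitter.com/example
taxonomies:
  tags:
    - livre
    - avis
---

The full script: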
import {
Extractor,
Extract,
Format,
Parser,
test as _test,
} from "https://deno.land/[email protected]/encoding/front_matter/mod.ts";
import { parse } from "https://deno.land/[email protected]/encoding/toml.ts";
import { walk } from "https://deno.land/[email protected]/fs/mod.ts";
import { stringify } from "npm:[email protected]"
// Pull the front matter block out of `str` with the given regex, parse it with
// `parse`, and return the parsed attrs plus the remaining body.
function _extract<T>(
str: string,
rx: RegExp,
parse: Parser,
): Extract<T> {
const match = rx.exec(str);
if (!match || match.index !== 0) {
throw new TypeError("Unexpected end of input");
}
const frontMatter = match.at(-1)?.replace(/^\s+|\s+$/g, "") || "";
const attrs = parse(frontMatter) as T;
const body = str.replace(match[0], "");
return { frontMatter, body, attrs };
}
// Look at the first line of the file to decide which front matter format it uses.
function recognize(str: string, formats?: Format[]): Format {
if (!formats) {
formats = Object.keys(MAP_FORMAT_TO_RECOGNIZER_RX) as Format[];
}
const [firstLine] = str.split(/(\r?\n)/);
for (const format of formats) {
if (format === Format.UNKNOWN) {
continue;
}
if (MAP_FORMAT_TO_RECOGNIZER_RX[format].test(firstLine)) {
return format;
}
}
return Format.UNKNOWN;
}
// Build an extract() function that only understands the formats passed in.
function createExtractor(
formats: Partial<Record<Format, Parser>>,
): Extractor {
const formatKeys = Object.keys(formats) as Format[];
return function extract<T>(str: string): Extract<T> {
const format = recognize(str, formatKeys);
const parser = formats[format];
if (format === Format.UNKNOWN || !parser) {
throw new TypeError(`Unsupported front matter format`);
}
return _extract(str, MAP_FORMAT_TO_EXTRACTOR_RX[format], parser);
};
}
type Delimiter = string | [begin: string, end: string];
function getBeginToken(delimiter: Delimiter): string {
return Array.isArray(delimiter) ? delimiter[0] : delimiter;
}
function getEndToken(delimiter: Delimiter): string {
return Array.isArray(delimiter) ? delimiter[1] : delimiter;
}
// Build the recognizer regex (matches the opening delimiter) and the extractor
// regex (captures everything between the delimiters) for the given delimiters.
function createRegExp(...dv: Delimiter[]): [RegExp, RegExp] {
const beginPattern = "(" + dv.map(getBeginToken).join("|") + ")";
const pattern = "^(" +
"\\ufeff?" + // Maybe byte order mark
beginPattern +
"$([\\s\\S]+?)" +
"^(?:" + dv.map(getEndToken).join("|") + ")\\s*" +
"$" +
(Deno.build.os === "windows" ? "\\r?" : "") +
"(?:\\n)?)";
return [
new RegExp("^" + beginPattern + "$", "im"),
new RegExp(pattern, "im"),
];
}
// The actual override: recognize TOML front matter delimited by +++
// (the std lib version expects ---, as discussed above).
const [RX_RECOGNIZE_TOML, RX_TOML] = createRegExp(
["\\+\\+\\+", "\\+\\+\\+"],
"= toml =",
);
const MAP_FORMAT_TO_RECOGNIZER_RX: Omit<
Record<Format, RegExp>,
Format.UNKNOWN
> = {
[Format.TOML]: RX_RECOGNIZE_TOML,
};
const MAP_FORMAT_TO_EXTRACTOR_RX: Omit<Record<Format, RegExp>, Format.UNKNOWN> =
{
[Format.TOML]: RX_TOML,
};
// Front matter extractor that only handles TOML.
const extract = createExtractor({
[Format.TOML]: parse as Parser,
});
// Returns true if `str` starts with a front matter block in a supported format.
function test(str: string, formats?: Format[]): boolean {
if (!formats) {
formats = Object.keys(MAP_FORMAT_TO_EXTRACTOR_RX) as Format[];
}
for (const format of formats) {
if (format === Format.UNKNOWN) {
throw new TypeError("Unable to test for unknown front matter format");
}
const match = MAP_FORMAT_TO_EXTRACTOR_RX[format].exec(str);
if (match?.index === 0) {
return true;
}
}
return false;
}
// Write the post back out with YAML front matter between --- delimiters.
async function writeFile(path: string, attrs: { [key: string]: any }, body: string) {
await Deno.writeTextFile(path, `---\n${stringify(attrs)}\n---\n\n${body}`)
}
// Keys Zola accepts at the top level of page front matter.
const permittedTopLevelKeys = new Set(["title", "description", "updated", "weight", "draft", "slug", "path", "aliases", "in_search_index", "template", "taxonomies", "extra", "date"])
const taxonomies = new Set(["tags"])
function difference<T>(setA: Set<T>, setB: Set<T>): Set<T> {
const _difference = new Set(setA);
for (const elem of setB) {
_difference.delete(elem);
}
return _difference;
}
// Walk the current directory and only process paths containing "sample".
for await (const entry of walk("./", { includeDirs: false })) {
if (entry.path.includes("sample")) {
console.log(entry.path);
const str = await Deno.readTextFile(entry.path);
let post;
if (test(str)) {
post = extract(str);
} else {
post = { body: str, attrs: {} }
}
if (!post.attrs.extra) {
post.attrs.extra = {}
}
if (!post.attrs.taxonomies) {
post.attrs.taxonomies = {}
}
const diff = difference(new Set(Object.keys(post.attrs)), permittedTopLevelKeys)
if (diff.size > 0) {
for (const elem of diff) {
if (taxonomies.has(elem)) {
post.attrs.taxonomies[elem] = post.attrs[elem]
} else {
post.attrs.extra[elem] = post.attrs[elem]
}
delete post.attrs[elem]
}
}
await writeFile(entry.path, post.attrs, post.body)
}
}
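This runs the same way as the first script (e.g. deno run --allow-read --allow-write with whatever filename you saved it under); note that the walk at the bottom currently starts at ./ and only rewrites paths containing "sample", so adjust both before pointing it at real content.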
Thanks for the upgrade. I still have issues with the script, as it fails on regular, previously working posts:
content/posts/2013/03/04/afpy.md
error: Uncaught Error: Parse error on line 1, column 26: Unexpected character: "+"
throw new TOMLParseError(message);
^
at parse (https://deno.land/[email protected]/encoding/_toml/parser.ts:890:13)
at _extract (file:///xxxmigrate_taxonomies.ts:21:19)
at extract (file:///xxx/migrate_taxonomies.ts:59:16)
at file:///Usersjxxxmigrate_taxonomies.ts:153:20
I issued a PR on deno
Great! Thanks for sharing.
Sadly it does not detect TOML front matter contained within +++ rather than --- (as described in https://gohugo.io/content-management/front-matter/#front-matter-formats).