revolunet · December 11, 2024 00:18
diff --git a/albert.mjs b/albert.mjs
 // @ts-check

 // see also: https://albert.api.etalab.gouv.fr/documentation

 import { readFile, readdir } from "fs/promises";
 import pAll from "p-all";

 // API key read from the ALBERT_API_KEY environment variable
 // (undefined when the variable is not set).
 const API_KEY = process.env.ALBERT_API_KEY;

 // Base URL of the Albert API; endpoint paths are appended to it.
 const API_URL = "https://albert.api.etalab.gouv.fr";

 // see https://albert.api.etalab.gouv.fr/v1/models
 // languageModel is sent as `model` in chat completion requests.
 const languageModel = "AgentPublic/llama3-instruct-8b"; 
 // embeddingModel is only referenced by the commented-out collection creation step.
 const embeddingModel = "BAAI/bge-m3";

 // Bearer authorization header, spread into the headers of every request.
 const headers = {
  Authorization: `Bearer ${API_KEY}`,
 };

 /**
  * Fetch the list of models exposed by the Albert API.
  *
  * @returns {Promise<{data:{id: string, object: "model", type: "text-generation"|"text-embeddings-inference"|"automatic-speech-recognition", status:"available"}[]}>} list of models
  * @throws {Error} when the API answers with a non-2xx status
  */
 const getModels = async () => {
   const response = await fetch(`${API_URL}/v1/models`, {
     method: "GET",
     headers: {
       ...headers,
       "Content-Type": "application/json",
     },
   });
   // Fail loudly instead of handing an error payload to callers as a model list.
   if (!response.ok) {
     throw new Error(
       `Albert API request failed: GET /v1/models -> ${response.status} ${response.statusText}`
     );
   }
   return response.json();
 };

 /**
  * Create a new collection on the Albert API.
  *
  * @param {{name: string, model: string}} param0 collection name and the model id sent with it
  * @returns {Promise<string>} Id of the new collection
  * @throws {Error} when the API answers with a non-2xx status or the response has no `id`
  */
 const createCollection = async ({ name, model }) => {
   const response = await fetch(`${API_URL}/v1/collections`, {
     method: "POST",
     headers: {
       ...headers,
       "Content-Type": "application/json",
     },
     body: JSON.stringify({ name, model }),
   });
   if (!response.ok) {
     throw new Error(
       `Albert API request failed: POST /v1/collections -> ${response.status} ${response.statusText}`
     );
   }
   const created = await response.json();
   // Keep the raw payload visible in the script output.
   console.log(created);
   // Never resolve with `undefined`: callers use the id to import files.
   if (created == null || created.id == null) {
     throw new Error("Albert API did not return a collection id");
   }
   return created.id;
 };

 /**
  * Extract the last "/"-separated segment of a path.
  *
  * @param {string} somePath path using "/" as separator
  * @returns {string} base file name (empty string when the path ends with "/")
  */
 const baseName = (somePath) =>
   // lastIndexOf yields -1 when there is no "/", so the slice starts at 0.
   somePath.slice(somePath.lastIndexOf("/") + 1);

 /**
  * Upload one file to an albert collection as multipart form data.
  * @param {{file: Blob, fileName: string, collectionId: string}} param0 file content, its name and the target collection
  * @returns {Promise<string>} raw text body of the API response
  */
 const addFileToCollection = async ({ file, fileName, collectionId }) => {
   const payload = new FormData();
   payload.append("file", file, fileName);
   // the target collection travels as a JSON string in the "request" field
   payload.append("request", JSON.stringify({ collection: collectionId }));

   const response = await fetch(`${API_URL}/v1/files`, {
     method: "POST",
     headers: { ...headers },
     body: payload,
   });
   return response.text();
 };

 /**
  * Read a local markdown file and upload it to an albert collection.
  * @param {string} collectionId The collection to import to
  * @param {string} filePath Path of the markdown file to upload
  * @returns {Promise<string>} result of `addFileToCollection`
  */
 const importLocalMarkdown = async (collectionId, filePath) => {
   const buffer = await readFile(filePath);
   const content = buffer.toString();
   const file = new Blob([content], { type: "text/markdown" });
   // the upload is named after the last segment of the local path
   const fileName = baseName(filePath);
   return addFileToCollection({ file, fileName, collectionId });
 };

 // albert API has a low rate-limit: resolve with the given value after a 500 ms pause
 const wait = (r) =>
   new Promise((resolve) => {
     setTimeout(resolve, 500, r);
   });

 /**
  * Upload the markdown files found directly inside a local directory to an
  * albert collection, one file at a time.
  * @param {string} collectionId The collection to import to
  * @param {string} baseDir The local directory containing the markdown files
  * @returns whatever `pAll` resolves with for the upload tasks
  */
 const importLocalMarkdownPath = async (collectionId, baseDir) => {
   const entries = await readdir(baseDir);
   // keep ".md" files, skipping any whose name contains "anct-pni"
   const markdownNames = entries.filter(
     (name) => name.endsWith(".md") && !name.includes("anct-pni")
   );
   // todo: improve headers management
   // each task uploads one file, then pauses via `wait` before the next one
   const uploadTasks = markdownNames.map(
     (name) => () =>
       importLocalMarkdown(collectionId, `${baseDir}/${name}`).then(wait)
   );
   return pAll(uploadTasks, { concurrency: 1 });
 };

 /**
  * Search one or more collections for chunks matching a query.
  *
  * @param {{collections: string[], query: string}} param0 collections to search and the query, sent as `prompt`
  * @returns {Promise<{data: {chunk: {content: string, metadata: {document_name: string}}}[]}>} Search API result
  * @throws {Error} when the API answers with a non-2xx status
  */
 const getSearch = async ({ collections, query }) => {
   const response = await fetch(`${API_URL}/v1/search`, {
     method: "POST",
     headers: {
       ...headers,
       "Content-Type": "application/json",
     },
     // NOTE(review): `k` is presumably the number of results requested — confirm against the API docs
     body: JSON.stringify({ collections, k: 6, prompt: query }),
   });
   // Without this check an error payload reaches callers and fails later on `.data`.
   if (!response.ok) {
     throw new Error(
       `Albert API request failed: POST /v1/search -> ${response.status} ${response.statusText}`
     );
   }
   return response.json();
 };

 /**
  * Send a non-streaming chat completion request with the prompt as the only
  * user message.
  *
  * @param {{prompt: string}} param0 prompt sent as the user message content
  * @returns {Promise<{model: string, id: string, choices: {message: {content: string}}[]}>} Chat completion API result
  * @throws {Error} when the API answers with a non-2xx status
  */
 const getCompletion = async ({ prompt }) => {
   const response = await fetch(`${API_URL}/v1/chat/completions`, {
     method: "POST",
     headers: {
       ...headers,
       "Content-Type": "application/json",
     },
     body: JSON.stringify({
       model: languageModel,
       stream: false,
       messages: [{ role: "user", content: prompt }],
     }),
   });
   // Without this check an error payload reaches callers and fails later on `.choices`.
   if (!response.ok) {
     throw new Error(
       `Albert API request failed: POST /v1/chat/completions -> ${response.status} ${response.statusText}`
     );
   }
   return response.json();
 };

 /**
  * Query Albert: search the collections for relevant chunks, ask the language
  * model to answer from them, and append the list of source pages.
  *
  * @param {{query: string, collections: string[]}} param0 question and the collections to search
  * @returns {Promise<string>} the model answer followed by a deduplicated, sorted "Sources:" list
  * @throws {Error} when the completion result contains no answer
  */
 const askAlbert = async ({ query, collections }) => {
   const searchResult = await getSearch({
     query,
     collections,
   });
   const chunks = searchResult.data.map((c) => c.chunk);

   // Join explicitly: interpolating the array would separate chunks with bare commas.
   const documents = chunks.map((chunk) => chunk.content).join("\n\n");
   const prompt = `Réponds à la question suivante en te basant sur les documents ci-dessous : ${query}\n\nDocuments :\n\n${documents}`;

   const result = await getCompletion({ prompt });
   const answer = result.choices?.[0]?.message?.content;
   if (typeof answer !== "string") {
     throw new Error("Albert completion result contains no answer");
   }

   // Anchor the pattern so only the trailing ".md" extension is rewritten.
   const sources = chunks.map(
     (chunk) =>
       `https://beta.gouv.fr/startups/${chunk.metadata.document_name.replace(
         /\.md$/,
         ".html"
       )}`
   );
   const uniqueSources = [...new Set(sources)].sort();
   const sourcesList = uniqueSources.length
     ? `Sources:\n${uniqueSources.map((s) => `- ${s}`).join("\n")}`
     : "";

   return `${answer}\n\n${sourcesList}`;
 };

 // Id of the albert collection that the queries below are run against.
 const collectionId = "1ce00182-f2ef-4810-a913-94823b6d8cb4";

 // One-off setup steps, kept for reference (uncomment to use).

 // // create collection
 // const collectionId = await createCollection({
 //   name: "fiches beta.gouv.fr 3",
 //   model: embeddingModel,
 // });
 // console.log("collectionId", collectionId);

 // // import local files to collection
 // await importLocalMarkdownPath(
 //   collectionId,
 //   "../beta.gouv.fr/content/_startups"
 // );

 const queries = [
   "C'est quoi domifa ?",
   "Comment contacter demarches-simplifiées ?",
   "Dans quelle phase est le code du travail numérique ?",
   "Quelle sont les objectifs de mes aides reno ?",
   "Quelle sont les métriques d'impact de dossier facile ?",
   "Quel est le budget de l'annuaire entreprise ?",
 ];

 // Ask one question at a time: `forEach` with an async callback would fire every
 // request at once against the rate-limited API, print answers in completion
 // order and leave the promises unawaited.
 for (const q of queries) {
   const answer = await askAlbert({ query: q, collections: [collectionId] });
   console.log(`\n\n### ${q}\n> ${answer}`);
 }
diff --git a/result.md b/result.md
	// @ts-check

	// see also: https://albert.api.etalab.gouv.fr/documentation

	import { readFile, readdir } from "fs/promises";
	import pAll from "p-all";

	// API key read from the ALBERT_API_KEY environment variable
	// (undefined when the variable is not set).
	const API_KEY = process.env.ALBERT_API_KEY;

	// Base URL of the Albert API; endpoint paths are appended to it.
	const API_URL = "https://albert.api.etalab.gouv.fr";

	// see https://albert.api.etalab.gouv.fr/v1/models
	// languageModel is sent as `model` in chat completion requests.
	const languageModel = "AgentPublic/llama3-instruct-8b";
	// embeddingModel is only referenced by the commented-out collection creation step.
	const embeddingModel = "BAAI/bge-m3";

	// Bearer authorization header, spread into the headers of every request.
	const headers = {
	Authorization: `Bearer ${API_KEY}`,
	};

	/**
	 * Fetch the list of models exposed by the Albert API.
	 *
	 * @returns {Promise<{data:{id: string, object: "model", type: "text-generation"|"text-embeddings-inference"|"automatic-speech-recognition", status:"available"}[]}>} list of models
	 * @throws {Error} when the API answers with a non-2xx status
	 */
	const getModels = async () => {
	  const response = await fetch(`${API_URL}/v1/models`, {
	    method: "GET",
	    headers: {
	      ...headers,
	      "Content-Type": "application/json",
	    },
	  });
	  // Fail loudly instead of handing an error payload to callers as a model list.
	  if (!response.ok) {
	    throw new Error(
	      `Albert API request failed: GET /v1/models -> ${response.status} ${response.statusText}`
	    );
	  }
	  return response.json();
	};

	/**
	 * Create a new collection on the Albert API.
	 *
	 * @param {{name: string, model: string}} param0 collection name and the model id sent with it
	 * @returns {Promise<string>} Id of the new collection
	 * @throws {Error} when the API answers with a non-2xx status or the response has no `id`
	 */
	const createCollection = async ({ name, model }) => {
	  const response = await fetch(`${API_URL}/v1/collections`, {
	    method: "POST",
	    headers: {
	      ...headers,
	      "Content-Type": "application/json",
	    },
	    body: JSON.stringify({ name, model }),
	  });
	  if (!response.ok) {
	    throw new Error(
	      `Albert API request failed: POST /v1/collections -> ${response.status} ${response.statusText}`
	    );
	  }
	  const created = await response.json();
	  // Keep the raw payload visible in the script output.
	  console.log(created);
	  // Never resolve with `undefined`: callers use the id to import files.
	  if (created == null || created.id == null) {
	    throw new Error("Albert API did not return a collection id");
	  }
	  return created.id;
	};

	/**
	 * Extract the last "/"-separated segment of a path.
	 *
	 * @param {string} somePath path using "/" as separator
	 * @returns {string} base file name (empty string when the path ends with "/")
	 */
	const baseName = (somePath) =>
	  // lastIndexOf yields -1 when there is no "/", so the slice starts at 0.
	  somePath.slice(somePath.lastIndexOf("/") + 1);

	/**
	 * Upload one file to an albert collection as multipart form data.
	 * @param {{file: Blob, fileName: string, collectionId: string}} param0 file content, its name and the target collection
	 * @returns {Promise<string>} raw text body of the API response
	 */
	const addFileToCollection = async ({ file, fileName, collectionId }) => {
	  const payload = new FormData();
	  payload.append("file", file, fileName);
	  // the target collection travels as a JSON string in the "request" field
	  payload.append("request", JSON.stringify({ collection: collectionId }));

	  const response = await fetch(`${API_URL}/v1/files`, {
	    method: "POST",
	    headers: { ...headers },
	    body: payload,
	  });
	  return response.text();
	};

	/**
	 * Read a local markdown file and upload it to an albert collection.
	 * @param {string} collectionId The collection to import to
	 * @param {string} filePath Path of the markdown file to upload
	 * @returns {Promise<string>} result of `addFileToCollection`
	 */
	const importLocalMarkdown = async (collectionId, filePath) => {
	  const buffer = await readFile(filePath);
	  const content = buffer.toString();
	  const file = new Blob([content], { type: "text/markdown" });
	  // the upload is named after the last segment of the local path
	  const fileName = baseName(filePath);
	  return addFileToCollection({ file, fileName, collectionId });
	};

	// albert API has a low rate-limit: resolve with the given value after a 500 ms pause
	const wait = (r) =>
	  new Promise((resolve) => {
	    setTimeout(resolve, 500, r);
	  });

	/**
	 * Upload the markdown files found directly inside a local directory to an
	 * albert collection, one file at a time.
	 * @param {string} collectionId The collection to import to
	 * @param {string} baseDir The local directory containing the markdown files
	 * @returns whatever `pAll` resolves with for the upload tasks
	 */
	const importLocalMarkdownPath = async (collectionId, baseDir) => {
	  const entries = await readdir(baseDir);
	  // keep ".md" files, skipping any whose name contains "anct-pni"
	  const markdownNames = entries.filter(
	    (name) => name.endsWith(".md") && !name.includes("anct-pni")
	  );
	  // todo: improve headers management
	  // each task uploads one file, then pauses via `wait` before the next one
	  const uploadTasks = markdownNames.map(
	    (name) => () =>
	      importLocalMarkdown(collectionId, `${baseDir}/${name}`).then(wait)
	  );
	  return pAll(uploadTasks, { concurrency: 1 });
	};

	/**
	 * Search one or more collections for chunks matching a query.
	 *
	 * @param {{collections: string[], query: string}} param0 collections to search and the query, sent as `prompt`
	 * @returns {Promise<{data: {chunk: {content: string, metadata: {document_name: string}}}[]}>} Search API result
	 * @throws {Error} when the API answers with a non-2xx status
	 */
	const getSearch = async ({ collections, query }) => {
	  const response = await fetch(`${API_URL}/v1/search`, {
	    method: "POST",
	    headers: {
	      ...headers,
	      "Content-Type": "application/json",
	    },
	    // NOTE(review): `k` is presumably the number of results requested — confirm against the API docs
	    body: JSON.stringify({ collections, k: 6, prompt: query }),
	  });
	  // Without this check an error payload reaches callers and fails later on `.data`.
	  if (!response.ok) {
	    throw new Error(
	      `Albert API request failed: POST /v1/search -> ${response.status} ${response.statusText}`
	    );
	  }
	  return response.json();
	};

	/**
	 * Send a non-streaming chat completion request with the prompt as the only
	 * user message.
	 *
	 * @param {{prompt: string}} param0 prompt sent as the user message content
	 * @returns {Promise<{model: string, id: string, choices: {message: {content: string}}[]}>} Chat completion API result
	 * @throws {Error} when the API answers with a non-2xx status
	 */
	const getCompletion = async ({ prompt }) => {
	  const response = await fetch(`${API_URL}/v1/chat/completions`, {
	    method: "POST",
	    headers: {
	      ...headers,
	      "Content-Type": "application/json",
	    },
	    body: JSON.stringify({
	      model: languageModel,
	      stream: false,
	      messages: [{ role: "user", content: prompt }],
	    }),
	  });
	  // Without this check an error payload reaches callers and fails later on `.choices`.
	  if (!response.ok) {
	    throw new Error(
	      `Albert API request failed: POST /v1/chat/completions -> ${response.status} ${response.statusText}`
	    );
	  }
	  return response.json();
	};

	/**
	 * Query Albert: search the collections for relevant chunks, ask the language
	 * model to answer from them, and append the list of source pages.
	 *
	 * @param {{query: string, collections: string[]}} param0 question and the collections to search
	 * @returns {Promise<string>} the model answer followed by a deduplicated, sorted "Sources:" list
	 * @throws {Error} when the completion result contains no answer
	 */
	const askAlbert = async ({ query, collections }) => {
	  const searchResult = await getSearch({
	    query,
	    collections,
	  });
	  const chunks = searchResult.data.map((c) => c.chunk);

	  // Join explicitly: interpolating the array would separate chunks with bare commas.
	  const documents = chunks.map((chunk) => chunk.content).join("\n\n");
	  const prompt = `Réponds à la question suivante en te basant sur les documents ci-dessous : ${query}\n\nDocuments :\n\n${documents}`;

	  const result = await getCompletion({ prompt });
	  const answer = result.choices?.[0]?.message?.content;
	  if (typeof answer !== "string") {
	    throw new Error("Albert completion result contains no answer");
	  }

	  // Anchor the pattern so only the trailing ".md" extension is rewritten.
	  const sources = chunks.map(
	    (chunk) =>
	      `https://beta.gouv.fr/startups/${chunk.metadata.document_name.replace(
	        /\.md$/,
	        ".html"
	      )}`
	  );
	  const uniqueSources = [...new Set(sources)].sort();
	  const sourcesList = uniqueSources.length
	    ? `Sources:\n${uniqueSources.map((s) => `- ${s}`).join("\n")}`
	    : "";

	  return `${answer}\n\n${sourcesList}`;
	};

	// Id of the albert collection that the queries below are run against.
	const collectionId = "1ce00182-f2ef-4810-a913-94823b6d8cb4";

	// One-off setup steps, kept for reference (uncomment to use).

	// // create collection
	// const collectionId = await createCollection({
	// name: "fiches beta.gouv.fr 3",
	// model: embeddingModel,
	// });
	// console.log("collectionId", collectionId);

	// // import local files to collection
	// await importLocalMarkdownPath(
	// collectionId,
	// "../beta.gouv.fr/content/_startups"
	// );

	const queries = [
	  "C'est quoi domifa ?",
	  "Comment contacter demarches-simplifiées ?",
	  "Dans quelle phase est le code du travail numérique ?",
	  "Quelle sont les objectifs de mes aides reno ?",
	  "Quelle sont les métriques d'impact de dossier facile ?",
	  "Quel est le budget de l'annuaire entreprise ?",
	];

	// Ask one question at a time: `forEach` with an async callback would fire every
	// request at once against the rate-limited API, print answers in completion
	// order and leave the promises unawaited.
	for (const q of queries) {
	  const answer = await askAlbert({ query: q, collections: [collectionId] });
	  console.log(`\n\n### ${q}\n> ${answer}`);
	}