Sebastian Vindaar

More detailed thoughts about data extraction

This gist contains some ideas about using LLMs to extract data from papers (specifically related to biology, aging research and the like).

Just to quickly expand a bit on what I was trying to say when our meeting was cut off:

I think LLM data extraction can be viewed as a problem that can be tackled at three different layers:
1. Purely text-based: e.g. use `pdftotext` to turn a PDF into a text document, then use LLMs to summarize, extract, tag, etc. the papers in order to obtain machine-readable data.

	import std / [strutils, strformat, tables, dynlib, os]
	import noise, shell


	import compiler/[llstream, renderer, types, magicsys, ast,
	transf, # for code transformation (for -> while etc)
	injectdestructors, # destructor injection
	pathutils, # AbsoluteDir
	modulegraphs] # getBody
	import ./nimeval_dynlib_clean

	import std / [strutils, strformat, tables, dynlib, os]
	import noise, shell


	import compiler/[llstream, renderer, types, magicsys, ast,
	transf, # for code transformation (for -> while etc)
	injectdestructors, # destructor injection
	pathutils, # AbsoluteDir
	modulegraphs] # getBody
	import ./nimeval_dynlib

	import noise, strutils, strformat, shell, tables, dynlib

	proc printHelp() =
	  ## Writes a single empty line to standard output.
	  echo ""

	# Wrapper template for a chunk of code: the `$#` placeholder sits between a
	# `{.push cdecl, exportc, dynlib.}` / `{.pop.}` pair, so whatever text is
	# substituted for it is compiled with those three pragmas applied.
	# NOTE(review): presumably filled in via strutils' `%` operator — confirm
	# against the code that uses this constant.
	const procTmpl = """
	{.push cdecl, exportc, dynlib.}
	$#
	{.pop.}
	"""
	const exprTmpl = """

	import datamancer
	import std / [math, complex]

	# Compile-time parameters, grouped in one `const` section.
	# NOTE(review): the names suggest an `xn` x `yn` sampling grid over
	# [xmin, xmax] x [ymin, ymax] with an iteration cap — confirm against the
	# code that consumes them.
	const
	  xn = 960
	  yn = 960
	  xmin = -2.0
	  xmax = 0.6
	  ymin = -1.5
	  ymax = 1.5
	  MAX_ITERS = 200

	import ggplotnim, math
	import arraymancer

	const ε = 3
	func φ(r: float): float =
	  ## Evaluates `exp(-(ε·r)²)` for the given `r`, i.e. a Gaussian in `r`
	  ## whose width is set by the constant `ε`.
	  ##
	  ## The body is indented under the signature; in the flattened original it
	  ## sat at the same column as the `proc` line, which is not valid Nim.
	  exp(-pow(ε.float * r, 2.0))

	proc toMatrix(n: int, start, stop: float): Tensor[float] =
	result = zeros[float]([n, n])
	let xs = linspace(start, stop, n)

	#[
	This code solves the unit conversion from system A to system B generically.
	If natural units are involved, the remaining units (e.g. Energy in particle physics
	natural units) are combined with the constants set to 1 that replace real units.

	It solves the following system of linear equations which arises from:

	`Π_i,α A_i^α = Π_i^β B_i^β`

	where `A_i` is the i-th unit of unit system `A` and `α` the power needed to raise

	x	y
	702	376
	699	376
	703	376
	719	376
	723	376
	654	375
	656	375
	657	375
	660	375

Sebastian Vindaar

More detailed thoughts about data extraction

Table of Contents