Skip to content

Instantly share code, notes, and snippets.

@shawngraham
Last active November 21, 2024 15:48
Show Gist options
  • Save shawngraham/01553d7e30d9306817072ca8fc741eaa to your computer and use it in GitHub Desktop.
Save shawngraham/01553d7e30d9306817072ca8fc741eaa to your computer and use it in GitHub Desktop.
Using Ollama and NuExtract-tiny v1.5 (via https://ollama.com/sroecker/nuextract-tiny-v1.5/) to extract structured text.
import ollama
import argparse
def read_file(file):
    """Read a text file as UTF-8 and return its contents.

    Args:
        file: Path of the file to read.

    Returns:
        The file contents as a string, or None if the file is missing,
        cannot be opened, or is not valid UTF-8. A message describing the
        problem is printed in each failure case.
    """
    try:
        # Explicit encoding: the default depends on the platform locale and
        # would make non-ASCII text read differently across machines.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
    except (OSError, UnicodeDecodeError) as err:
        # Directories, permission problems and undecodable bytes are reported
        # the same way as a missing file instead of raising a traceback.
        print(f"The file {file} could not be read: {err}")
    return None
def main():
    """Run one extraction: a template file applied to a single text file.

    Reads the two paths given on the command line, wraps their contents in
    the <|input|> / <|output|> prompt, sends it to the
    'sroecker/nuextract-tiny-v1.5' model through ollama.chat, and prints the
    content of the returned message.
    """
    cli = argparse.ArgumentParser(description='Extract information using Ollama')
    cli.add_argument('template', help='Path to template JSON file')
    cli.add_argument('text', help='Path to input text file')
    options = cli.parse_args()

    # Both files are read before checking, so a problem with either one is
    # reported by read_file before giving up.
    template_content = read_file(options.template)
    text_content = read_file(options.text)
    if not (template_content and text_content):
        print("Failed to read input files.")
        return

    # The prompt lines start at column 0 on purpose: they are part of the
    # string sent to the model.
    prompt = f"""<|input|>
###Template:
{template_content}
###Text:
{text_content}
<|output|>"""

    conversation = [
        {'role': 'system', 'content': 'Extract.'},
        {'role': 'user', 'content': prompt},
    ]
    reply = ollama.chat(
        model='sroecker/nuextract-tiny-v1.5',
        messages=conversation,
        options={'temperature': 0},
    )

    print("Extraction Result:")
    print(reply['message']['content'])


if __name__ == "__main__":
    main()
import ollama
import argparse
import os
import json
def read_file(file):
    """Read a text file as UTF-8 and return its contents.

    Args:
        file: Path of the file to read.

    Returns:
        The file contents as a string, or None if the file is missing,
        cannot be opened, or is not valid UTF-8. A message describing the
        problem is printed in each failure case.
    """
    try:
        # Explicit encoding: the default depends on the platform locale and
        # would make non-ASCII text read differently across machines.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
    except (OSError, UnicodeDecodeError) as err:
        # One unreadable file should be skipped by the caller rather than
        # aborting a whole batch with a traceback.
        print(f"The file {file} could not be read: {err}")
    return None
def main():
    """Batch-extract structured data from every .txt file in a folder.

    Command-line arguments:
        template: Path to the template JSON file.
        input_folder: Folder whose '.txt' files are processed.
        -o / --output: Name of the JSON file to write (default: output.json).

    Each text file is combined with the template into a prompt, sent to the
    'sroecker/nuextract-tiny-v1.5' model through ollama.chat, and the reply
    is parsed as JSON. Successfully parsed replies are collected into a list
    and written to the output file; replies that are not valid JSON are
    reported and skipped. Nothing is written if the template or the input
    folder cannot be read.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Batch extract information using Ollama')
    parser.add_argument('template', help='Path to template JSON file')
    parser.add_argument('input_folder', help='Path to folder containing input text files')
    parser.add_argument('-o', '--output', default='output.json',
                        help='Output JSON file name (default: output.json)')
    args = parser.parse_args()

    # Read template file
    template_content = read_file(args.template)
    if not template_content:
        return

    try:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # order of the results reproducible between runs.
        filenames = sorted(os.listdir(args.input_folder))
    except OSError as err:
        print(f"Could not list input folder {args.input_folder}: {err}")
        return

    # Collect extraction results
    extraction_results = []
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(args.input_folder, filename)
        text_content = read_file(file_path)
        if not text_content:
            # Unreadable or empty file: read_file has already reported it.
            continue

        # The prompt lines start at column 0 on purpose: they are part of
        # the string sent to the model.
        prompt = f"""<|input|>
###Template:
{template_content}
###Text:
{text_content}
<|output|>"""

        # Use Ollama to process the extraction
        ollama_response = ollama.chat(
            model='sroecker/nuextract-tiny-v1.5',
            messages=[
                {
                    'role': 'system',
                    'content': 'Extract.',
                },
                {
                    'role': 'user',
                    'content': prompt,
                },
            ],
            options={
                'temperature': 0
            }
        )

        # Parse and store the response
        try:
            extracted_data = json.loads(ollama_response['message']['content'])
        except json.JSONDecodeError:
            print(f"Failed to parse JSON from {filename}")
        else:
            extraction_results.append(extracted_data)
            print(f"Extracted data from {filename}")

    # Write results to output JSON file
    with open(args.output, 'w', encoding='utf-8') as outfile:
        json.dump(extraction_results, outfile, indent=2)
    print(f"Extraction complete. Results saved to {args.output}")


if __name__ == "__main__":
    main()
{
"jesuit_report": {
"letter": {
"letter_author": {},
"baptisms": {
"individuals": {
"name": [],
"relations": [],
"location": []
},
"sponsors": {
"name": [],
"date": []
},
"letter_written_location": []
}
}
}
}
[3] A Letter Missive in regard to the Conversion and Baptism of the
Grand Sagamore of new France, who was, before the arrival of the
French, its chief and sovereign.
SIR and Brother, I did not wish the ship to depart without giving you
some news of this country which I believe will be acceptable, as I know
that you are a good Catholic. The Grand Sagamore, whom we call in our
language Grand Captain of the Savages, and chief of all, was baptized
on last saint John the Baptist's day; [4] with his wife, children, and
children's children, to the number of twenty; with as much enthusiasm,
fervor, and zeal for Religion as would have been evinced by a person
who had been instructed in it for three or four years. He promises to
have the others baptized, or else make war upon them. [_The news of the
King's death had not then reached Canada._] Monsieur de Poutrincourt
and his son acted as sponsors for them in the name of the King, and
of Monseigneur the Dauphin. We have already made this good beginning,
which I believe will become still better hereafter. As to the country,
I have never seen anything so beautiful, better, or more fertile;
and I can say to you, truly and honestly, that if I had three or
four Laborers with me now, and [5] the means of supporting them for
one year, and some wheat to sow in the ground tilled by their labor
alone, I should expect to have a yearly trade in Beaver and other
Skins amounting to seven or eight thousand livres, with the surplus
which would remain to me after their support. I am very sorry that I
did not know before my departure what I know now; if I had, I should
have left no stone unturned to bring with me two or three farmers,
and two hogsheads of wheat, which is a mere trifle. I assure you it
is delightful to engage in trade over here and to make such handsome
profits. If you wish to take a hand in it, let me know your intentions
by the bearer, who desires to return and traffic here in pursuance of
what he has seen. I [6] shall say no more, except to pray God to give
you, Sir and Brother, a long life and perfect health. From Port Royal,
New France, this 28th of June, 1610.
_Your very affectionate Brother and servant_,
BERTRAND.
Extraction Result:
{
"jesuit_report": {
"letter": {
"letter_author": {},
"baptisms": {
"individuals": {
"name": [
"Grand Sagamore"
],
"relations": [],
"location": []
},
"sponsors": {
"name": [
"Monsieur de Poutrincourt",
"his son"
],
"date": [
"28th of June, 1610"
]
},
"letter_written_location": [
"Port Royal",
"New France"
]
}
}
}
import json
import csv
import argparse
def json_to_csv(json_data, output_file='output.csv'):
    """Flatten JSON entries and write them as rows of a CSV file.

    Nested objects are flattened by joining key names with '_'
    (e.g. {"a": {"b": 1}} becomes column 'a_b'); lists are written as a
    single ', '-separated string. The header is the sorted union of the
    columns found across all entries; an entry missing a column gets an
    empty cell.

    Args:
        json_data: A list of JSON objects, a single JSON object (treated as
            a one-entry list), or a string containing either of those.
        output_file: Path of the CSV file to create (UTF-8).
    """
    # Parse JSON if it's a string
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    # A lone object is one row. Iterating a dict directly would yield its
    # keys (strings), which cannot be flattened.
    if isinstance(json_data, dict):
        json_data = [json_data]

    def flatten_json(data, prefix=''):
        """Return a one-level dict with '_'-joined keys for nested objects."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key
            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                # Handle lists by converting to comma-separated strings
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    # Flatten each entry in the list
    flat_data = [flatten_json(entry) for entry in json_data]

    # Get all possible keys across all entries
    all_keys = {key for entry in flat_data for key in entry}

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys), extrasaction='ignore')
        writer.writeheader()
        writer.writerows(flat_data)
    print(f"CSV file '{output_file}' has been created.")
def main():
    """Command-line entry point: convert a JSON file to CSV.

    Command-line arguments:
        input_file: Path of the JSON file to read (UTF-8).
        -o / --output: Name of the CSV file to write (default: output.csv).

    Read errors and write errors are reported separately, so a problem with
    the output path is never blamed on the input file.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Read JSON file. The try covers only the read, so these handlers
    # cannot be triggered by the CSV-writing step.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    # Convert to CSV
    try:
        json_to_csv(json_data, args.output)
    except OSError as err:
        print(f"Error: Could not write {args.output}: {err}")


if __name__ == "__main__":
    main()
import json
import csv
import argparse
def json_to_csv(json_data, output_file='output.csv'):
    """Flatten one JSON object and write it as a single-row CSV file.

    Nested objects are flattened by joining key names with '_'
    (e.g. {"a": {"b": 1}} becomes column 'a_b'); lists are written as a
    single ', '-separated string. Columns appear in the order the keys are
    encountered.

    Args:
        json_data: A JSON object (dict), or a string containing one.
        output_file: Path of the CSV file to create (UTF-8).
    """
    # Parse JSON if it's a string
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    def flatten_json(data, prefix=''):
        """Return a one-level dict with '_'-joined keys for nested objects."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key
            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    # Flatten the JSON
    flat_data = flatten_json(json_data)

    # Write to CSV. Explicit UTF-8 keeps non-ASCII values independent of
    # the platform's locale encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(flat_data))
        writer.writeheader()
        writer.writerow(flat_data)
    print(f"CSV file '{output_file}' has been created.")
def main():
    """Command-line entry point: convert a JSON file to CSV.

    Command-line arguments:
        input_file: Path of the JSON file to read (UTF-8).
        -o / --output: Name of the CSV file to write (default: output.csv).

    Read errors and write errors are reported separately, so a problem with
    the output path is never blamed on the input file.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Read JSON file. The try covers only the read, so these handlers
    # cannot be triggered by the CSV-writing step.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    # Convert to CSV
    try:
        json_to_csv(json_data, args.output)
    except OSError as err:
        print(f"Error: Could not write {args.output}: {err}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment