Last active
November 21, 2024 15:48
-
-
Save shawngraham/01553d7e30d9306817072ca8fc741eaa to your computer and use it in GitHub Desktop.
Using Ollama and nuextract-tiny-v1.5 (via https://ollama.com/sroecker/nuextract-tiny-v1.5/) to extract structured data from text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ollama | |
import argparse | |
def read_file(file):
    """Read a text file and return its contents.

    Args:
        file: Path of the file to read.

    Returns:
        The file contents as a string, or ``None`` when the file does not
        exist (a message is printed in that case).
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
        return None
def main():
    """Command-line entry point: run one extraction and print the reply.

    Takes two positional arguments (template path, text path), reads both
    files with ``read_file``, wraps them in the ``<|input|>`` /
    ``<|output|>`` prompt and sends it to the
    ``sroecker/nuextract-tiny-v1.5`` model through ``ollama.chat``.
    Prints a failure message and returns early when either file is missing
    or empty.
    """
    cli = argparse.ArgumentParser(description='Extract information using Ollama')
    cli.add_argument('template', help='Path to template JSON file')
    cli.add_argument('text', help='Path to input text file')
    options = cli.parse_args()

    template_content = read_file(options.template)
    text_content = read_file(options.text)
    # read_file yields None for a missing file; an empty file is falsy too.
    if not (template_content and text_content):
        print("Failed to read input files.")
        return

    # Assemble the prompt line by line: template first, then the text.
    prompt = "\n".join([
        "<|input|>",
        "###Template:",
        template_content,
        "###Text:",
        text_content,
        "<|output|>",
    ])

    system_message = {'role': 'system', 'content': 'Extract.'}
    user_message = {'role': 'user', 'content': prompt}
    reply = ollama.chat(
        model='sroecker/nuextract-tiny-v1.5',
        messages=[system_message, user_message],
        options={'temperature': 0},
    )

    print("Extraction Result:")
    print(reply['message']['content'])


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ollama | |
import argparse | |
import os | |
import json | |
def read_file(file):
    """Read a text file and return its contents.

    Args:
        file: Path of the file to read.

    Returns:
        The file contents as a string, or ``None`` when the file does not
        exist (a message is printed in that case).
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
        return None
def main():
    """Command-line entry point: batch-extract every ``.txt`` file in a folder.

    Positional arguments are the template path and the input folder; the
    optional ``-o/--output`` names the JSON file to write (default
    ``output.json``). Each text file is sent to the
    ``sroecker/nuextract-tiny-v1.5`` model through ``ollama.chat``; replies
    that parse as JSON are collected into a list and written to the output
    file. Files that cannot be read, are empty, or whose reply is not valid
    JSON are skipped.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Batch extract information using Ollama')
    parser.add_argument('template', help='Path to template JSON file')
    parser.add_argument('input_folder', help='Path to folder containing input text files')
    parser.add_argument('-o', '--output', default='output.json',
                        help='Output JSON file name (default: output.json)')
    args = parser.parse_args()

    # Read template file
    template_content = read_file(args.template)
    if not template_content:
        return

    try:
        # os.listdir returns names in arbitrary order; sort them so the
        # output file lists results in a reproducible order.
        filenames = sorted(os.listdir(args.input_folder))
    except (FileNotFoundError, NotADirectoryError) as err:
        print(f"Cannot read input folder {args.input_folder}: {err}")
        return

    # Collect extraction results
    extraction_results = []
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(args.input_folder, filename)
        text_content = read_file(file_path)
        if not text_content:
            continue

        # Construct prompt
        prompt = (
            "<|input|>\n"
            "###Template:\n"
            f"{template_content}\n"
            "###Text:\n"
            f"{text_content}\n"
            "<|output|>"
        )

        # Use Ollama to process the extraction
        ollama_response = ollama.chat(
            model='sroecker/nuextract-tiny-v1.5',
            messages=[
                {
                    'role': 'system',
                    'content': 'Extract.',
                },
                {
                    'role': 'user',
                    'content': prompt,
                },
            ],
            options={
                'temperature': 0
            }
        )

        # Parse and store the response; a reply that is not valid JSON is
        # reported and left out of the results.
        try:
            extracted_data = json.loads(ollama_response['message']['content'])
        except json.JSONDecodeError:
            print(f"Failed to parse JSON from {filename}")
        else:
            extraction_results.append(extracted_data)
            print(f"Extracted data from {filename}")

    # Write results to output JSON file
    with open(args.output, 'w', encoding='utf-8') as outfile:
        json.dump(extraction_results, outfile, indent=2)
    print(f"Extraction complete. Results saved to {args.output}")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"jesuit_report": { | |
"letter": { | |
"letter_author": {}, | |
"baptisms": { | |
"individuals": { | |
"name": [], | |
"relations": [], | |
"location": [] | |
}, | |
"sponsors": { | |
"name": [], | |
"date": [] | |
}, | |
"letter_written_location": [] | |
} | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[3] A Letter Missive in regard to the Conversion and Baptism of the | |
Grand Sagamore of new France, who was, before the arrival of the | |
French, its chief and sovereign. | |
SIR and Brother, I did not wish the ship to depart without giving you | |
some news of this country which I believe will be acceptable, as I know | |
that you are a good Catholic. The Grand Sagamore, whom we call in our | |
language Grand Captain of the Savages, and chief of all, was baptized | |
on last saint John the Baptist's day; [4] with his wife, children, and | |
children's children, to the number of twenty; with as much enthusiasm, | |
fervor, and zeal for Religion as would have been evinced by a person | |
who had been instructed in it for three or four years. He promises to | |
have the others baptized, or else make war upon them. [_The news of the | |
King's death had not then reached Canada._] Monsieur de Poutrincourt | |
and his son acted as sponsors for them in the name of the King, and | |
of Monseigneur the Dauphin. We have already made this good beginning, | |
which I believe will become still better hereafter. As to the country, | |
I have never seen anything so beautiful, better, or more fertile; | |
and I can say to you, truly and honestly, that if I had three or | |
four Laborers with me now, and [5] the means of supporting them for | |
one year, and some wheat to sow in the ground tilled by their labor | |
alone, I should expect to have a yearly trade in Beaver and other | |
Skins amounting to seven or eight thousand livres, with the surplus | |
which would remain to me after their support. I am very sorry that I | |
did not know before my departure what I know now; if I had, I should | |
have left no stone unturned to bring with me two or three farmers, | |
and two hogsheads of wheat, which is a mere trifle. I assure you it | |
is delightful to engage in trade over here and to make such handsome | |
profits. If you wish to take a hand in it, let me know your intentions | |
by the bearer, who desires to return and traffic here in pursuance of | |
what he has seen. I [6] shall say no more, except to pray God to give | |
you, Sir and Brother, a long life and perfect health. From Port Royal, | |
New France, this 28th of June, 1610. | |
_Your very affectionate Brother and servant_, | |
BERTRAND. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Extraction Result: | |
{ | |
"jesuit_report": { | |
"letter": { | |
"letter_author": {}, | |
"baptisms": { | |
"individuals": { | |
"name": [ | |
"Grand Sagamore" | |
], | |
"relations": [], | |
"location": [] | |
}, | |
"sponsors": { | |
"name": [ | |
"Monsieur de Poutrincourt", | |
"his son" | |
], | |
"date": [ | |
"28th of June, 1610" | |
] | |
}, | |
"letter_written_location": [ | |
"Port Royal", | |
"New France" | |
] | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import argparse | |
def json_to_csv(json_data, output_file='output.csv'):
    """Flatten nested JSON objects and write them as rows of a CSV file.

    Nested dictionaries are flattened by joining their keys with ``_``;
    lists become a single ``', '``-separated string. The header is the
    sorted union of all flattened keys, and a row that lacks a key gets an
    empty cell for it.

    Args:
        json_data: A list of JSON objects, a single JSON object (written as
            one row), or a JSON string encoding either of those.
        output_file: Path of the CSV file to create.
    """
    # Parse JSON if it's a string
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    # A lone object would otherwise be iterated key by key and fail inside
    # flatten_json; treat it as a one-row table instead.
    if isinstance(json_data, dict):
        json_data = [json_data]

    def flatten_json(data, prefix=''):
        """Return a one-level dict with ``_``-joined keys for nested dicts."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key
            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                # Handle lists by converting to comma-separated strings
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    # Flatten each entry in the list
    flat_data = [flatten_json(entry) for entry in json_data]

    # Entries may have different shapes, so the header is the union of keys.
    all_keys = {key for entry in flat_data for key in entry}

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys), extrasaction='ignore')
        writer.writeheader()
        writer.writerows(flat_data)
    print(f"CSV file '{output_file}' has been created.")
def main():
    """Command-line entry point: convert a JSON file to CSV.

    Takes the input JSON path as a positional argument and an optional
    ``-o/--output`` CSV path (default ``output.csv``). Prints an error
    message instead of raising when the input is missing, is not valid
    JSON, or the output cannot be written.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Read JSON file. Only the read is guarded here so that a problem with
    # the output path is never reported as a missing input file.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    # Convert to CSV
    try:
        json_to_csv(json_data, args.output)
    except json.JSONDecodeError:
        # json_to_csv re-parses a top-level JSON string; keep reporting a
        # failure there as invalid input.
        print(f"Error: Invalid JSON in {args.input_file}")
    except OSError as err:
        print(f"Error: Could not write {args.output}: {err}")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import argparse | |
def json_to_csv(json_data, output_file='output.csv'):
    """Flatten one nested JSON object and write it as a single-row CSV file.

    Nested dictionaries are flattened by joining their keys with ``_``;
    lists become a single ``', '``-separated string. The header follows the
    order in which the flattened keys are encountered.

    Args:
        json_data: A JSON object (dict) or a JSON string encoding one.
        output_file: Path of the CSV file to create.
    """
    # Parse JSON if it's a string
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    def flatten_json(data, prefix=''):
        """Return a one-level dict with ``_``-joined keys for nested dicts."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key
            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    # Flatten the JSON
    flat_data = flatten_json(json_data)

    # Write to CSV. The encoding is pinned to UTF-8 so non-ASCII values do
    # not depend on the platform's default locale encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(flat_data))
        writer.writeheader()
        writer.writerow(flat_data)
    print(f"CSV file '{output_file}' has been created.")
def main():
    """Command-line entry point: convert a JSON file to CSV.

    Takes the input JSON path as a positional argument and an optional
    ``-o/--output`` CSV path (default ``output.csv``). Prints an error
    message instead of raising when the input is missing, is not valid
    JSON, or the output cannot be written.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Read JSON file. Only the read is guarded here so that a problem with
    # the output path is never reported as a missing input file.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    # Convert to CSV
    try:
        json_to_csv(json_data, args.output)
    except json.JSONDecodeError:
        # json_to_csv re-parses a top-level JSON string; keep reporting a
        # failure there as invalid input.
        print(f"Error: Invalid JSON in {args.input_file}")
    except OSError as err:
        print(f"Error: Could not write {args.output}: {err}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment