shawngraham · November 21, 2024 15:48
diff --git a/1-nuextractor.py b/1-nuextractor.py
 import ollama
 import argparse

 def read_file(file):
    """Return the text of *file*, or ``None`` if the path does not exist.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Args:
        file: Path of the file to read.

    Returns:
        The full file contents as a string, or ``None`` when the file is
        missing (a message is printed in that case).
    """
    try:
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
        return None

 def main():
    """Extract structured data from one text file and print the model reply.

    Command-line arguments:
        template: Path to the JSON template file.
        text: Path to the input text file.

    Both files are read, combined into a single prompt and sent through
    ``ollama.chat``; the content of the returned message is printed. If
    either file is missing or empty, a message is printed and nothing is
    sent.
    """
    parser = argparse.ArgumentParser(description='Extract information using Ollama')
    parser.add_argument('template', help='Path to template JSON file')
    parser.add_argument('text', help='Path to input text file')
    args = parser.parse_args()

    # Read both inputs before checking, so a "could not be found" message is
    # printed for every path that is missing, not just the first one.
    template_content = read_file(args.template)
    text_content = read_file(args.text)
    if not template_content or not text_content:
        print("Failed to read input files.")
        return

    # Assemble the prompt from explicit pieces rather than a multi-line
    # literal, so source indentation can never leak into the prompt text.
    prompt = (
        "<|input|>\n"
        f"###Template:\n{template_content}\n"
        f"###Text:\n{text_content}\n"
        "<|output|>"
    )

    ollama_response = ollama.chat(
        model='sroecker/nuextract-tiny-v1.5',
        messages=[
            {'role': 'system', 'content': 'Extract.'},
            {'role': 'user', 'content': prompt},
        ],
        options={'temperature': 0},
    )

    print("Extraction Result:")
    print(ollama_response['message']['content'])


 if __name__ == "__main__":
    main()
diff --git a/1.1-nuextractor-batch.py b/1.1-nuextractor-batch.py
 import ollama
 import argparse
 import os
 import json

 def read_file(file):
    """Return the text of *file*, or ``None`` if the path does not exist.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Args:
        file: Path of the file to read.

    Returns:
        The full file contents as a string, or ``None`` when the file is
        missing (a message is printed in that case).
    """
    try:
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file {file} could not be found.")
        return None

 def main():
    """Run the extraction over every ``.txt`` file in a folder and save JSON.

    Command-line arguments:
        template: Path to the JSON template file.
        input_folder: Folder containing the input text files.
        -o / --output: Output JSON file name (default ``output.json``).

    Each text file is combined with the template into a prompt and sent
    through ``ollama.chat``. Replies that parse as JSON are collected into a
    list, which is written to the output file. Files that are missing or
    empty, and replies that are not valid JSON, are skipped (the latter with
    a message).
    """
    parser = argparse.ArgumentParser(description='Batch extract information using Ollama')
    parser.add_argument('template', help='Path to template JSON file')
    parser.add_argument('input_folder', help='Path to folder containing input text files')
    parser.add_argument('-o', '--output', default='output.json',
                        help='Output JSON file name (default: output.json)')
    args = parser.parse_args()

    template_content = read_file(args.template)
    if not template_content:
        return

    try:
        # os.listdir returns names in arbitrary order; sorting makes the
        # order of records in the output file reproducible between runs.
        filenames = sorted(os.listdir(args.input_folder))
    except OSError as exc:
        print(f"Could not list input folder {args.input_folder}: {exc}")
        return

    extraction_results = []
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        text_content = read_file(os.path.join(args.input_folder, filename))
        if not text_content:
            continue

        # Assemble the prompt from explicit pieces rather than a multi-line
        # literal, so source indentation can never leak into the prompt text.
        prompt = (
            "<|input|>\n"
            f"###Template:\n{template_content}\n"
            f"###Text:\n{text_content}\n"
            "<|output|>"
        )

        ollama_response = ollama.chat(
            model='sroecker/nuextract-tiny-v1.5',
            messages=[
                {'role': 'system', 'content': 'Extract.'},
                {'role': 'user', 'content': prompt},
            ],
            options={'temperature': 0},
        )

        # Only the parse step is guarded; a reply that is not valid JSON is
        # reported and left out of the results.
        try:
            extracted_data = json.loads(ollama_response['message']['content'])
        except json.JSONDecodeError:
            print(f"Failed to parse JSON from {filename}")
            continue
        extraction_results.append(extracted_data)
        print(f"Extracted data from {filename}")

    with open(args.output, 'w') as outfile:
        json.dump(extraction_results, outfile, indent=2)

    print(f"Extraction complete. Results saved to {args.output}")


 if __name__ == "__main__":
    main()
diff --git a/2-template.json b/2-template.json
 {
    "jesuit_report": {
        "letter": {
            "letter_author": {},
            "baptisms": {
                "individuals": {
                    "name": [],
                    "relations": [],
                    "location": []
                },
                "sponsors": {
                    "name": [],
                    "date": []
                },
                "letter_written_location": []
            }
        }
    }
 }
diff --git a/3-test.txt b/3-test.txt
 [3] A Letter Missive in regard to the Conversion and Baptism of the
 Grand Sagamore of new France, who was, before the arrival of the
 French, its chief and sovereign.


 SIR and Brother, I did not wish the ship to depart without giving you
 some news of this country which I believe will be acceptable, as I know
 that you are a good Catholic. The Grand Sagamore, whom we call in our
 language Grand Captain of the Savages, and chief of all, was baptized
 on last saint John the Baptist's day; [4] with his wife, children, and
 children's children, to the number of twenty; with as much enthusiasm,
 fervor, and zeal for Religion as would have been evinced by a person
 who had been instructed in it for three or four years. He promises to
 have the others baptized, or else make war upon them. [_The news of the
 King's death had not then reached Canada._] Monsieur de Poutrincourt
 and his son acted as sponsors for them in the name of the King, and
 of Monseigneur the Dauphin. We have already made this good beginning,
 which I believe will become still better hereafter. As to the country,
 I have never seen anything so beautiful, better, or more fertile;
 and I can say to you, truly and honestly, that if I had three or
 four Laborers with me now, and [5] the means of supporting them for
 one year, and some wheat to sow in the ground tilled by their labor
 alone, I should expect to have a yearly trade in Beaver and other
 Skins amounting to seven or eight thousand livres, with the surplus
 which would remain to me after their support. I am very sorry that I
 did not know before my departure what I know now; if I had, I should
 have left no stone unturned to bring with me two or three farmers,
 and two hogsheads of wheat, which is a mere trifle. I assure you it
 is delightful to engage in trade over here and to make such handsome
 profits. If you wish to take a hand in it, let me know your intentions
 by the bearer, who desires to return and traffic here in pursuance of
 what he has seen. I [6] shall say no more, except to pray God to give
 you, Sir and Brother, a long life and perfect health. From Port Royal,
 New France, this 28th of June, 1610.

  _Your very affectionate Brother and servant_,
  BERTRAND.
diff --git a/4-extraction-result.txt b/4-extraction-result.txt
 Extraction Result:
 {
    "jesuit_report": {
        "letter": {
            "letter_author": {},
            "baptisms": {
                "individuals": {
                    "name": [
                        "Grand Sagamore"
                    ],
                    "relations": [],
                    "location": []
                },
                "sponsors": {
                    "name": [
                        "Monsieur de Poutrincourt",
                        "his son"
                    ],
                    "date": [
                        "28th of June, 1610"
                    ]
                },
                "letter_written_location": [
                    "Port Royal",
                    "New France"
                ]
            }
        }
    }
diff --git a/batch-json2csv.py b/batch-json2csv.py
 import json
 import csv
 import argparse

 def json_to_csv(json_data, output_file='output.csv'):
    """Flatten JSON records and write them to a CSV file, one row per record.

    Args:
        json_data: A list of JSON objects (dicts), a single JSON object, or a
            JSON string encoding either of those.
        output_file: Path of the CSV file to create (written as UTF-8).

    Nested objects are flattened by joining their keys with ``_``; a list
    becomes one cell holding its items joined by ``", "``. The header is the
    sorted union of the flattened keys of all records.
    """
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    # A lone object is treated as a one-record batch; iterating a dict
    # directly would yield its keys rather than records.
    if isinstance(json_data, dict):
        json_data = [json_data]

    def flatten_json(data, prefix=''):
        """Return *data* as a single-level dict with ``_``-joined keys."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key

            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    flat_data = [flatten_json(entry) for entry in json_data]

    # Records may have different keys, so the header is built from all of them.
    all_keys = set()
    for entry in flat_data:
        all_keys.update(entry)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys))
        writer.writeheader()
        writer.writerows(flat_data)

    print(f"CSV file '{output_file}' has been created.")

 def main():
    """Command-line entry point: load a JSON file and convert it to CSV.

    Command-line arguments:
        input_file: Input JSON file.
        -o / --output: Output CSV file name (default ``output.csv``).

    Problems reading or parsing the input and problems writing the output
    are reported with separate messages instead of a traceback.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Keep this try limited to the read/parse step so that only input
    # problems are reported as input problems.
    try:
        with open(args.input_file, 'r') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    try:
        json_to_csv(json_data, args.output)
    except json.JSONDecodeError:
        # json_to_csv re-parses a top-level JSON string, which may be invalid.
        print(f"Error: Invalid JSON in {args.input_file}")
    except OSError as exc:
        print(f"Error: Could not write {args.output}: {exc}")


 if __name__ == "__main__":
    main()
diff --git a/json2csv.py b/json2csv.py
 import json
 import csv
 import argparse

 def json_to_csv(json_data, output_file='output.csv'):
    """Flatten one JSON object and write it to a CSV file as a single row.

    Args:
        json_data: A JSON object (dict), or a JSON string encoding one.
        output_file: Path of the CSV file to create (written as UTF-8).

    Nested objects are flattened by joining their keys with ``_``; a list
    becomes one cell holding its items joined by ``", "``. Columns appear in
    the order the keys are encountered.
    """
    if isinstance(json_data, str):
        json_data = json.loads(json_data)

    def flatten_json(data, prefix=''):
        """Return *data* as a single-level dict with ``_``-joined keys."""
        flat_dict = {}
        for key, value in data.items():
            new_key = f"{prefix}{key}" if prefix else key

            if isinstance(value, dict):
                flat_dict.update(flatten_json(value, f"{new_key}_"))
            elif isinstance(value, list):
                flat_dict[new_key] = ', '.join(map(str, value))
            else:
                flat_dict[new_key] = value
        return flat_dict

    flat_data = flatten_json(json_data)

    # Write UTF-8 explicitly so non-ASCII values do not depend on the
    # platform's locale encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(flat_data))
        writer.writeheader()
        writer.writerow(flat_data)

    print(f"CSV file '{output_file}' has been created.")

 def main():
    """Command-line entry point: load a JSON file and convert it to CSV.

    Command-line arguments:
        input_file: Input JSON file.
        -o / --output: Output CSV file name (default ``output.csv``).

    Problems reading or parsing the input and problems writing the output
    are reported with separate messages instead of a traceback.
    """
    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
    parser.add_argument('input_file', help='Input JSON file')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='Output CSV file name (default: output.csv)')
    args = parser.parse_args()

    # Keep this try limited to the read/parse step so that only input
    # problems are reported as input problems.
    try:
        with open(args.input_file, 'r') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File {args.input_file} not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {args.input_file}")
        return

    try:
        json_to_csv(json_data, args.output)
    except json.JSONDecodeError:
        # json_to_csv re-parses a top-level JSON string, which may be invalid.
        print(f"Error: Invalid JSON in {args.input_file}")
    except OSError as exc:
        print(f"Error: Could not write {args.output}: {exc}")


 if __name__ == "__main__":
    main()
	import ollama
	import argparse

	def read_file(file):
	    """Return the text of *file*, or ``None`` if the path does not exist.

	    The file is decoded as UTF-8 explicitly so the result does not depend
	    on the platform's locale encoding.

	    Args:
	        file: Path of the file to read.

	    Returns:
	        The full file contents as a string, or ``None`` when the file is
	        missing (a message is printed in that case).
	    """
	    try:
	        with open(file, 'r', encoding='utf-8') as f:
	            return f.read()
	    except FileNotFoundError:
	        print(f"The file {file} could not be found.")
	        return None

	def main():
	    """Extract structured data from one text file and print the model reply.

	    Command-line arguments:
	        template: Path to the JSON template file.
	        text: Path to the input text file.

	    Both files are read, combined into a single prompt and sent through
	    ``ollama.chat``; the content of the returned message is printed. If
	    either file is missing or empty, a message is printed and nothing is
	    sent.
	    """
	    parser = argparse.ArgumentParser(description='Extract information using Ollama')
	    parser.add_argument('template', help='Path to template JSON file')
	    parser.add_argument('text', help='Path to input text file')
	    args = parser.parse_args()

	    # Read both inputs before checking, so a "could not be found" message
	    # is printed for every path that is missing, not just the first one.
	    template_content = read_file(args.template)
	    text_content = read_file(args.text)
	    if not template_content or not text_content:
	        print("Failed to read input files.")
	        return

	    # Assemble the prompt from explicit pieces rather than a multi-line
	    # literal, so source indentation can never leak into the prompt text.
	    prompt = (
	        "<|input|>\n"
	        f"###Template:\n{template_content}\n"
	        f"###Text:\n{text_content}\n"
	        "<|output|>"
	    )

	    ollama_response = ollama.chat(
	        model='sroecker/nuextract-tiny-v1.5',
	        messages=[
	            {'role': 'system', 'content': 'Extract.'},
	            {'role': 'user', 'content': prompt},
	        ],
	        options={'temperature': 0},
	    )

	    print("Extraction Result:")
	    print(ollama_response['message']['content'])


	if __name__ == "__main__":
	    main()
	import ollama
	import argparse
	import os
	import json

	def read_file(file):
	    """Return the text of *file*, or ``None`` if the path does not exist.

	    The file is decoded as UTF-8 explicitly so the result does not depend
	    on the platform's locale encoding.

	    Args:
	        file: Path of the file to read.

	    Returns:
	        The full file contents as a string, or ``None`` when the file is
	        missing (a message is printed in that case).
	    """
	    try:
	        with open(file, 'r', encoding='utf-8') as f:
	            return f.read()
	    except FileNotFoundError:
	        print(f"The file {file} could not be found.")
	        return None

	def main():
	    """Run the extraction over every ``.txt`` file in a folder and save JSON.

	    Command-line arguments:
	        template: Path to the JSON template file.
	        input_folder: Folder containing the input text files.
	        -o / --output: Output JSON file name (default ``output.json``).

	    Each text file is combined with the template into a prompt and sent
	    through ``ollama.chat``. Replies that parse as JSON are collected into
	    a list, which is written to the output file. Files that are missing or
	    empty, and replies that are not valid JSON, are skipped (the latter
	    with a message).
	    """
	    parser = argparse.ArgumentParser(description='Batch extract information using Ollama')
	    parser.add_argument('template', help='Path to template JSON file')
	    parser.add_argument('input_folder', help='Path to folder containing input text files')
	    parser.add_argument('-o', '--output', default='output.json',
	                        help='Output JSON file name (default: output.json)')
	    args = parser.parse_args()

	    template_content = read_file(args.template)
	    if not template_content:
	        return

	    try:
	        # os.listdir returns names in arbitrary order; sorting makes the
	        # order of records in the output file reproducible between runs.
	        filenames = sorted(os.listdir(args.input_folder))
	    except OSError as exc:
	        print(f"Could not list input folder {args.input_folder}: {exc}")
	        return

	    extraction_results = []
	    for filename in filenames:
	        if not filename.endswith('.txt'):
	            continue

	        text_content = read_file(os.path.join(args.input_folder, filename))
	        if not text_content:
	            continue

	        # Assemble the prompt from explicit pieces rather than a multi-line
	        # literal, so source indentation can never leak into the prompt.
	        prompt = (
	            "<|input|>\n"
	            f"###Template:\n{template_content}\n"
	            f"###Text:\n{text_content}\n"
	            "<|output|>"
	        )

	        ollama_response = ollama.chat(
	            model='sroecker/nuextract-tiny-v1.5',
	            messages=[
	                {'role': 'system', 'content': 'Extract.'},
	                {'role': 'user', 'content': prompt},
	            ],
	            options={'temperature': 0},
	        )

	        # Only the parse step is guarded; a reply that is not valid JSON is
	        # reported and left out of the results.
	        try:
	            extracted_data = json.loads(ollama_response['message']['content'])
	        except json.JSONDecodeError:
	            print(f"Failed to parse JSON from {filename}")
	            continue
	        extraction_results.append(extracted_data)
	        print(f"Extracted data from {filename}")

	    with open(args.output, 'w') as outfile:
	        json.dump(extraction_results, outfile, indent=2)

	    print(f"Extraction complete. Results saved to {args.output}")


	if __name__ == "__main__":
	    main()
	{
	"jesuit_report": {
	"letter": {
	"letter_author": {},
	"baptisms": {
	"individuals": {
	"name": [],
	"relations": [],
	"location": []
	},
	"sponsors": {
	"name": [],
	"date": []
	},
	"letter_written_location": []
	}
	}
	}
	}
	[3] A Letter Missive in regard to the Conversion and Baptism of the
	Grand Sagamore of new France, who was, before the arrival of the
	French, its chief and sovereign.


	SIR and Brother, I did not wish the ship to depart without giving you
	some news of this country which I believe will be acceptable, as I know
	that you are a good Catholic. The Grand Sagamore, whom we call in our
	language Grand Captain of the Savages, and chief of all, was baptized
	on last saint John the Baptist's day; [4] with his wife, children, and
	children's children, to the number of twenty; with as much enthusiasm,
	fervor, and zeal for Religion as would have been evinced by a person
	who had been instructed in it for three or four years. He promises to
	have the others baptized, or else make war upon them. [_The news of the
	King's death had not then reached Canada._] Monsieur de Poutrincourt
	and his son acted as sponsors for them in the name of the King, and
	of Monseigneur the Dauphin. We have already made this good beginning,
	which I believe will become still better hereafter. As to the country,
	I have never seen anything so beautiful, better, or more fertile;
	and I can say to you, truly and honestly, that if I had three or
	four Laborers with me now, and [5] the means of supporting them for
	one year, and some wheat to sow in the ground tilled by their labor
	alone, I should expect to have a yearly trade in Beaver and other
	Skins amounting to seven or eight thousand livres, with the surplus
	which would remain to me after their support. I am very sorry that I
	did not know before my departure what I know now; if I had, I should
	have left no stone unturned to bring with me two or three farmers,
	and two hogsheads of wheat, which is a mere trifle. I assure you it
	is delightful to engage in trade over here and to make such handsome
	profits. If you wish to take a hand in it, let me know your intentions
	by the bearer, who desires to return and traffic here in pursuance of
	what he has seen. I [6] shall say no more, except to pray God to give
	you, Sir and Brother, a long life and perfect health. From Port Royal,
	New France, this 28th of June, 1610.

	_Your very affectionate Brother and servant_,
	BERTRAND.
	Extraction Result:
	{
	"jesuit_report": {
	"letter": {
	"letter_author": {},
	"baptisms": {
	"individuals": {
	"name": [
	"Grand Sagamore"
	],
	"relations": [],
	"location": []
	},
	"sponsors": {
	"name": [
	"Monsieur de Poutrincourt",
	"his son"
	],
	"date": [
	"28th of June, 1610"
	]
	},
	"letter_written_location": [
	"Port Royal",
	"New France"
	]
	}
	}
	}
	import json
	import csv
	import argparse

	def json_to_csv(json_data, output_file='output.csv'):
	    """Flatten JSON records and write them to a CSV file, one row per record.

	    Args:
	        json_data: A list of JSON objects (dicts), a single JSON object, or
	            a JSON string encoding either of those.
	        output_file: Path of the CSV file to create (written as UTF-8).

	    Nested objects are flattened by joining their keys with ``_``; a list
	    becomes one cell holding its items joined by ``", "``. The header is
	    the sorted union of the flattened keys of all records.
	    """
	    if isinstance(json_data, str):
	        json_data = json.loads(json_data)

	    # A lone object is treated as a one-record batch; iterating a dict
	    # directly would yield its keys rather than records.
	    if isinstance(json_data, dict):
	        json_data = [json_data]

	    def flatten_json(data, prefix=''):
	        """Return *data* as a single-level dict with ``_``-joined keys."""
	        flat_dict = {}
	        for key, value in data.items():
	            new_key = f"{prefix}{key}" if prefix else key

	            if isinstance(value, dict):
	                flat_dict.update(flatten_json(value, f"{new_key}_"))
	            elif isinstance(value, list):
	                flat_dict[new_key] = ', '.join(map(str, value))
	            else:
	                flat_dict[new_key] = value
	        return flat_dict

	    flat_data = [flatten_json(entry) for entry in json_data]

	    # Records may have different keys, so the header is built from all of them.
	    all_keys = set()
	    for entry in flat_data:
	        all_keys.update(entry)

	    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys))
	        writer.writeheader()
	        writer.writerows(flat_data)

	    print(f"CSV file '{output_file}' has been created.")

	def main():
	    """Command-line entry point: load a JSON file and convert it to CSV.

	    Command-line arguments:
	        input_file: Input JSON file.
	        -o / --output: Output CSV file name (default ``output.csv``).

	    Problems reading or parsing the input and problems writing the output
	    are reported with separate messages instead of a traceback.
	    """
	    parser = argparse.ArgumentParser(description='Convert JSON to CSV')
	    parser.add_argument('input_file', help='Input JSON file')
	    parser.add_argument('-o', '--output', default='output.csv',
	                        help='Output CSV file name (default: output.csv)')
	    args = parser.parse_args()

	    # Keep this try limited to the read/parse step so that only input
	    # problems are reported as input problems.
	    try:
	        with open(args.input_file, 'r') as f:
	            json_data = json.load(f)
	    except FileNotFoundError:
	        print(f"Error: File {args.input_file} not found.")
	        return
	    except json.JSONDecodeError:
	        print(f"Error: Invalid JSON in {args.input_file}")
	        return

	    try:
	        json_to_csv(json_data, args.output)
	    except json.JSONDecodeError:
	        # json_to_csv re-parses a top-level JSON string, which may be invalid.
	        print(f"Error: Invalid JSON in {args.input_file}")
	    except OSError as exc:
	        print(f"Error: Could not write {args.output}: {exc}")


	if __name__ == "__main__":
	    main()