Created
July 23, 2024 23:44
-
-
Save wadefletch/406a74c6e2c96dddc23ebc9f4447d893 to your computer and use it in GitHub Desktop.
Minimal triple extraction workflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import logging | |
import re | |
import instructor | |
import pydantic | |
# Emit INFO-and-above records by default; the debug traces in this module stay
# silent unless the level is lowered.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def segment_sentences(text: str) -> list[str]:
    """Split *text* into rough sentence-sized fragments.

    A boundary is sentence punctuation (``.``, ``?`` or ``!``) followed by
    whitespace, or any run of newlines. The matched delimiter, including the
    punctuation mark, is removed from the output.

    Args:
        text: Raw text to segment.

    Returns:
        The non-blank fragments in document order, each stripped of
        surrounding whitespace. Blank input yields an empty list.
    """
    logger.debug("Segmenting sentences")
    # re.split emits empty or whitespace-only pieces for leading/trailing
    # delimiters and for blank lines; drop them so callers never receive an
    # empty "sentence".
    fragments = (piece.strip() for piece in re.split(r"[.?!]\s+|\n+", text))
    sentences = [fragment for fragment in fragments if fragment]
    logger.debug(f"Segmented {len(sentences)} sentences")
    return sentences
async def coreference_resolution(
    model: str, client: instructor.AsyncInstructor, text: str
) -> str:
    """Rewrite *text* so that pronouns and other coreferences name their referent.

    The text is sent as the user message of a chat completion whose system
    prompt holds the task description and few-shot examples; the response is
    requested as a plain ``str``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used to issue the request.
        text: Text whose coreferences should be resolved.

    Returns:
        The rewritten text, or *text* unchanged when it is empty or
        whitespace-only.

    Raises:
        RuntimeError: If the model returns an empty result.
    """
    if not text.strip():
        # Nothing to resolve: skip the model round trip instead of sending an
        # empty prompt.
        logger.info("Skipping coreference resolution for blank input")
        return text

    logger.info("Performing coreference resolution")
    result = await client.chat.completions.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "system",
                "content": """
# Task
Resolve the coreferences in the given text.
Coreferences are pronouns or phrases that refer to the same entity.
Replace these coreferences with the appropriate noun to make the text clearer.
# Instructions
1. Identify the coreferences in the text.
2. Replace each coreference with the noun it refers to.
# Examples
Input: Jason is a 25-year-old software engineer. He likes to code in Python.
Output: Jason is a 25-year-old software engineer. Jason likes to code in Python.
Input: Minnesota is in the United States, which is in North America.
Output: Minnesota is in the United States. The United States is in North America.
Input: Sarah and her friends went to the park. They had a great time.
Output: Sarah and her friends went to the park. Sarah and her friends had a great time.
Input: The cat chased the mouse because it was hungry.
Output: The cat chased the mouse because the cat was hungry.
Input: The company released its new product. It received great reviews.
Output: The company released the company's new product. The new product received great reviews.
# Text to resolve
""",
            },
            {"role": "user", "content": text},
        ],
        response_model=str,
    )
    if not result:
        logger.error("No result from coreference resolution")
        # RuntimeError is still caught by any existing `except Exception`.
        raise RuntimeError("No result from coreference resolution")
    logger.info("Coreference resolution completed")
    return result
# One extracted knowledge-graph edge: `source` --relationship--> `target`.
# NOTE: documented with comments rather than a class docstring on purpose;
# pydantic copies a model's docstring into its generated JSON schema, which
# would change what is sent along with the extraction request.
class ModelOutputTuple(pydantic.BaseModel):
    relationship: str
    source: str
    target: str

    def __hash__(self):
        """Hash on the (source, relationship, target) triple so instances fit in sets."""
        identity = (self.source, self.relationship, self.target)
        return hash(identity)
async def extract_tuples_from_sentence(
    model: str,
    client: instructor.AsyncInstructor,
    text: str,
) -> list[ModelOutputTuple]:
    """Extract relationship triples from a single sentence.

    The sentence is sent as the user message of a chat completion whose
    system prompt holds the extraction guidelines and few-shot examples; the
    response is requested as ``list[ModelOutputTuple]``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used to issue the request.
        text: The sentence to extract triples from.

    Returns:
        The distinct triples in the order the model produced them. A blank
        sentence yields an empty list without contacting the model.
    """
    if not text.strip():
        # Sentence splitting can produce empty fragments; there is nothing to
        # extract from them, so avoid a pointless request.
        logger.debug("Skipping tuple extraction for blank sentence")
        return []

    logger.info(f"Extracting tuples from sentence: {text[:50]}...")
    # Uses the same `chat.completions.create` entry point as the other
    # request helpers in this module.
    model_tuples = await client.chat.completions.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "system",
                "content": """
You are a world-class knowledge graph builder.
You are given a sentence and you must extract the relationship triples from the sentence.
# GUIDELINES
There can be one or more relationship triples in a sentence.
Relationship triples consist of the relationship between two named entities, followed by the entities themselves.
Entities are people, places, ideas, groups, things, and basically any nouns. DO NOT include dates or years as entities.
Identify entities as clearly as possible. Compound entities (e.g., "Chicago, Illinois") should be treated as separate, related entities.
# EXAMPLES
INPUT: Katie Got Bandz' favorite rappers were Waka Flocka Flame, Nicki Minaj, Drake, and Lil Wayne.
OUTPUT: [('influenced_by', 'Katie Got Bandz', 'Waka Flocka Flame'),
('influenced_by', 'Katie Got Bandz', 'Nicki Minaj'),
('influenced_by', 'Katie Got Bandz', 'Drake'),
('influenced_by', 'Katie Got Bandz', 'Lil Wayne')]
INPUT: The United States is in North America.
OUTPUT: [('located_in', 'The United States', 'North America')]
INPUT: Barack Obama is a former United States President. He is from Chicago, Illinois.
OUTPUT: [('former_president_of', 'Barack Obama', 'United States'),
('from', 'Barack Obama', 'Chicago'),
('located_in', 'Chicago', 'Illinois')]
NOTE: It is not necessary to also indicate that 'Barack Obama' is from 'Illinois', since this can be deduced from the relationships.
INPUT: Barack Obama was elected president, defeating Republican Party nominee John McCain in the presidential election and was inaugurated on January 20, 2009.
OUTPUT: [('candidate', 'Barack Obama', '2008 United States presidential election'),
('candidate', 'John McCain', '2008 United States presidential election'),
('won', 'Barack Obama', '2008 United States presidential election'),
('lost', 'John McCain', '2008 United States presidential election'),
('republican_nominee', 'John McCain', '2008 United States presidential election'),
('democratic_nominee', 'Barack Obama', '2008 United States presidential election'),
('defeated', 'Barack Obama', 'John McCain'),
('in', '2008 United States presidential election', 'United States')]
INPUT: Chicago is in Illinois. Illinois is in the United States.
OUTPUT: [('located_in', 'Chicago', 'Illinois'),
('located_in', 'Illinois', 'United States')]
INPUT: Mark, Brad, and Dan are graduates of Indiana University.
OUTPUT: [('graduated_from', 'Mark', 'Indiana University'),
('graduated_from', 'Brad', 'Indiana University'),
('graduated_from', 'Dan', 'Indiana University')]
""",
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        response_model=list[ModelOutputTuple],
    )
    logger.info(f"Extracted {len(model_tuples)} tuples from sentence")
    # dict.fromkeys de-duplicates via the same __hash__/__eq__ a set would use
    # but keeps first-seen order, so results are stable from run to run.
    return list(dict.fromkeys(model_tuples))
async def extract_tuples_from_text(
    model: str, client: instructor.AsyncInstructor, text: str
):
    """Yield relationship triples extracted from *text*, sentence by sentence.

    This is an async generator. The text first goes through
    ``coreference_resolution`` and ``segment_sentences``; every sentence is
    then handed to ``extract_tuples_from_sentence`` concurrently, and triples
    are yielded as each sentence finishes (completion order, not document
    order).

    A sentence whose extraction raises is logged and skipped so one failure
    does not abort the rest. Errors from coreference resolution propagate.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used to issue the requests.
        text: Source text to mine for triples.

    Yields:
        Each triple returned for each successfully processed sentence.
    """
    logger.info("Extracting tuples from text")
    text = await coreference_resolution(model, client, text)
    sentences = segment_sentences(text)
    logger.info(f"Processing {len(sentences)} sentences")
    # Keep explicit Task handles so outstanding requests can be cancelled if
    # the consumer stops iterating early.
    tasks = [
        asyncio.create_task(extract_tuples_from_sentence(model, client, sentence))
        for sentence in sentences
    ]
    try:
        for future in asyncio.as_completed(tasks):
            # Guard only the await: an exception thrown into the generator at
            # the yield below belongs to the consumer and must not be
            # swallowed as a sentence failure.
            try:
                tuples = await future
            except Exception as e:
                logger.exception(f"Error processing sentence: {e}")
                continue
            for t in tuples:
                logger.debug(f"Yielding tuple: {t}")
                yield t
    finally:
        # cancel() is a no-op for tasks that have already finished.
        for task in tasks:
            task.cancel()
    logger.info("Finished extracting tuples from text")
async def extract_keywords_from_query(
    model: str, client: instructor.AsyncInstructor, query: str
) -> list[str]:
    """Ask the model for the search keywords contained in *query*.

    The query is sent as the user message of a chat completion, preceded by a
    system prompt describing the keyword-extraction task; the response is
    requested as ``list[str]``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used to issue the request.
        query: The user query to extract keywords from.

    Returns:
        The keywords returned by the model.
    """
    logger.info(f"Extracting keywords from query: {query}")
    system_message = {
        "role": "system",
        "content": """
You are a world-class knowledge graph builder.
You are given a query and you must extract the keywords from the query so you can search for the relevant entities in the knowledge graph.
""",
    }
    user_message = {"role": "user", "content": query}
    keywords = await client.chat.completions.create(
        model=model,
        max_tokens=4096,
        messages=[system_message, user_message],
        response_model=list[str],
    )
    logger.info(f"Extracted {len(keywords)} keywords from query")
    return keywords
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment