wadefletch · July 23, 2024 23:44
diff --git a/inference.py b/inference.py
 import asyncio
 import logging
 import re

 import instructor
 import pydantic

 # Import-time side effect: configure root logging at INFO level.
 # (logging.basicConfig does nothing if the root logger already has handlers.)
 logging.basicConfig(level=logging.INFO)
 # Logger named after this module; the functions below log through it.
 logger = logging.getLogger(__name__)


 def segment_sentences(text: str) -> list[str]:
    """Split *text* into sentence-like segments.

    A boundary is either a '.', '?' or '!' followed by whitespace, or a run
    of newlines. The matched boundary is consumed, so a segment loses its
    terminating punctuation unless it is the last thing in *text*.

    Args:
        text: The text to segment.

    Returns:
        The segments in order, each stripped of surrounding whitespace.
        Empty and whitespace-only segments are dropped, so an empty or
        blank *text* yields an empty list.
    """
    logger.debug("Segmenting sentences")
    pieces = re.split(r"[.?!]\s+|\n+", text)
    # re.split yields '' when text starts or ends with a boundary (for
    # example a trailing ".\n"); such segments carry no content, so they are
    # removed here instead of being passed on as sentences.
    stripped_pieces = (piece.strip() for piece in pieces)
    sentences = [piece for piece in stripped_pieces if piece]
    logger.debug(f"Segmented {len(sentences)} sentences")
    return sentences


 async def coreference_resolution(
    model: str, client: instructor.AsyncInstructor, text: str
 ) -> str:
    """Ask the model to rewrite *text* with its coreferences resolved.

    A system prompt (task description plus worked examples) and *text* as
    the user message are sent through ``client.chat.completions.create``
    with ``response_model=str``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used for the request.
        text: The text whose coreferences should be replaced.

    Returns:
        The value returned by the client call.

    Raises:
        Exception: If the client call returns a falsy value.
    """
    logger.info("Performing coreference resolution")

    system_prompt = """
                # Task
                Resolve the coreferences in the given text. 
                Coreferences are pronouns or phrases that refer to the same entity. 
                Replace these coreferences with the appropriate noun to make the text clearer.

                # Instructions
                1. Identify the coreferences in the text.
                2. Replace each coreference with the noun it refers to.

                # Examples
                
                Input: Jason is a 25-year-old software engineer. He likes to code in Python.
                Output: Jason is a 25-year-old software engineer. Jason likes to code in Python.
                
                Input: Minnesota is in the United States, which is in North America.
                Output: Minnesota is in the United States. The United States is in North America.

                Input: Sarah and her friends went to the park. They had a great time.
                Output: Sarah and her friends went to the park. Sarah and her friends had a great time.

                Input: The cat chased the mouse because it was hungry.
                Output: The cat chased the mouse because the cat was hungry.

                Input: The company released its new product. It received great reviews.
                Output: The company released the company's new product. The new product received great reviews.

                # Text to resolve
                """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    resolved = await client.chat.completions.create(
        model=model,
        max_tokens=4096,
        messages=conversation,
        response_model=str,
    )

    # Success path first; an empty/falsy reply falls through to the error.
    if resolved:
        logger.info("Coreference resolution completed")
        return resolved

    logger.error("No result from coreference resolution")
    raise Exception("No result from coreference resolution")


 class ModelOutputTuple(pydantic.BaseModel):
    # One relationship triple: a relationship name plus its source and target.
    # Documented with comments rather than a class docstring so the model's
    # runtime attributes (including __doc__) stay exactly as they were.
    relationship: str
    source: str
    target: str

    def __hash__(self) -> int:
        """Hash on the three field values so instances can be put in sets."""
        identity = (self.source, self.relationship, self.target)
        return hash(identity)


 async def extract_tuples_from_sentence(
    model: str,
    client: instructor.AsyncInstructor,
    text: str,
 ) -> list[ModelOutputTuple]:
    """Ask the model for the relationship triples contained in *text*.

    A knowledge-graph system prompt (guidelines plus worked examples) and
    *text* as the user message are sent through ``client.messages.create``
    with ``response_model=list[ModelOutputTuple]``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used for the request.
        text: The sentence to extract triples from; only its first 50
            characters are written to the log.

    Returns:
        The distinct triples returned by the client, in the order they
        first appear in the response.
    """
    logger.info(f"Extracting tuples from sentence: {text[:50]}...")
    model_tuples = await client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "system",
                "content": """
                You are a world-class knowledge graph builder.
                You are given a sentence and you must extract the relationship triples from the sentence.

                # GUIDELINES
                There can be one or more relationship triples in a sentence.
                Relationship triples consist of the relationship between two named entities, followed by the entities themselves.
                Entities are people, places, ideas, groups, things, and basically any nouns. DO NOT include dates or years as entities.
                Identify entities as clearly as possible. Compound entities (e.g., "Chicago, Illinois") should be treated as separate, related entities.
                
                # EXAMPLES
                INPUT: Katie Got Bandz' favorite rappers were Waka Flocka Flame, Nicki Minaj, Drake, and Lil Wayne.
                OUTPUT: [('influenced_by', 'Katie Got Bandz', 'Waka Flocka Flame'), 
                         ('influenced_by', 'Katie Got Bandz', 'Nicki Minaj'), 
                         ('influenced_by', 'Katie Got Bandz', 'Drake'), 
                         ('influenced_by', 'Katie Got Bandz', 'Lil Wayne')]

                INPUT: The United States is in North America.
                OUTPUT: [('located_in', 'The United States', 'North America')]

                INPUT: Barack Obama is a former United States President. He is from Chicago, Illinois.
                OUTPUT: [('former_president_of', 'Barack Obama', 'United States'),
                         ('from', 'Barack Obama', 'Chicago'),
                         ('located_in', 'Chicago', 'Illinois')]
                NOTE: It is not necessary to also indicate that 'Barack Obama' is from 'Illinois', since this can be deduced from the relationships.

                INPUT: Barack Obama was elected president, defeating Republican Party nominee John McCain in the presidential election and was inaugurated on January 20, 2009.
                OUTPUT: [('candidate', 'Barack Obama', '2008 United States presidential election'),
                         ('candidate', 'John McCain', '2008 United States presidential election'),
                         ('won', 'Barack Obama', '2008 United States presidential election'),
                         ('lost', 'John McCain', '2008 United States presidential election'),
                         ('republican_nominee', 'John McCain', '2008 United States presidential election'),
                         ('democratic_nominee', 'Barack Obama', '2008 United States presidential election'),
                         ('defeated', 'Barack Obama', 'John McCain'),
                         ('in', '2008 United States presidential election', 'United States')]

                INPUT: Chicago is in Illinois. Illinois is in the United States.
                OUTPUT: [('located_in', 'Chicago', 'Illinois'),
                         ('located_in', 'Illinois', 'United States')]

                INPUT: Mark, Brad, and Dan are graduates of Indiana University.
                OUTPUT: [('graduated_from', 'Mark', 'Indiana University'),
                         ('graduated_from', 'Brad', 'Indiana University'),
                         ('graduated_from', 'Dan', 'Indiana University')]
                """,
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        response_model=list[ModelOutputTuple],
    )

    # The logged count is the raw response size, before duplicates are removed.
    logger.info(f"Extracted {len(model_tuples)} tuples from sentence")
    # dict.fromkeys de-duplicates with the same hash/equality a set would use,
    # but keeps first-seen order, so the result order no longer varies from
    # run to run with string hash randomisation.
    return list(dict.fromkeys(model_tuples))


 async def extract_tuples_from_text(
    model: str, client: instructor.AsyncInstructor, text: str
 ):
    """Yield relationship triples extracted from *text*, sentence by sentence.

    The text is first passed through ``coreference_resolution`` and then
    split with ``segment_sentences``. One ``extract_tuples_from_sentence``
    coroutine is created per sentence and the results are consumed with
    ``asyncio.as_completed``, so triples are yielded in completion order.

    Args:
        model: Model identifier forwarded to every request.
        client: Async instructor client used for every request.
        text: The text to process.

    Yields:
        Each triple returned for each sentence. An ``Exception`` raised while
        awaiting or yielding a sentence's triples is logged and that sentence
        is skipped.
    """
    logger.info("Extracting tuples from text")
    resolved_text = await coreference_resolution(model, client, text)
    sentences = segment_sentences(resolved_text)
    logger.info(f"Processing {len(sentences)} sentences")

    pending = []
    for sentence in sentences:
        pending.append(extract_tuples_from_sentence(model, client, sentence))

    for next_finished in asyncio.as_completed(pending):
        try:
            for t in await next_finished:
                logger.debug(f"Yielding tuple: {t}")
                yield t
        except Exception as e:
            logger.error(f"Error processing sentence: {e}")
    logger.info("Finished extracting tuples from text")


 async def extract_keywords_from_query(
    model: str, client: instructor.AsyncInstructor, query: str
 ) -> list[str]:
    """Ask the model for the search keywords contained in *query*.

    A short system prompt and *query* as the user message are sent through
    ``client.chat.completions.create`` with ``response_model=list[str]``.

    Args:
        model: Model identifier forwarded to the client.
        client: Async instructor client used for the request.
        query: The query to extract keywords from; logged in full.

    Returns:
        The value returned by the client call.
    """
    logger.info(f"Extracting keywords from query: {query}")

    system_message = {
        "role": "system",
        "content": """
                You are a world-class knowledge graph builder.
                You are given a query and you must extract the keywords from the query so you can search for the relevant entities in the knowledge graph.
                """,
    }
    user_message = {"role": "user", "content": query}

    result = await client.chat.completions.create(
        model=model,
        max_tokens=4096,
        messages=[system_message, user_message],
        response_model=list[str],
    )
    logger.info(f"Extracted {len(result)} keywords from query")
    return result
	import asyncio
	import logging
	import re

	import instructor
	import pydantic

	# Import-time side effect: configure root logging at INFO level.
	# (logging.basicConfig does nothing if the root logger already has handlers.)
	logging.basicConfig(level=logging.INFO)
	# Logger named after this module; the functions below log through it.
	logger = logging.getLogger(__name__)


	def segment_sentences(text: str) -> list[str]:
	    """Split *text* into sentence-like segments.

	    A boundary is either a '.', '?' or '!' followed by whitespace, or a run
	    of newlines. The matched boundary is consumed, so a segment loses its
	    terminating punctuation unless it is the last thing in *text*.

	    Args:
	        text: The text to segment.

	    Returns:
	        The segments in order, each stripped of surrounding whitespace.
	        Empty and whitespace-only segments are dropped, so an empty or
	        blank *text* yields an empty list.
	    """
	    logger.debug("Segmenting sentences")
	    # The two alternatives must be joined by a bare '|'. An escaped pipe
	    # would demand a literal '|' character between the whitespace and the
	    # newlines, and ordinary text would never be split.
	    pieces = re.split(r"[.?!]\s+|\n+", text)
	    # re.split yields '' when text starts or ends with a boundary; such
	    # segments carry no content, so they are removed here.
	    stripped_pieces = (piece.strip() for piece in pieces)
	    sentences = [piece for piece in stripped_pieces if piece]
	    logger.debug(f"Segmented {len(sentences)} sentences")
	    return sentences


	async def coreference_resolution(
	    model: str, client: instructor.AsyncInstructor, text: str
	) -> str:
	    """Ask the model to rewrite *text* with its coreferences resolved.

	    A system prompt (task description plus worked examples) and *text* as
	    the user message are sent through ``client.chat.completions.create``
	    with ``response_model=str``.

	    Args:
	        model: Model identifier forwarded to the client.
	        client: Async instructor client used for the request.
	        text: The text whose coreferences should be replaced.

	    Returns:
	        The value returned by the client call.

	    Raises:
	        Exception: If the client call returns a falsy value.
	    """
	    logger.info("Performing coreference resolution")
	    # The prompt lines stay flush with the file's left margin so the text
	    # sent to the model is exactly the literal as it appears in this file.
	    result = await client.chat.completions.create(
	        model=model,
	        max_tokens=4096,
	        messages=[
	            {
	                "role": "system",
	                "content": """
	# Task
	Resolve the coreferences in the given text.
	Coreferences are pronouns or phrases that refer to the same entity.
	Replace these coreferences with the appropriate noun to make the text clearer.

	# Instructions
	1. Identify the coreferences in the text.
	2. Replace each coreference with the noun it refers to.

	# Examples

	Input: Jason is a 25-year-old software engineer. He likes to code in Python.
	Output: Jason is a 25-year-old software engineer. Jason likes to code in Python.

	Input: Minnesota is in the United States, which is in North America.
	Output: Minnesota is in the United States. The United States is in North America.

	Input: Sarah and her friends went to the park. They had a great time.
	Output: Sarah and her friends went to the park. Sarah and her friends had a great time.

	Input: The cat chased the mouse because it was hungry.
	Output: The cat chased the mouse because the cat was hungry.

	Input: The company released its new product. It received great reviews.
	Output: The company released the company's new product. The new product received great reviews.

	# Text to resolve
	""",
	            },
	            {"role": "user", "content": text},
	        ],
	        response_model=str,
	    )

	    # An empty/falsy reply is treated as a failure instead of being returned.
	    if not result:
	        logger.error("No result from coreference resolution")
	        raise Exception("No result from coreference resolution")
	    logger.info("Coreference resolution completed")
	    return result


	class ModelOutputTuple(pydantic.BaseModel):
	    # One relationship triple: a relationship name plus its source and target.
	    # Documented with comments rather than a class docstring so the model's
	    # runtime attributes (including __doc__) stay exactly as they were.
	    relationship: str
	    source: str
	    target: str

	    def __hash__(self) -> int:
	        """Hash on the three field values so instances can be put in sets."""
	        return hash((self.source, self.relationship, self.target))


	async def extract_tuples_from_sentence(
	    model: str,
	    client: instructor.AsyncInstructor,
	    text: str,
	) -> list[ModelOutputTuple]:
	    """Ask the model for the relationship triples contained in *text*.

	    A knowledge-graph system prompt (guidelines plus worked examples) and
	    *text* as the user message are sent through ``client.messages.create``
	    with ``response_model=list[ModelOutputTuple]``.

	    Args:
	        model: Model identifier forwarded to the client.
	        client: Async instructor client used for the request.
	        text: The sentence to extract triples from; only its first 50
	            characters are written to the log.

	    Returns:
	        The distinct triples returned by the client, in the order they
	        first appear in the response.
	    """
	    logger.info(f"Extracting tuples from sentence: {text[:50]}...")
	    # The prompt lines stay flush with the file's left margin so the text
	    # sent to the model is exactly the literal as it appears in this file.
	    model_tuples = await client.messages.create(
	        model=model,
	        max_tokens=4096,
	        messages=[
	            {
	                "role": "system",
	                "content": """
	You are a world-class knowledge graph builder.
	You are given a sentence and you must extract the relationship triples from the sentence.

	# GUIDELINES
	There can be one or more relationship triples in a sentence.
	Relationship triples consist of the relationship between two named entities, followed by the entities themselves.
	Entities are people, places, ideas, groups, things, and basically any nouns. DO NOT include dates or years as entities.
	Identify entities as clearly as possible. Compound entities (e.g., "Chicago, Illinois") should be treated as separate, related entities.

	# EXAMPLES
	INPUT: Katie Got Bandz' favorite rappers were Waka Flocka Flame, Nicki Minaj, Drake, and Lil Wayne.
	OUTPUT: [('influenced_by', 'Katie Got Bandz', 'Waka Flocka Flame'),
	('influenced_by', 'Katie Got Bandz', 'Nicki Minaj'),
	('influenced_by', 'Katie Got Bandz', 'Drake'),
	('influenced_by', 'Katie Got Bandz', 'Lil Wayne')]

	INPUT: The United States is in North America.
	OUTPUT: [('located_in', 'The United States', 'North America')]

	INPUT: Barack Obama is a former United States President. He is from Chicago, Illinois.
	OUTPUT: [('former_president_of', 'Barack Obama', 'United States'),
	('from', 'Barack Obama', 'Chicago'),
	('located_in', 'Chicago', 'Illinois')]
	NOTE: It is not necessary to also indicate that 'Barack Obama' is from 'Illinois', since this can be deduced from the relationships.

	INPUT: Barack Obama was elected president, defeating Republican Party nominee John McCain in the presidential election and was inaugurated on January 20, 2009.
	OUTPUT: [('candidate', 'Barack Obama', '2008 United States presidential election'),
	('candidate', 'John McCain', '2008 United States presidential election'),
	('won', 'Barack Obama', '2008 United States presidential election'),
	('lost', 'John McCain', '2008 United States presidential election'),
	('republican_nominee', 'John McCain', '2008 United States presidential election'),
	('democratic_nominee', 'Barack Obama', '2008 United States presidential election'),
	('defeated', 'Barack Obama', 'John McCain'),
	('in', '2008 United States presidential election', 'United States')]

	INPUT: Chicago is in Illinois. Illinois is in the United States.
	OUTPUT: [('located_in', 'Chicago', 'Illinois'),
	('located_in', 'Illinois', 'United States')]

	INPUT: Mark, Brad, and Dan are graduates of Indiana University.
	OUTPUT: [('graduated_from', 'Mark', 'Indiana University'),
	('graduated_from', 'Brad', 'Indiana University'),
	('graduated_from', 'Dan', 'Indiana University')]
	""",
	            },
	            {
	                "role": "user",
	                "content": text,
	            },
	        ],
	        response_model=list[ModelOutputTuple],
	    )

	    # The logged count is the raw response size, before duplicates are removed.
	    logger.info(f"Extracted {len(model_tuples)} tuples from sentence")
	    # dict.fromkeys de-duplicates with the same hash/equality a set would use,
	    # but keeps first-seen order, so the result order no longer varies from
	    # run to run with string hash randomisation.
	    return list(dict.fromkeys(model_tuples))


	async def extract_tuples_from_text(
	    model: str, client: instructor.AsyncInstructor, text: str
	):
	    """Yield relationship triples extracted from *text*, sentence by sentence.

	    The text is first passed through ``coreference_resolution`` and then
	    split with ``segment_sentences``. One ``extract_tuples_from_sentence``
	    coroutine is created per sentence and the results are consumed with
	    ``asyncio.as_completed``, so triples are yielded in completion order.

	    Args:
	        model: Model identifier forwarded to every request.
	        client: Async instructor client used for every request.
	        text: The text to process.

	    Yields:
	        Each triple returned for each sentence. An ``Exception`` raised while
	        awaiting or yielding a sentence's triples is logged and that sentence
	        is skipped.
	    """
	    logger.info("Extracting tuples from text")
	    text = await coreference_resolution(model, client, text)
	    sentences = segment_sentences(text)
	    logger.info(f"Processing {len(sentences)} sentences")
	    tasks = [
	        extract_tuples_from_sentence(model, client, sentence) for sentence in sentences
	    ]
	    for future in asyncio.as_completed(tasks):
	        # Best effort: a failure for one sentence must not end the stream.
	        try:
	            tuples = await future
	            for t in tuples:
	                logger.debug(f"Yielding tuple: {t}")
	                yield t
	        except Exception as e:
	            logger.error(f"Error processing sentence: {e}")
	    logger.info("Finished extracting tuples from text")


	async def extract_keywords_from_query(
	    model: str, client: instructor.AsyncInstructor, query: str
	) -> list[str]:
	    """Ask the model for the search keywords contained in *query*.

	    A short system prompt and *query* as the user message are sent through
	    ``client.chat.completions.create`` with ``response_model=list[str]``.

	    Args:
	        model: Model identifier forwarded to the client.
	        client: Async instructor client used for the request.
	        query: The query to extract keywords from; logged in full.

	    Returns:
	        The value returned by the client call.
	    """
	    logger.info(f"Extracting keywords from query: {query}")
	    # The prompt lines stay flush with the file's left margin so the text
	    # sent to the model is exactly the literal as it appears in this file.
	    result = await client.chat.completions.create(
	        model=model,
	        max_tokens=4096,
	        messages=[
	            {
	                "role": "system",
	                "content": """
	You are a world-class knowledge graph builder.
	You are given a query and you must extract the keywords from the query so you can search for the relevant entities in the knowledge graph.
	""",
	            },
	            {"role": "user", "content": query},
	        ],
	        response_model=list[str],
	    )
	    logger.info(f"Extracted {len(result)} keywords from query")
	    return result