This script runs data checks on your input files when using the Search Tuning feature in Google Cloud Vertex AI Agent Builder (https://cloud.google.com/generative-ai-app-builder/docs/tune-search). Usage: `python check.py <corpus_path> <query_path> <scoring_path>`
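Based on the file formats described in the tuning docs, the three inputs look roughly like this (ids, text, and scores are illustrative):

```
# query.jsonl / corpus.jsonl: one JSON object per line, with _id and text
{"_id": "q1", "text": "What is the refund policy?"}
{"_id": "c1", "text": "Refunds are issued within 30 days of purchase ..."}

# scoring.tsv: tab-separated with a header; score is a non-negative integer
query-id	corpus-id	score
q1	c1	1
```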
import sys

import jsonlines
import pandas as pd

def jsonl_to_df(file_path):
    data = []
    with jsonlines.open(file_path) as reader:
        for line in reader:
            # Collect each line of the JSONL file as a record
            data.append(line)
    return pd.DataFrame(data)

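# Example (illustrative): jsonl_to_df('corpus.jsonl') yields a DataFrame with
# one row per JSONL line, e.g. columns '_id' and 'text' for the corpus file.
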
def prep_full_dataset(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                      scoring_data: pd.DataFrame) -> pd.DataFrame:
    # A segment referenced in the scoring data must have a corresponding
    # segment in the corpus data, but a segment in the corpus data does not
    # necessarily appear in the scoring data. Hence we left join the scoring
    # data onto the corpus data.
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='left')
    # Likewise, a query referenced in the scoring data must exist in the query
    # data, but a query in the query data does not necessarily appear in the
    # scoring data. Hence we left join the query data onto the merged result.
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='left')
    return full_dataset

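# A minimal illustration of the join semantics above (hypothetical ids):
# a corpus row 'c1' with a scoring row (q1, c1, 1) produces one fully
# populated merged row, while a corpus row 'c2' with no scoring entry
# survives the left join with NaN scoring/query columns, which is how the
# checks below can treat it as a random negative.
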
def check_training_queries(query_data: pd.DataFrame) -> bool:
    """
    [Training queries](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100.
    """
    return query_data.shape[0] >= 100

def check_extractive_segments(full_dataset: pd.DataFrame,
                              query_data: pd.DataFrame) -> tuple:
    """
    [Extractive segments](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    You must provide two types of extractive segments:
    - Segments that contain relevant information needed to answer the training queries. These are segments that have positive matching with queries.
    - Segments that are not associated with any training queries. These segments are used as random negatives in the model tuning.
    Provide at least one extractive segment per query and at least 10,000 additional extractive segments.
    """
    # Check that every query has at least one matching extractive segment.
    # Note: groupby().size().min() > 0 is always true for groups that exist,
    # so instead compare the full set of query ids against the set of query
    # ids that actually matched a corpus segment.
    matched_query_ids = set(full_dataset['query-id'].dropna())
    all_query_ids = set(query_data['query-id'])
    extractive_segments_per_query = all_query_ids <= matched_query_ids
    extractive_segments_per_query_result_string = f"|___ Subcheck: At least one extractive segment per query: {get_result(extractive_segments_per_query)}"
    return (extractive_segments_per_query,
            extractive_segments_per_query_result_string)

def check_relevance_scores(full_dataset: pd.DataFrame) -> bool:
    """
    [Relevance scores](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100 relevant scores and, optionally, additional non-relevant scores.
    """
    return full_dataset[full_dataset['score'] > 0].shape[0] >= 100

def check_datasets(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                   scoring_data: pd.DataFrame) -> None:
    # Outer joins keep unmatched rows on both sides so mismatches can be counted
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='outer')
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='outer')
    print(
        f"Number of segments in Corpus file that don't have a match in Scoring file: {full_dataset[~full_dataset['corpus-id'].isna() & full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of segments in Scoring file that don't have a match in Corpus file: {full_dataset[full_dataset['corpus-id'].isna() & ~full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Query file that don't have a match in Scoring file: {full_dataset[~full_dataset['query-id'].isna() & full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Scoring file that don't have a match in Query file: {full_dataset[full_dataset['query-id'].isna() & ~full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print()

def check_corpus_file(full_dataset: pd.DataFrame) -> tuple:
    """
    The [corpus file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#corpus) contains extractive segments: segments that contain information to answer the queries in the query file and many additional segments to be used as random negatives when tuning the model. You should have at least 100 segments that contain query answers; queries can be answered by multiple segments. You should also have at least 10,000 random segments.
    Otherwise, programmatically create random segments of 250–500 words from the documents in your data store and add those to the corpus file.
    The corpus file is a JSONL (JSON lines) file where each line has the fields _id and text with string values. The maximum size of the file is 250,000 lines.
    """
    # Check if there are at least 100 distinct segments that contain query
    # answers (count unique segment ids, since the merge duplicates a segment
    # once per matching query)
    query_answers = full_dataset.loc[~full_dataset['query-id'].isna(),
                                     'corpus-id'].nunique() >= 100
    query_answers_result_string = f"|___ Subcheck: At least 100 segments that contain query answers: {get_result(query_answers)}"
    # Check if there are at least 10,000 random segments, i.e. segments with
    # no matching query
    random_segments_min = full_dataset.loc[full_dataset['query-id'].isna(),
                                           'corpus-id'].nunique() >= 10000
    random_segments_min_result_string = f"|___ Subcheck: At least 10,000 random segments: {get_result(random_segments_min)}"
    # Check that the corpus file does not exceed the 250,000-line maximum
    corpus_size_max = full_dataset['corpus-id'].nunique() <= 250000
    corpus_size_max_result_string = f"|___ Subcheck: At most 250,000 lines in the corpus file: {get_result(corpus_size_max)}"
    return (query_answers and random_segments_min and corpus_size_max,
            query_answers_result_string + "\n" +
            random_segments_min_result_string + "\n" +
            corpus_size_max_result_string)

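# Hedged sketch (not part of the checks): one way to programmatically create
# random segments of 250-500 words from a document, as the docs suggest when
# the corpus lacks enough random negatives. The function name and the default
# chunk size are illustrative assumptions, not prescribed by the docs.
def make_random_segments(document_text: str,
                         words_per_segment: int = 400) -> list:
    words = document_text.split()
    # Slice the word list into consecutive fixed-size chunks; the final chunk
    # may be shorter than words_per_segment
    return [
        ' '.join(words[i:i + words_per_segment])
        for i in range(0, len(words), words_per_segment)
    ]
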
def check_query_file(full_dataset: pd.DataFrame,
                     query_data: pd.DataFrame) -> bool:
    """
    The [query file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training-files) contains the example queries that will be used for tuning the model. Each query should have one or more corresponding extractive segments in the corpus file. You should provide at least 100 positive match queries. You can also provide non-relevant queries: these are queries that correspond to extractive segments with a relevance score of zero.
    """
    # Every query must have at least one matching segment: compare the full
    # set of query ids against the ids that matched a corpus segment
    matched_query_ids = set(full_dataset['query-id'].dropna())
    min_one_extractive_segment_per_query = set(
        query_data['query-id']) <= matched_query_ids
    # At least 100 distinct queries with a positive relevance score
    positive_match_queries = full_dataset.loc[full_dataset['score'] > 0,
                                              'query-id'].nunique() >= 100
    return min_one_extractive_segment_per_query and positive_match_queries

def check_training_labels(query_data: pd.DataFrame,
                          scoring_data: pd.DataFrame) -> tuple:
    """
    The [training labels file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training) connects the queries with the extractive segments and scores each query and segment pair.
    If the test labels file is not present, then 20% of the queries in the training labels file are reserved for evaluating the tuned model after training.
    The file contains the ID of a query and the ID of its matching (or non-matching) extractive segment and a score for the relevance of the segment to the query. **There must be at least one line per query**; if a query is answered by two segments, then there are two lines for that query. Score is a non-negative integer value. Any score greater than zero indicates that the document is related to the query. Larger numbers indicate a greater level of relevance. If the score is omitted, the default value is 1.
    The training labels file is a TSV (tab-separated values) file with a header. The file must have the columns query-id, corpus-id and score. The query-id is a string that matches the _id key from the query file, and the corpus-id is a string that matches the _id in the corpus file.
    Extracted requirements:
    - There must be at least one line per query.
    - Score is a non-negative integer value.
    """
    query_ids = set(query_data['query-id'])
    scoring_ids = set(scoring_data['query-id-scoring'])
    same_items = query_ids == scoring_ids
    same_items_result_string = f"|___ Subcheck: Same ids in query and scoring data: {get_result(same_items)}"
    score_values = scoring_data['score']
    # pandas reads the score column with a numpy dtype, and numpy integers are
    # not instances of Python int, so check the column dtype instead
    non_negative_integers = (pd.api.types.is_integer_dtype(score_values)
                             and (score_values >= 0).all())
    non_negative_integers_result_string = f"|___ Subcheck: Column 'score' contains non-negative integer values: {get_result(non_negative_integers)}"
    return (same_items and non_negative_integers, same_items_result_string +
            "\n" + non_negative_integers_result_string)

def get_result(result: bool) -> str:
    if result:
        return "✅ met"
    else:
        return "❌ not met"

def main():
    if len(sys.argv) != 4:
        print(
            "Usage: python check.py <corpus_path> <query_path> <scoring_path>")
        return

    corpus_path = sys.argv[1]
    query_path = sys.argv[2]
    scoring_path = sys.argv[3]

    # Read the query file
    query_data = jsonl_to_df(query_path)
    query_data = query_data.rename(columns={
        '_id': 'query-id',
        'text': 'query-text'
    })

    # Read the corpus file
    corpus_data = jsonl_to_df(corpus_path)
    corpus_data = corpus_data.rename(columns={
        '_id': 'corpus-id',
        'text': 'corpus-text'
    })

    # Read the scoring file (TSV with a header)
    scoring_data = pd.read_csv(scoring_path, sep='\t')
    scoring_data = scoring_data.rename(columns={
        'corpus-id': 'corpus-id-scoring',
        'query-id': 'query-id-scoring'
    })

    print("\nGeneral dataset checks\n----------------------")
    check_datasets(query_data, corpus_data, scoring_data)
    full_dataset = prep_full_dataset(query_data, corpus_data, scoring_data)

    print("\nDocumentation dataset checks\n----------------------------")
    print(
        f"Training query requirements met: {get_result(check_training_queries(query_data))}"
    )
    extractive_segment_result = check_extractive_segments(
        full_dataset, query_data)
    print(
        f"Extractive segment requirements met: {get_result(extractive_segment_result[0])}"
    )
    print(extractive_segment_result[1])
    print(
        f"Relevance score requirements met: {get_result(check_relevance_scores(full_dataset))}"
    )
    corpus_result = check_corpus_file(full_dataset)
    print(f"Corpus file requirements met: {get_result(corpus_result[0])}")
    print(corpus_result[1])
    print(
        f"Query file requirements met: {get_result(check_query_file(full_dataset, query_data))}"
    )
    training_labels_result = check_training_labels(query_data, scoring_data)
    print(
        f"Training labels requirements met: {get_result(training_labels_result[0])}"
    )
    print(training_labels_result[1])


if __name__ == "__main__":
    main()
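
A run over three well-formed files should print output along these lines (the file names and counts are illustrative):

```
$ python check.py corpus.jsonl query.jsonl scoring.tsv

General dataset checks
----------------------
Number of segments in Corpus file that don't have a match in Scoring file: 10000
...

Documentation dataset checks
----------------------------
Training query requirements met: ✅ met
Extractive segment requirements met: ✅ met
|___ Subcheck: At least one extractive segment per query: ✅ met
...
```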