This script runs data checks on your input files for the Search Tuning feature in Google Cloud Vertex AI Agent Builder (https://cloud.google.com/generative-ai-app-builder/docs/tune-search). Usage: `python check.py <corpus_path> <query_path> <scoring_path>`
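For reference, one line of each input file might look like this (hypothetical IDs and text; the exact formats are described in the linked documentation and in the docstrings below):

- Corpus file (JSONL): `{"_id": "doc1", "text": "Our refund policy allows returns within 30 days of purchase..."}`
- Query file (JSONL): `{"_id": "query1", "text": "What is the refund policy?"}`
- Training labels file (TSV with header columns `query-id`, `corpus-id`, `score`), e.g. the data row `query1	doc1	1`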
import sys
import jsonlines
import pandas as pd


def jsonl_to_df(file_path):
    data = []
    with jsonlines.open(file_path) as reader:
        for line in reader:
            # Each line of the JSONL file is a JSON object; collect them all
            data.append(line)
    return pd.DataFrame(data)


def prep_full_dataset(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                      scoring_data: pd.DataFrame) -> pd.DataFrame:
    # If a segment appears in the scoring data, it must have a corresponding
    # segment in the corpus data; a corpus segment does not need to appear in
    # the scoring data. Hence we left-join the scoring data onto the corpus data.
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='left')
    # Likewise, a query referenced in the scoring data must exist in the query
    # data, but a query does not need to appear in the scoring data. Hence we
    # left-join onto the query data.
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='left')
    return full_dataset


def check_training_queries(query_data: pd.DataFrame) -> bool:
    """
    [Training queries](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100.
    """
    return query_data.shape[0] >= 100


def check_extractive_segments(full_dataset: pd.DataFrame) -> tuple:
    """
    [Extractive segments](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    You must provide two types of extractive segments:
    - Segments that contain relevant information needed to answer the training queries. These are segments that have a positive match with queries.
    - Segments that are not associated with any training queries. These segments are used as random negatives in the model tuning.
    Provide at least one extractive segment per query and at least 10,000 additional extractive segments.
    """
    # Check that every query has at least one extractive segment
    extractive_segments_per_query = full_dataset.groupby(
        'query-id').size().min() > 0
    extractive_segments_per_query_result_string = f"|___ Subcheck: At least one extractive segment per query: {get_result(extractive_segments_per_query)}"
    return (extractive_segments_per_query,
            extractive_segments_per_query_result_string)


def check_relevance_scores(full_dataset: pd.DataFrame) -> bool:
    """
    [Relevance scores](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100 relevant scores and, optionally, additional non-relevant scores.
    """
    return full_dataset[full_dataset['score'] > 0].shape[0] >= 100


def check_datasets(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                   scoring_data: pd.DataFrame) -> None:
    # Outer joins keep unmatched rows from both sides, so we can report
    # entries in one file that have no counterpart in another
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='outer')
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='outer')
    print(
        f"Number of segments in Corpus file that don't have a match in Scoring file: {full_dataset[~full_dataset['corpus-id'].isna() & full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of segments in Scoring file that don't have a match in Corpus file: {full_dataset[full_dataset['corpus-id'].isna() & ~full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Query file that don't have a match in Scoring file: {full_dataset[~full_dataset['query-id'].isna() & full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Scoring file that don't have a match in Query file: {full_dataset[full_dataset['query-id'].isna() & ~full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print()


def check_corpus_file(full_dataset: pd.DataFrame) -> tuple:
    """
    The [corpus file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#corpus) contains extractive segments: segments that contain information to answer the queries in the query file, plus many additional segments to be used as random negatives when tuning the model. You should have at least 100 segments that contain query answers; queries can be answered by multiple segments. You should also have at least 10,000 random segments.
    Otherwise, programmatically create random segments of 250-500 words from the documents in your data store and add those to the corpus file.
    The corpus file is a JSONL (JSON lines) file where each line has the fields _id and text with string values. The maximum size of the file is 250,000 lines.
    """
    # Check that there are at least 100 segments that contain query answers
    query_answers = full_dataset[
        ~full_dataset['query-id'].isna()].shape[0] >= 100
    query_answers_result_string = f"|___ Subcheck: At least 100 segments that contain query answers: {get_result(query_answers)}"
    # Check that there are at least 10,000 random segments
    random_segments_min = full_dataset[
        ~full_dataset['corpus-id'].isna()].shape[0] >= 10000
    random_segments_min_result_string = f"|___ Subcheck: At least 10,000 random segments: {get_result(random_segments_min)}"
    # Check that there are fewer than 250,000 random segments
    random_segments_max = full_dataset[
        full_dataset['query-id'].isna()].shape[0] < 250000
    random_segments_max_result_string = f"|___ Subcheck: At most 250,000 random segments: {get_result(random_segments_max)}"
    return (query_answers and random_segments_min and random_segments_max,
            query_answers_result_string + "\n" +
            random_segments_min_result_string + "\n" +
            random_segments_max_result_string)
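

# The corpus-file docstring above suggests programmatically creating random
# segments of 250-500 words from your documents when you have too few. Below
# is a minimal sketch of that idea, assuming plain whitespace tokenization;
# the function name and chunking approach are illustrative assumptions, not
# the documented procedure, and nothing in this script calls it.
def make_random_segments(document_text, rng=None):
    import random
    rng = rng or random.Random()
    words = document_text.split()
    segments = []
    start = 0
    while start < len(words):
        # Cut a chunk of 250-500 words (the final chunk may be shorter)
        size = rng.randint(250, 500)
        segments.append(' '.join(words[start:start + size]))
        start += size
    return segments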


def check_query_file(full_dataset: pd.DataFrame) -> bool:
    """
    The [query file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training-files) contains the example queries that will be used for tuning the model. Each query should have one or more corresponding extractive segments in the corpus file. You should provide at least 100 positive-match queries. You can also provide non-relevant queries: these are queries that correspond to extractive segments with a relevance score of zero.
    """
    # Every query must map to at least one corpus segment
    min_one_extractive_segment_per_query = full_dataset[
        ~full_dataset['query-id'].isna()
        & ~full_dataset['corpus-id'].isna()].groupby(
            'query-id-scoring').size().min() > 0
    # At least 100 query-segment pairs with a positive relevance score
    positive_match_queries = full_dataset[
        full_dataset['score'] > 0].shape[0] >= 100
    return min_one_extractive_segment_per_query and positive_match_queries


def check_training_labels(query_data: pd.DataFrame,
                          scoring_data: pd.DataFrame) -> tuple:
    """
    The [training labels file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training) connects the queries with the extractive segments and scores each query and segment pair.
    If the test labels file is not present, then 20% of the queries in the training labels file are reserved for evaluating the tuned model after training.
    The file contains the ID of a query, the ID of its matching (or non-matching) extractive segment, and a score for the relevance of the segment to the query. **There must be at least one line per query**; if a query is answered by two segments, then there are two lines for that query. Score is a non-negative integer value. Any score greater than zero indicates that the document is related to the query. Larger numbers indicate a greater level of relevance. If the score is omitted, the default value is 1.
    The training labels file is a TSV (tab-separated values) file with a header. The file must have the columns query-id, corpus-id and score. The query-id is a string that matches the _id key from the query file, and the corpus-id is a string that matches the _id in the corpus file.
    Extracted requirements:
    - There must be at least one line per query.
    - Score is a non-negative integer value.
    """
    query_ids = set(query_data['query-id'])
    scoring_ids = set(scoring_data['query-id-scoring'])
    same_items = query_ids == scoring_ids
    same_items_result_string = f"|___ Subcheck: Same ids in query and scoring data: {get_result(same_items)}"
    # pandas parses the score column into a NumPy integer dtype, for which
    # isinstance(score, int) can be False, so check the dtype and sign instead
    score_values = scoring_data['score']
    non_negative_integers = (pd.api.types.is_integer_dtype(score_values)
                             and (score_values >= 0).all())
    non_negative_integers_result_string = f"|___ Subcheck: Column 'score' contains non-negative integer values: {get_result(non_negative_integers)}"
    return (same_items and non_negative_integers,
            same_items_result_string + "\n" +
            non_negative_integers_result_string)


def get_result(result: bool) -> str:
    if result:
        return "✅ met"
    else:
        return "❌ not met"


def main():
    if len(sys.argv) != 4:
        print(
            "Usage: python check.py <corpus_path> <query_path> <scoring_path>")
        return
    corpus_path = sys.argv[1]
    query_path = sys.argv[2]
    scoring_path = sys.argv[3]

    # Read the query file
    query_data = jsonl_to_df(query_path)
    query_data = query_data.rename(columns={
        '_id': 'query-id',
        'text': 'query-text'
    })
    # Read the corpus file
    corpus_data = jsonl_to_df(corpus_path)
    corpus_data = corpus_data.rename(columns={
        '_id': 'corpus-id',
        'text': 'corpus-text'
    })
    # Read the scoring file; suffix its id columns so they remain
    # distinguishable from the query and corpus columns after the merges
    scoring_data = pd.read_csv(scoring_path, sep='\t')
    scoring_data = scoring_data.rename(columns={
        'corpus-id': 'corpus-id-scoring',
        'query-id': 'query-id-scoring'
    })

    print("\nGeneral dataset checks\n----------------------")
    check_datasets(query_data, corpus_data, scoring_data)
    full_dataset = prep_full_dataset(query_data, corpus_data, scoring_data)

    print("\nDocumentation dataset checks\n----------------------------")
    print(
        f"Training query requirements met: {get_result(check_training_queries(query_data))}"
    )
    extractive_segment_result = check_extractive_segments(full_dataset)
    print(
        f"Extractive segment requirements met: {get_result(extractive_segment_result[0])}"
    )
    print(extractive_segment_result[1])
    print(
        f"Relevance score requirements met: {get_result(check_relevance_scores(full_dataset))}"
    )
    corpus_result = check_corpus_file(full_dataset)
    print(f"Corpus file requirements met: {get_result(corpus_result[0])}")
    print(corpus_result[1])
    print(
        f"Query file requirements met: {get_result(check_query_file(full_dataset))}"
    )
    training_labels_result = check_training_labels(query_data, scoring_data)
    print(
        f"Training labels requirements met: {get_result(training_labels_result[0])}"
    )
    print(training_labels_result[1])


if __name__ == "__main__":
    main()