This script runs data checks on your input files when using the Search Tuning feature in Google Cloud Vertex AI Agent Builder (https://cloud.google.com/generative-ai-app-builder/docs/tune-search). Usage: `python check.py <corpus_path> <query_path> <scoring_path>`
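Based on the file formats described in the tuning docs, the three inputs look roughly like this (ids, text, and scores are illustrative):

```
# query.jsonl / corpus.jsonl: one JSON object per line, with _id and text
{"_id": "q1", "text": "What is the refund policy?"}
{"_id": "c1", "text": "Refunds are issued within 30 days of purchase ..."}

# scoring.tsv: tab-separated with a header; score is a non-negative integer
query-id	corpus-id	score
q1	c1	1
```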
import sys

import jsonlines
import pandas as pd

def jsonl_to_df(file_path):
    data = []
    with jsonlines.open(file_path) as reader:
        for line in reader:
            # Collect each line of the JSONL file as a record
            data.append(line)
    return pd.DataFrame(data)

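# Example (illustrative): jsonl_to_df('corpus.jsonl') yields a DataFrame with
# one row per JSONL line, e.g. columns '_id' and 'text' for the corpus file.
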
def prep_full_dataset(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                      scoring_data: pd.DataFrame) -> pd.DataFrame:
    # A segment referenced in the scoring data must have a corresponding
    # segment in the corpus data, but a segment in the corpus data does not
    # necessarily appear in the scoring data. Hence we left join the scoring
    # data onto the corpus data.
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='left')
    # Likewise, a query referenced in the scoring data must exist in the query
    # data, but a query in the query data does not necessarily appear in the
    # scoring data. Hence we left join the query data onto the merged result.
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='left')
    return full_dataset

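# A minimal illustration of the join semantics above (hypothetical ids):
# a corpus row 'c1' with a scoring row (q1, c1, 1) produces one fully
# populated merged row, while a corpus row 'c2' with no scoring entry
# survives the left join with NaN scoring/query columns, which is how the
# checks below can treat it as a random negative.
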
def check_training_queries(query_data: pd.DataFrame) -> bool:
    """
    [Training queries](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100.
    """
    return query_data.shape[0] >= 100

def check_extractive_segments(full_dataset: pd.DataFrame,
                              query_data: pd.DataFrame) -> tuple:
    """
    [Extractive segments](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    You must provide two types of extractive segments:
    - Segments that contain relevant information needed to answer the training queries. These are segments that have positive matching with queries.
    - Segments that are not associated with any training queries. These segments are used as random negatives in the model tuning.
    Provide at least one extractive segment per query and at least 10,000 additional extractive segments.
    """
    # Check that every query has at least one matching extractive segment.
    # Note: groupby().size().min() > 0 is always true for groups that exist,
    # so instead compare the full set of query ids against the set of query
    # ids that actually matched a corpus segment.
    matched_query_ids = set(full_dataset['query-id'].dropna())
    all_query_ids = set(query_data['query-id'])
    extractive_segments_per_query = all_query_ids <= matched_query_ids
    extractive_segments_per_query_result_string = f"|___ Subcheck: At least one extractive segment per query: {get_result(extractive_segments_per_query)}"
    return (extractive_segments_per_query,
            extractive_segments_per_query_result_string)

def check_relevance_scores(full_dataset: pd.DataFrame) -> bool:
    """
    [Relevance scores](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#about_training_data)
    Provide at least 100 relevant scores and, optionally, additional non-relevant scores.
    """
    return full_dataset[full_dataset['score'] > 0].shape[0] >= 100

def check_datasets(query_data: pd.DataFrame, corpus_data: pd.DataFrame,
                   scoring_data: pd.DataFrame) -> None:
    # Outer joins keep unmatched rows on both sides so mismatches can be counted
    corpus_scoring_merged = pd.merge(corpus_data,
                                     scoring_data,
                                     left_on='corpus-id',
                                     right_on='corpus-id-scoring',
                                     how='outer')
    full_dataset = pd.merge(corpus_scoring_merged,
                            query_data,
                            left_on='query-id-scoring',
                            right_on='query-id',
                            how='outer')
    print(
        f"Number of segments in Corpus file that don't have a match in Scoring file: {full_dataset[~full_dataset['corpus-id'].isna() & full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of segments in Scoring file that don't have a match in Corpus file: {full_dataset[full_dataset['corpus-id'].isna() & ~full_dataset['corpus-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Query file that don't have a match in Scoring file: {full_dataset[~full_dataset['query-id'].isna() & full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print(
        f"Number of queries in Scoring file that don't have a match in Query file: {full_dataset[full_dataset['query-id'].isna() & ~full_dataset['query-id-scoring'].isna()].shape[0]}"
    )
    print()

def check_corpus_file(full_dataset: pd.DataFrame) -> tuple:
    """
    The [corpus file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#corpus) contains extractive segments: segments that contain information to answer the queries in the query file and many additional segments to be used as random negatives when tuning the model. You should have at least 100 segments that contain query answers; queries can be answered by multiple segments. You should also have at least 10,000 random segments.
    Otherwise, programmatically create random segments of 250–500 words from the documents in your data store and add those to the corpus file.
    The corpus file is a JSONL (JSON lines) file where each line has the fields _id and text with string values. The maximum size of the file is 250,000 lines.
    """
    # Check if there are at least 100 distinct segments that contain query
    # answers (count unique segment ids, since the merge duplicates a segment
    # once per matching query)
    query_answers = full_dataset.loc[~full_dataset['query-id'].isna(),
                                     'corpus-id'].nunique() >= 100
    query_answers_result_string = f"|___ Subcheck: At least 100 segments that contain query answers: {get_result(query_answers)}"
    # Check if there are at least 10,000 random segments, i.e. segments with
    # no matching query
    random_segments_min = full_dataset.loc[full_dataset['query-id'].isna(),
                                           'corpus-id'].nunique() >= 10000
    random_segments_min_result_string = f"|___ Subcheck: At least 10,000 random segments: {get_result(random_segments_min)}"
    # Check that the corpus file does not exceed the 250,000-line maximum
    corpus_size_max = full_dataset['corpus-id'].nunique() <= 250000
    corpus_size_max_result_string = f"|___ Subcheck: At most 250,000 lines in the corpus file: {get_result(corpus_size_max)}"
    return (query_answers and random_segments_min and corpus_size_max,
            query_answers_result_string + "\n" +
            random_segments_min_result_string + "\n" +
            corpus_size_max_result_string)

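# Hedged sketch (not part of the checks): one way to programmatically create
# random segments of 250-500 words from a document, as the docs suggest when
# the corpus lacks enough random negatives. The function name and the default
# chunk size are illustrative assumptions, not prescribed by the docs.
def make_random_segments(document_text: str,
                         words_per_segment: int = 400) -> list:
    words = document_text.split()
    # Slice the word list into consecutive fixed-size chunks; the final chunk
    # may be shorter than words_per_segment
    return [
        ' '.join(words[i:i + words_per_segment])
        for i in range(0, len(words), words_per_segment)
    ]
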
def check_query_file(full_dataset: pd.DataFrame,
                     query_data: pd.DataFrame) -> bool:
    """
    The [query file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training-files) contains the example queries that will be used for tuning the model. Each query should have one or more corresponding extractive segments in the corpus file. You should provide at least 100 positive match queries. You can also provide non-relevant queries: these are queries that correspond to extractive segments with a relevance score of zero.
    """
    # Every query must have at least one matching segment: compare the full
    # set of query ids against the ids that matched a corpus segment
    matched_query_ids = set(full_dataset['query-id'].dropna())
    min_one_extractive_segment_per_query = set(
        query_data['query-id']) <= matched_query_ids
    # At least 100 distinct queries with a positive relevance score
    positive_match_queries = full_dataset.loc[full_dataset['score'] > 0,
                                              'query-id'].nunique() >= 100
    return min_one_extractive_segment_per_query and positive_match_queries

def check_training_labels(query_data: pd.DataFrame,
                          scoring_data: pd.DataFrame) -> tuple:
    """
    The [training labels file](https://cloud.google.com/generative-ai-app-builder/docs/tune-search#training) connects the queries with the extractive segments and scores each query and segment pair.
    If the test labels file is not present, then 20% of the queries in the training labels file are reserved for evaluating the tuned model after training.
    The file contains the ID of a query and the ID of its matching (or non-matching) extractive segment and a score for the relevance of the segment to the query. **There must be at least one line per query**; if a query is answered by two segments, then there are two lines for that query. Score is a non-negative integer value. Any score greater than zero indicates that the document is related to the query. Larger numbers indicate a greater level of relevance. If the score is omitted, the default value is 1.
    The training labels file is a TSV (tab-separated values) file with a header. The file must have the columns query-id, corpus-id and score. The query-id is a string that matches the _id key from the query file, and the corpus-id is a string that matches the _id in the corpus file.
    Extracted requirements:
    - There must be at least one line per query.
    - Score is a non-negative integer value.
    """
    query_ids = set(query_data['query-id'])
    scoring_ids = set(scoring_data['query-id-scoring'])
    same_items = query_ids == scoring_ids
    same_items_result_string = f"|___ Subcheck: Same ids in query and scoring data: {get_result(same_items)}"
    score_values = scoring_data['score']
    # pandas reads the score column with a numpy dtype, and numpy integers are
    # not instances of Python int, so check the column dtype instead
    non_negative_integers = (pd.api.types.is_integer_dtype(score_values)
                             and (score_values >= 0).all())
    non_negative_integers_result_string = f"|___ Subcheck: Column 'score' contains non-negative integer values: {get_result(non_negative_integers)}"
    return (same_items and non_negative_integers, same_items_result_string +
            "\n" + non_negative_integers_result_string)

def get_result(result: bool) -> str:
    if result:
        return "✅ met"
    else:
        return "❌ not met"

def main():
    if len(sys.argv) != 4:
        print(
            "Usage: python check.py <corpus_path> <query_path> <scoring_path>")
        return

    corpus_path = sys.argv[1]
    query_path = sys.argv[2]
    scoring_path = sys.argv[3]

    # Read the query file
    query_data = jsonl_to_df(query_path)
    query_data = query_data.rename(columns={
        '_id': 'query-id',
        'text': 'query-text'
    })

    # Read the corpus file
    corpus_data = jsonl_to_df(corpus_path)
    corpus_data = corpus_data.rename(columns={
        '_id': 'corpus-id',
        'text': 'corpus-text'
    })

    # Read the scoring file (TSV with a header)
    scoring_data = pd.read_csv(scoring_path, sep='\t')
    scoring_data = scoring_data.rename(columns={
        'corpus-id': 'corpus-id-scoring',
        'query-id': 'query-id-scoring'
    })

    print("\nGeneral dataset checks\n----------------------")
    check_datasets(query_data, corpus_data, scoring_data)
    full_dataset = prep_full_dataset(query_data, corpus_data, scoring_data)

    print("\nDocumentation dataset checks\n----------------------------")
    print(
        f"Training query requirements met: {get_result(check_training_queries(query_data))}"
    )
    extractive_segment_result = check_extractive_segments(
        full_dataset, query_data)
    print(
        f"Extractive segment requirements met: {get_result(extractive_segment_result[0])}"
    )
    print(extractive_segment_result[1])
    print(
        f"Relevance score requirements met: {get_result(check_relevance_scores(full_dataset))}"
    )
    corpus_result = check_corpus_file(full_dataset)
    print(f"Corpus file requirements met: {get_result(corpus_result[0])}")
    print(corpus_result[1])
    print(
        f"Query file requirements met: {get_result(check_query_file(full_dataset, query_data))}"
    )
    training_labels_result = check_training_labels(query_data, scoring_data)
    print(
        f"Training labels requirements met: {get_result(training_labels_result[0])}"
    )
    print(training_labels_result[1])


if __name__ == "__main__":
    main()
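
A run over three well-formed files should print output along these lines (the file names and counts are illustrative):

```
$ python check.py corpus.jsonl query.jsonl scoring.tsv

General dataset checks
----------------------
Number of segments in Corpus file that don't have a match in Scoring file: 10000
...

Documentation dataset checks
----------------------------
Training query requirements met: ✅ met
Extractive segment requirements met: ✅ met
|___ Subcheck: At least one extractive segment per query: ✅ met
...
```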