Last active
March 10, 2023 04:02
-
-
Save rohitgarud/0260fb79a9f65d424a04197b79e656ca to your computer and use it in GitHub Desktop.
Gist for calculating cosine similarity between resultants of different groups of feature vectors (ASReview screening)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
from asreview import ASReviewData, ASReviewProject, open_state | |
from asreview.models.balance import DoubleBalance | |
from asreview.models.classifiers import NaiveBayesClassifier | |
from asreview.models.feature_extraction import Tfidf | |
from asreview.models.query import MaxQuery | |
from asreview.review import ReviewSimulate | |
from asreviewcontrib.insights.metrics import time_to_discovery | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Start every run from a clean scratch directory: remove any leftovers from
# a previous run, then recreate it.  The original created the directory,
# deleted it inside a try with a silent `except Exception: pass`, and then
# recreated it in `finally` — the rmtree/mkdir pair below has the same end
# state without the redundancy or the swallowed errors.
import shutil

project_path = Path("tmp_data")
# ignore_errors covers the first-run case where the directory does not exist
shutil.rmtree(project_path, ignore_errors=True)
project_path.mkdir(exist_ok=True)
# Create a project object and folder (tmp_data/api_simulation) in simulate mode
project = ASReviewProject.create(
    project_path=project_path / "api_simulation",
    project_id="api_example",
    project_mode="simulate",
    project_name="api_example",
)

dataset = "van_de_Schoot_2017.csv"
# dataset = "van_Dis_2020_raw.csv"

# Copy the dataset into the project's data folder, then register it with the
# project by filename.
filepath = Path("tmp_data", "api_simulation", "data", dataset)
data_df = pd.read_csv(dataset)  # assumes the csv sits in the cwd — TODO confirm
# index=False: don't write the pandas index into the copy as a spurious
# unnamed extra column (the original omitted this).
data_df.to_csv(filepath, index=False)
project.add_dataset(dataset)
# Simulation components: TF-IDF features, Naive Bayes classifier,
# certainty-based (max-probability) querying, and double balancing.
feature_model = Tfidf()
train_model = NaiveBayesClassifier()
balance_model = DoubleBalance()
query_model = MaxQuery()

# Load the dataset copy stored inside the project folder
data_obj = ASReviewData.from_file(filepath)

# Configure the simulation reviewer: one relevant and one irrelevant prior
# record, 10 records queried per training iteration, fixed seed for
# reproducibility.
reviewer = ReviewSimulate(
    as_data=data_obj,
    project=project,
    model=train_model,
    feature_model=feature_model,
    balance_model=balance_model,
    query_model=query_model,
    n_instances=10,
    n_prior_included=1,
    n_prior_excluded=1,
    init_seed=165,
)
# Start the review process
project.update_review(status="review")
try:
    reviewer.review()
    project.mark_review_finished()
except Exception:
    # Record the failure in the project metadata, then propagate.  A bare
    # `raise` (instead of the original `raise err`) re-raises the active
    # exception with its traceback intact and no extra frame.
    project.update_review(status="error")
    raise

# Finish and export the project to a portable .asreview archive
project.export(Path("tmp_data", "api_example.asreview"))

# Time-to-discovery pairs (record_id, discovery step) for relevant records
with open_state("tmp_data/api_example.asreview") as s:
    tds = time_to_discovery(s)
# NOTE(review): rids is never used below — kept for parity with the original
rids = [rid for rid, td in tds]
# Cosine Similarity calculations
with open_state("tmp_data/api_example.asreview") as state:
    # Screening history: one row per labeled record, in labeling order
    df = state.get_dataset()
    df["labeling_order"] = df.index
    # priors=True also includes the two prior-knowledge records
    labels = state.get_labels(priors=True)
    labeling_order = df.record_id
    # NOTE(review): td_last is never used below — kept for parity
    td_last = time_to_discovery(state)[-1][1]

feature_extraction_id = project.feature_matrices[0]["id"]
print(feature_extraction_id)
feature_matrix = project.get_feature_matrix(feature_extraction_id)
# Dense (n_records, n_features) TF-IDF matrix — presumably records are
# indexed by record_id; verify against ASReview's feature-matrix layout
tfidf_features = feature_matrix.toarray()

# Calculating the combined vector of each group (unlabelled, relevant and
# irrelevant) of dataset by simply summing, Average can also be used, it
# gives the same vector with normalised magnitude.  The first two labeled
# records are the priors: one relevant, one irrelevant.
relevant = tfidf_features[labeling_order[0]].reshape(1, -1)
irrelevant = tfidf_features[labeling_order[1]].reshape(1, -1)
# Vectorized column sum instead of the original Python-level sum() over rows
unlabelled = tfidf_features.sum(axis=0).reshape(1, -1) - relevant - irrelevant

unlabelled_relevant = []
unlabelled_irrelevant = []
relevant_irrelevant = []
# Iterate through the labeling order (skipping the two priors), moving each
# newly-labeled record's vector from the unlabelled group into its labeled
# group and recording the pairwise cosine similarities after every step.
for i, record in enumerate(labeling_order[2:]):
    print(i)  # progress indicator
    # Hoisted: the original recomputed this reshape in each branch
    vec = tfidf_features[record].reshape(1, -1)
    if labels[record] == 1:
        relevant += vec
    elif labels[record] == 0:
        irrelevant += vec
    unlabelled -= vec
    # Calculate cosine similarities between the running group vectors
    unlabelled_relevant.append(cosine_similarity(unlabelled, relevant)[0])
    unlabelled_irrelevant.append(cosine_similarity(unlabelled, irrelevant)[0])
    relevant_irrelevant.append(cosine_similarity(relevant, irrelevant)[0])
# Plot the evolution of cosine similarity between the three groups, with a
# yellow vertical line at each relevant record's discovery step.
for _record_id, discovery_step in tds:
    plt.axvline(x=discovery_step, color="y")

curves = (
    (unlabelled_relevant, "unlabelled_relevant"),
    (unlabelled_irrelevant, "unlabelled_irrelevant"),
    (relevant_irrelevant, "relevant_irrelevant"),
)
for values, curve_label in curves:
    plt.plot(values, label=curve_label)

plt.title(
    f"Dataset: {dataset.split('.')[0]} (Total: {len(data_df)}, Relevant: {len(tds)+1})"
)
plt.xlabel("Screened Records")
plt.ylabel("Cosine Similarity")
plt.legend()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment