Last active
August 15, 2021 03:57
-
-
Save shreya-singh-tech/ac9fe2e5445dff0ec81f818f051d840f to your computer and use it in GitHub Desktop.
Program to find cosine similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
import textstat | |
from textstat.textstat import textstatistics | |
import pandas as pd | |
import pysentiment2 as ps | |
import math | |
import re | |
from collections import Counter | |
import numpy as np | |
# Load the tab-separated input table.
df1 = pd.read_csv("/Final_Data_Tsv_form.tsv", sep="\t")

# Build one grouping key per row by concatenating the stripped string form of
# the identifying columns. The columns are walked in lockstep rather than via
# DataFrame.iterrows(), which constructs a Series for every row and is slow.
unique = [
    "".join(str(part).strip() for part in parts)
    for parts in zip(
        df1["series"], df1["class"], df1["tag"], df1["form"], df1["cik"]
    )
]
df1["unique_identifier_form"] = unique

# 'value_lag' holds the 'value' of the preceding row that shares the same
# key; rows with no predecessor in their group get an empty string, not NaN.
df1["value_lag"] = df1.groupby("unique_identifier_form")["value"].shift()
df1["value_lag"] = df1["value_lag"].fillna("")
print("done")

# Token pattern used by text_to_vector: maximal runs of word characters.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight mappings.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter`` of
            word counts).
        vec2: Mapping of the same kind as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, as a float. ``0.0`` is returned when either vector
        has zero norm (for example, when it is empty).
    """
    shared_terms = set(vec1.keys()).intersection(vec2.keys())
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    sum1 = sum(weight * weight for weight in vec1.values())
    sum2 = sum(weight * weight for weight in vec2.values())

    # Take a single square root of the product rather than multiplying two
    # separately rounded roots: with integer counts this makes identical
    # vectors score exactly 1.0 instead of 0.9999999999999998.
    denominator = math.sqrt(sum1 * sum2)
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Turn ``text`` into a bag-of-words frequency vector.

    Tokens are the non-overlapping matches of the module-level ``WORD``
    pattern in ``text``; matching is case-sensitive.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    return Counter(WORD.findall(text))
# Term-frequency vectors for each row's current text and its lagged text.
# Values are passed through str() first, so non-string cells are tokenised
# by their string representation.
df1["vector1"] = df1["value"].apply(lambda text: text_to_vector(str(text)))
df1["vector2"] = df1["value_lag"].apply(lambda text: text_to_vector(str(text)))

# Score each row by pairing the two vector columns directly; this avoids
# DataFrame.apply(axis=1), which materialises a Series for every row.
df1["cos_Sim_score"] = [
    get_cosine(current, previous)
    for current, previous in zip(df1["vector1"], df1["vector2"])
]

# Persist the intermediate table (all columns, including the index).
df1.to_csv("/home/singh.shreya1/Task-Wang/cos_sample_form.tsv", sep="\t")
# Second pass: keep the similarity score only for rows whose 'value_lag' is
# present; rows where it is missing get a single-space placeholder instead.
# NOTE(review): this path differs from the one the scored table is written to
# earlier in the script ("/home/singh.shreya1/Task-Wang/...") - confirm which
# location is intended.
df1 = pd.read_csv("/cos_sample_form.tsv", sep="\t")

# With read_csv's default NA handling, empty 'value_lag' fields come back as
# NaN, so isnull() marks the rows that have no lagged value. The column name
# 'is_equal' is kept as-is even though it flags a missing lag.
df1["is_equal"] = df1["value_lag"].isnull()

df1["cos_sim_score"] = [
    " " if lag_missing else score
    for lag_missing, score in zip(df1["is_equal"], df1["cos_Sim_score"])
]

# Export only the identifying columns plus the filtered score.
output_columns = [
    "id", "adsh", "series", "class", "tag", "cik", "filed", "form",
    "cos_sim_score",
]
df2 = df1[output_columns].copy()
df2.to_csv("/Final_Data_Tsv_with_cos_sim.tsv", sep="\t")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment