Skip to content

Instantly share code, notes, and snippets.

@hsauers5
Created February 18, 2020 23:45
Show Gist options
  • Save hsauers5/1c28b782116aa7237d87c5e16ac38d19 to your computer and use it in GitHub Desktop.
Save hsauers5/1c28b782116aa7237d87c5e16ac38d19 to your computer and use it in GitHub Desktop.
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
import gensim
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the 'punkt_tab' package rather than the older pickled
# 'punkt' package, so request both to work with either generation of NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')
stemmer = nltk.stem.porter.PorterStemmer()
# Table for str.translate(): every character in string.punctuation maps to
# None, which deletes it from the translated text.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
def stem_tokens(tokens):
    """Return a list holding the stem of each token, in the original order.

    Stemming is delegated to the module-level Porter ``stemmer``.
    """
    stem = stemmer.stem
    return list(map(stem, tokens))
def normalize(text):
    """Lowercase *text*, delete punctuation and split it into word tokens.

    This is the ``tokenizer`` callable handed to the module-level
    TfidfVectorizer. No stemming happens here: ``stem_tokens`` exists in this
    file but is not called.

    Args:
        text: The raw document string.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    return nltk.word_tokenize(cleaned)
# Shared TF-IDF vectorizer: documents are tokenized by normalize() and the
# built-in English stop-word list is applied.
vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=normalize,
)
def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two documents.

    The module-level ``vectorizer`` is re-fitted on just these two texts, so
    the vocabulary and IDF weights depend only on this pair. TfidfVectorizer
    L2-normalises each row by default, hence the dot product of the two rows
    is their cosine similarity.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The similarity score (entry [0, 1] of the 2x2 row-by-row product).
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is matrix multiplication for every scipy sparse container, and
    # `.toarray()` replaces the `.A` shortcut, which is deprecated and no
    # longer available on recent SciPy releases.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]
def _fetch_filing_text(url, timeout, headers):
    """Download *url* and return the text content of the parsed document."""
    response = requests.get(url, timeout=timeout, headers=headers)
    # Fail loudly on 4xx/5xx responses: otherwise the body of an error page
    # would be treated as filing text and skew the similarity score.
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml").text


def get_similarity(new_filing_url, old_filing_url, *, timeout=60, headers=None):
    """Download two filings and return their TF-IDF cosine similarity.

    Args:
        new_filing_url: URL of the newer filing.
        old_filing_url: URL of the older filing.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the call indefinitely.
        headers: Optional dict of HTTP headers sent with both requests.
            NOTE(review): sec.gov reportedly rejects clients that do not
            send a descriptive User-Agent; supply one here if requests are
            refused.

    Returns:
        The score computed by ``cosine_sim`` on the extracted page texts.

    Raises:
        requests.HTTPError: If either download returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    new_filing_text = _fetch_filing_text(new_filing_url, timeout, headers)
    old_filing_text = _fetch_filing_text(old_filing_url, timeout, headers)
    return cosine_sim(new_filing_text, old_filing_text)
'''
new_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312520007306/d857866d10q.htm"
old_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312519264545/d801919d10q.htm"
'''
'''
new_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213519000017/wstl-2019331x10k.htm"
old_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213518000018/wstl-2018331x10k.htm"
print(get_similarity(new_filing_url, old_filing_url))
'''
"""
'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt',
'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt',
'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt'
"""
def test():
    """Print the similarity of the first two of three sample EDGAR filings.

    All three URLs point at full-submission text files under the same EDGAR
    data directory (320193); only the first pair is compared.
    """
    sample_filings = (
        'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt',
        'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt',
        'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt',
    )
    first, second, _third = sample_filings
    score = get_similarity(first, second)
    print(score)
    # To compare the second pair instead: print(get_similarity(second, _third))
# Run the sample comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment