hsauers5 · February 18, 2020 23:45
diff --git a/similarity.py b/similarity.py
 import nltk, string
 from sklearn.feature_extraction.text import TfidfVectorizer
 import requests
 from bs4 import BeautifulSoup
 import gensim

 # Fetch the Punkt tokenizer data used by nltk.word_tokenize.
 # NOTE(review): this runs at import time and may touch the network - confirm
 # that is acceptable for every importer of this module.
 nltk.download('punkt')

 # Porter stemmer instance consumed by stem_tokens().
 stemmer = nltk.stem.porter.PorterStemmer()
 # str.translate() table mapping each ASCII punctuation code point to None,
 # which makes translate() delete those characters.
 remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

 def stem_tokens(tokens):
     """Return a list with the Porter stem of each token, in input order."""
     return list(map(stemmer.stem, tokens))

 def normalize(text):
     """Lowercase ``text``, delete punctuation and split it into word tokens.

     Returns the token list produced by ``nltk.word_tokenize``. Stemming is
     not applied here; ``stem_tokens`` is a separate helper.
     """
     cleaned = text.lower().translate(remove_punctuation_map)
     return nltk.word_tokenize(cleaned)

 # Shared TF-IDF vectorizer: tokenizes with normalize() and filters
 # scikit-learn's built-in English stop-word list.
 vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

 def cosine_sim(text1, text2):
     """Return the TF-IDF cosine similarity between ``text1`` and ``text2``.

     The module-level ``vectorizer`` is re-fitted on just these two documents
     on every call. TfidfVectorizer L2-normalizes each row by default, so the
     dot product of the two rows is their cosine similarity.
     """
     tfidf = vectorizer.fit_transform([text1, text2])
     # Use `@` and `.toarray()` rather than `*` and `.A`: on SciPy's sparse
     # array types `*` is elementwise and the `.A` shorthand is unavailable,
     # whereas `@` / `.toarray()` mean the same thing on matrices and arrays.
     pairwise = (tfidf @ tfidf.T).toarray()
     # Entry [0, 1] is the similarity of document 0 with document 1.
     return pairwise[0, 1]

 def _fetch_filing_text(url, timeout=30):
     """Download ``url`` and return the text BeautifulSoup extracts with lxml.

     Raises ``requests.HTTPError`` on a 4xx/5xx response and a
     ``requests`` timeout exception if the server does not answer within
     ``timeout`` seconds.
     """
     response = requests.get(url, timeout=timeout)
     # Without this check an error page would be parsed and scored as if it
     # were the filing, silently producing a meaningless similarity.
     response.raise_for_status()
     return BeautifulSoup(response.content, "lxml").text

 def get_similarity(new_filing_url, old_filing_url):
     """Fetch both filings and return the cosine similarity of their text.

     Args:
         new_filing_url: URL of the newer filing document.
         old_filing_url: URL of the older filing document.

     Returns:
         The value of ``cosine_sim`` for the two extracted texts.

     Raises:
         requests.RequestException: if either download fails, times out, or
             returns an HTTP error status.
     """
     new_filing_text = _fetch_filing_text(new_filing_url)
     old_filing_text = _fetch_filing_text(old_filing_url)
     return cosine_sim(new_filing_text, old_filing_text)

 '''
 new_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312520007306/d857866d10q.htm"
 old_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312519264545/d801919d10q.htm"
 '''

 '''
 new_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213519000017/wstl-2019331x10k.htm"
 old_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213518000018/wstl-2018331x10k.htm"

 print(get_similarity(new_filing_url, old_filing_url))
 '''

 """
 'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt', 
 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt', 
 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt'
 """

 def test():
     """Print the similarity of the first two of three sample EDGAR filings.

     All three URLs point at full-text filings under the same EDGAR data
     directory (320193); only the first pair is compared.
     """
     sample_urls = (
         'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt',
         'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt',
         'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt',
     )
     newer, older = sample_urls[:2]
     # The third URL is kept on hand for comparing against the second one.
     print(get_similarity(newer, older))

 # Run the smoke check only when executed as a script, not on import.
 if __name__ == '__main__':
     test()
	import nltk, string
	from sklearn.feature_extraction.text import TfidfVectorizer
	import requests
	from bs4 import BeautifulSoup
	import gensim

	# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
	# NOTE(review): this runs at import time and may touch the network - confirm
	# that is acceptable for every importer of this module.
	nltk.download('punkt')

	# Porter stemmer instance consumed by stem_tokens().
	stemmer = nltk.stem.porter.PorterStemmer()
	# str.translate() table mapping each ASCII punctuation code point to None,
	# which makes translate() delete those characters.
	remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

	def stem_tokens(tokens):
	    """Return a list with the Porter stem of each token, in input order."""
	    # The body must be indented under the def; at the same column as the
	    # def it is an IndentationError.
	    return [stemmer.stem(item) for item in tokens]

	def normalize(text):
	    """Lowercase ``text``, delete punctuation and split it into word tokens.

	    Returns the token list produced by ``nltk.word_tokenize``. Stemming is
	    not applied here; ``stem_tokens`` is a separate helper.
	    """
	    cleaned = text.lower().translate(remove_punctuation_map)
	    return nltk.word_tokenize(cleaned)

	# Shared TF-IDF vectorizer: tokenizes with normalize() and filters
	# scikit-learn's built-in English stop-word list.
	vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

	def cosine_sim(text1, text2):
	    """Return the TF-IDF cosine similarity between ``text1`` and ``text2``.

	    The module-level ``vectorizer`` is re-fitted on just these two documents
	    on every call. TfidfVectorizer L2-normalizes each row by default, so the
	    dot product of the two rows is their cosine similarity.
	    """
	    tfidf = vectorizer.fit_transform([text1, text2])
	    # Use `@` and `.toarray()` rather than `*` and `.A`: on SciPy's sparse
	    # array types `*` is elementwise and the `.A` shorthand is unavailable,
	    # whereas `@` / `.toarray()` mean the same thing on matrices and arrays.
	    pairwise = (tfidf @ tfidf.T).toarray()
	    # Entry [0, 1] is the similarity of document 0 with document 1.
	    return pairwise[0, 1]

	def _fetch_filing_text(url, timeout=30):
	    """Download ``url`` and return the text BeautifulSoup extracts with lxml.

	    Raises ``requests.HTTPError`` on a 4xx/5xx response and a
	    ``requests`` timeout exception if the server does not answer within
	    ``timeout`` seconds.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Without this check an error page would be parsed and scored as if it
	    # were the filing, silently producing a meaningless similarity.
	    response.raise_for_status()
	    return BeautifulSoup(response.content, "lxml").text

	def get_similarity(new_filing_url, old_filing_url):
	    """Fetch both filings and return the cosine similarity of their text.

	    Args:
	        new_filing_url: URL of the newer filing document.
	        old_filing_url: URL of the older filing document.

	    Returns:
	        The value of ``cosine_sim`` for the two extracted texts.

	    Raises:
	        requests.RequestException: if either download fails, times out, or
	            returns an HTTP error status.
	    """
	    new_filing_text = _fetch_filing_text(new_filing_url)
	    old_filing_text = _fetch_filing_text(old_filing_url)
	    return cosine_sim(new_filing_text, old_filing_text)

	'''
	new_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312520007306/d857866d10q.htm"
	old_filing_url = "https://www.sec.gov/Archives/edgar/data/922612/000119312519264545/d801919d10q.htm"
	'''

	'''
	new_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213519000017/wstl-2019331x10k.htm"
	old_filing_url = "https://www.sec.gov/Archives/edgar/data/1002135/000100213518000018/wstl-2018331x10k.htm"

	print(get_similarity(new_filing_url, old_filing_url))
	'''

	"""
	'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt',
	'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt',
	'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt'
	"""

	def test():
	    """Print the similarity of the first two of three sample EDGAR filings.

	    All three URLs point at full-text filings under the same EDGAR data
	    directory (320193); only the first pair is compared.
	    """
	    url1 = 'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/0000320193-20-000010.txt'
	    url2 = 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/0000320193-19-000076.txt'
	    # Kept on hand for comparing against url2; not used by the call below.
	    url3 = 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/0000320193-19-000066.txt'

	    print(get_similarity(url1, url2))

	# Run the smoke check only when executed as a script, not on import.
	if __name__ == '__main__':
	    test()