Created
June 22, 2019 20:18
-
-
Save 8enmann/68941a6c35bee10fe4326e0b2db65791 to your computer and use it in GitHub Desktop.
Experimenting with pytest, cosine similarity, ngram counting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from typing import List, Sequence | |
from collections import Counter | |
import numpy as np | |
# Matches one markup tag; DOTALL lets a tag span several lines.
TAG_RE: re.Pattern = re.compile(r'<.+?>', flags=re.DOTALL)
# A token is a maximal run of word characters.
TOKENIZER_RE: re.Pattern = re.compile(r'\w+')
# Captures the target of each href attribute whose value starts with "http".
URL_RE = re.compile(r'href="(http.+?)"')

# Page downloaded by main() as demo input.
TEST_URL = 'http://titan.dcs.bbk.ac.uk/~kikpef01/testpage.html'
def pull_page(url: str, timeout: float = 10.0):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The response text, or ``None`` when the status code is 300 or above.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code >= 300:
        return None
    return response.text
def get_urls(page: str) -> List[str]:
    """Return every href target matched by URL_RE in ``page``.

    Results are in document order and duplicates are kept.
    """
    return [match.group(1) for match in URL_RE.finditer(page)]
def window(s: Sequence, window_size: int) -> List[tuple]:
    """Return every contiguous run of ``window_size`` items from ``s``.

    Works on any sliceable sequence (a string yields tuples of characters, a
    list of tokens yields tuples of tokens).

    Args:
        s: Sequence to slide over.
        window_size: Number of items per window; must be at least 1.

    Returns:
        A list of tuples, one per start position, in order. Empty when ``s``
        is shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        # A zero or negative size would silently produce empty or ragged windows.
        raise ValueError(f'window_size must be >= 1, got {window_size}')
    last_start = len(s) - window_size
    return [
        tuple(s[start:start + window_size])
        for start in range(last_start + 1)
    ]
def count_ngrams(page, n=2) -> Counter:
    """Count all word n-grams of length 1 through ``n`` found in ``page``.

    Tags are stripped and the text is lower-cased before tokenizing. Every
    n-gram is keyed by the tuple that ``window`` builds, so single words
    appear as 1-tuples.
    """
    words = TOKENIZER_RE.findall(strip_tags(page).lower())
    return Counter(
        gram
        for size in range(1, n + 1)
        for gram in window(words, size)
    )
def strip_tags(page):
    """Return ``page`` with every match of TAG_RE replaced by one space."""
    # Splitting on the tags and re-joining with a space is the same as
    # substituting a space for each tag.
    text_pieces = TAG_RE.split(page)
    return ' '.join(text_pieces)
def word2vec(counter: Counter, vocab: Sequence[str]):
    """Turn ``counter`` into a list of counts aligned with ``vocab``.

    Entries of ``vocab`` that are missing from ``counter`` contribute 0.
    """
    vector = []
    for term in vocab:
        vector.append(counter.get(term, 0))
    return vector
def cosine(a, b):
    """Return the cosine similarity of two count mappings.

    Each Counter is treated as a sparse vector indexed by its keys; a key
    missing from one side counts as 0 there.

    Args:
        a: Counter of item -> count.
        b: Counter of item -> count.

    Returns:
        The similarity as a float. Returns 0.0 when either vector has zero
        length (e.g. an empty Counter) instead of dividing by zero.
    """
    import math  # local import: the module's import block does not provide it

    # Only keys present on both sides contribute to the dot product, so walk
    # the smaller mapping. No sorting is needed, which also keeps this working
    # when keys are not mutually comparable (e.g. strings mixed with tuples).
    small, large = (a, b) if len(a) <= len(b) else (b, a)
    dot = sum(
        count * large[key] for key, count in small.items() if key in large
    )
    squared_a = sum(count * count for count in a.values())
    squared_b = sum(count * count for count in b.values())
    if not squared_a or not squared_b:
        return 0.0
    # One square root over the product avoids the rounding error of
    # sqrt(x) * sqrt(x), so cosine(v, v) comes out as exactly 1.0.
    return dot / math.sqrt(squared_a * squared_b)
def np_cosine(a, b):
    """Return the cosine similarity of two count mappings, computed with NumPy.

    Args:
        a: Counter of item -> count.
        b: Counter of item -> count.

    Returns:
        The similarity as a float. Returns 0.0 when either vector has zero
        length (e.g. an empty Counter) rather than producing nan.
    """
    # Any fixed key order works as long as both vectors share it, so no
    # sorting is needed (sorting also fails on keys that are not comparable).
    vocab = list(a.keys() | b.keys())
    vec_a = np.array([a.get(key, 0) for key in vocab], dtype=float)
    vec_b = np.array([b.get(key, 0) for key in vocab], dtype=float)
    denominator = np.sqrt(np.dot(vec_a, vec_a) * np.dot(vec_b, vec_b))
    if denominator == 0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denominator)
import time | |
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    After the block exits:
        start, end: wall-clock timestamps from ``time.time()``.
        interval: elapsed seconds, measured with ``time.perf_counter()``.

    The interval uses the performance counter because the wall clock can be
    adjusted while the block runs, which would skew ``end - start``.
    Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        self.start = time.time()
        self._tick = time.perf_counter()
        return self

    def __exit__(self, *args):
        # Read the performance counter first so the measurement excludes
        # the bookkeeping below.
        elapsed = time.perf_counter() - self._tick
        self.end = time.time()
        self.interval = elapsed
def main():
    """Fetch TEST_URL, report its links and n-grams, and time both cosines.

    Exits with an error message when the page cannot be retrieved, because
    ``pull_page`` returns ``None`` in that case and the later steps need text.
    """
    # Pull urls from TEST_URL
    page = pull_page(TEST_URL)
    if page is None:
        raise SystemExit(f'Could not fetch {TEST_URL}')
    urls = get_urls(page)
    print('Found', len(urls), 'urls:', urls)
    counts = count_ngrams(page)
    print(counts.most_common(10))
    # Time the pure-Python and NumPy implementations on identical input.
    for similarity in (cosine, np_cosine):
        with Timer() as t:
            print(similarity(counts, counts))
        print(t.interval)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pytest | |
import mock | |
from unittest import mock | |
from collections import Counter | |
from counter import pull_page, get_urls, count_ngrams, window, strip_tags, cosine | |
# Minimal page that contains the same absolute link twice.
FAKE_HTML = '<html><a href="http://google.com">link</a><a href="http://google.com">link</a></html>'

# Stand-in for a successful response whose body is FAKE_HTML.
dummy_response = mock.MagicMock(
    requests.Response,
    status_code=200,
    text=FAKE_HTML,
)
# Replacement for requests.get that always returns dummy_response.
mockGet = mock.MagicMock(return_value=dummy_response)
def test_pull_page():
    """pull_page returns the body text when the patched request succeeds."""
    with mock.patch('requests.get', mockGet):
        body = pull_page("http://ignored.com")
    assert 'google' in body
def test_regex():
    """get_urls reports both links in FAKE_HTML, duplicates included."""
    links = get_urls(FAKE_HTML)
    link_count = len(links)
    assert link_count == 2
    assert 'http://google.com' == links[0]
def test_regex_empty():
    """An empty page yields no links."""
    links = get_urls('')
    assert 0 == len(links)
def test_ngrams():
    """With n=1 there is one entry per distinct token."""
    counts = count_ngrams('a a aa b bbc', n=1)
    assert len(counts) == 4
    # count_ngrams keys every n-gram by a tuple (window() builds tuples), so a
    # unigram is looked up as a 1-tuple; the bare string 'a' is never a key.
    assert counts[('a',)] == 2
def test_ngrams_2():
    """With n=2 the result holds both unigrams and bigrams."""
    counts = count_ngrams('a a a b b', n=2)
    expected_bigrams = {('a', 'a'): 2, ('a', 'b'): 1}
    assert len(counts) == 5
    for gram, expected in expected_bigrams.items():
        assert counts[gram] == expected
def test_window():
    """window yields len(s) + 1 - size windows, each of the requested size."""
    text = 'asdfasdfasdf'
    for size in (2, 3):
        grams = window(text, size)
        assert len(grams) == len(text) + 1 - size
        assert len(grams[0]) == size
def test_strip():
    """Only the link text is left once the tags are stripped."""
    words = strip_tags(FAKE_HTML).split()
    assert words == ['link', 'link']
def test_cosine():
    """cosine is 1 for identical inputs, symmetric, and about 0.4 here."""
    left = Counter('aaaabbb')
    right = Counter('bbbccc')
    assert cosine(left, left) == 1.0
    assert cosine(left, right) == pytest.approx(.4, rel=.1)
    assert cosine(left, right) == cosine(right, left)


if __name__ == '__main__':
    pytest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment