This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import tensorflow as tf | |
def np_cosine_similarity(u, v):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        u: array of shape (m, d) — m vectors of dimension d.
        v: array of shape (k, d) — k vectors of dimension d.

    Returns:
        Array of shape (m, k) where entry (i, j) is the cosine similarity
        between u[i] and v[j].

    Note:
        A zero-norm row still triggers a divide-by-zero (nan/inf), matching
        the original behavior — callers are expected to pass non-zero vectors.
    """
    # u @ v.T computes every pairwise dot product in one BLAS call,
    # avoiding the (m, k, d) broadcast temporary that the previous
    # expand_dims + elementwise-multiply approach materialized.
    n = u @ v.T
    # Outer product of the row norms gives the (m, k) denominator grid.
    d = np.outer(np.linalg.norm(u, axis=1), np.linalg.norm(v, axis=1))
    return n / d
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

# Random inputs: 1000 query vectors against 50 reference vectors, dim 25.
# NOTE(review): `x`, `y` and `start` look like they are read by code past
# this snippet (the elapsed time is never printed here) — names kept.
x = np.random.rand(1000, 25)
y = np.random.rand(50, 25)

# Benchmark the NumPy implementation across repeated calls; results are
# discarded, only wall-clock time matters.
start = time.time()
for _run in range(100):
    np_cosine_similarity(x, y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small random demo matrices: five 5-d vectors compared against one.
x = np.random.rand(5, 5)
y = np.random.rand(1, 5)

# Show the inputs before computing.
print("x:", "\n", x)
print("y:", "\n", y)

# Calculate cosine similarity in NumPy
results = np_cosine_similarity(x, y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
import pprint
import random

from statistics import mean, stdev

# Shared pretty-printer for readable dict/list output below.
pp = pprint.PrettyPrinter()

# Fixed seed so every run produces the same random data.
random.seed(500)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from nltk.stem.porter import PorterStemmer | |
class Tokenizer(object):
    """Tokenizer configuration holding the stop word list."""

    # Standard stop words used by Lucene/Elasticsearch, stored as a set
    # for O(1) membership tests.
    STOP_WORDS = set(
        "a an and are as at be but by for if in into is it "
        "no not of on or such that the their then there these "
        "they this to was will with".split()
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import pytz | |
>>> from pytz import timezone | |
>>> utc = pytz.utc | |
>>> eastern = timezone('US/Eastern') | |
>>> from datetime import datetime
>>> date = datetime.now(utc)
>>> date | |
datetime.datetime(2020, 2, 6, 21, 22, 49, 384018, tzinfo=<UTC>) | |
>>> date.astimezone(eastern) | |
datetime.datetime(2020, 2, 6, 16, 22, 49, 384018, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from datetime import datetime, timedelta | |
>>> date = datetime.now() | |
>>> date | |
datetime.datetime(2020, 2, 6, 14, 49, 14, 277747) | |
>>> date - timedelta(minutes=10) | |
datetime.datetime(2020, 2, 6, 14, 39, 14, 277747) | |
>>> date - timedelta(hours=10) | |
datetime.datetime(2020, 2, 6, 4, 49, 14, 277747) | |
>>> date - timedelta(days=10) | |
datetime.datetime(2020, 1, 27, 14, 49, 14, 277747) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings | |
# Create embeddings model, backed by sentence-transformers & transformers | |
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"}) | |
data = ["US tops 5 million confirmed virus cases", | |
"Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg", | |
"Beijing mobilises invasion craft along coast as Taiwan tensions escalate", | |
"The National Park Service warns against sacrificing slower friends in a bear attack", | |
"Maine man wins $1M from $25 lottery ticket", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an index for the list of text | |
embeddings.index([(uid, text, None) for uid, text in enumerate(data)]) | |
print("%-20s %s" % ("Query", "Best Match")) | |
print("-" * 50) | |
# Run an embeddings search for each query | |
for query in ("feel good story", "climate change", "public health story", "war", "wildlife", | |
"asia", "lucky", "dishonest junk"): | |
# Extract uid of first result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

# Embeddings model, backed by sentence-transformers & transformers.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# Extractor pairing the embeddings index with a SQuAD-distilled model.
extractor = Extractor(embeddings, "distilbert-base-cased-distilled-squad")
OlderNewer