This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import tensorflow as tf | |
def np_cosine_similarity(u, v):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        u: array of shape (m, d) — m vectors of dimension d.
        v: array of shape (k, d) — k vectors of dimension d.

    Returns:
        Array of shape (m, k) where entry (i, j) is the cosine similarity
        between u[i] and v[j].

    Note:
        A zero-norm row still triggers a divide-by-zero (nan/inf), matching
        the original behavior — callers are expected to pass non-zero vectors.
    """
    # u @ v.T computes every pairwise dot product in one BLAS call,
    # avoiding the (m, k, d) broadcast temporary that the previous
    # expand_dims + elementwise-multiply approach materialized.
    n = u @ v.T
    # Outer product of the row norms gives the (m, k) denominator grid.
    d = np.outer(np.linalg.norm(u, axis=1), np.linalg.norm(v, axis=1))
    return n / d
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

# Random inputs: 1000 query vectors against 50 reference vectors, dim 25.
# NOTE(review): `x`, `y` and `start` look like they are read by code past
# this snippet (the elapsed time is never printed here) — names kept.
x = np.random.rand(1000, 25)
y = np.random.rand(50, 25)

# Benchmark the NumPy implementation across repeated calls; results are
# discarded, only wall-clock time matters.
start = time.time()
for _run in range(100):
    np_cosine_similarity(x, y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small random demo matrices: five 5-d vectors compared against one.
x = np.random.rand(5, 5)
y = np.random.rand(1, 5)

# Show the inputs before computing.
print("x:", "\n", x)
print("y:", "\n", y)

# Calculate cosine similarity in NumPy
results = np_cosine_similarity(x, y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
import pprint
import random

from statistics import mean, stdev

# Shared pretty-printer for readable dict/list output below.
pp = pprint.PrettyPrinter()

# Fixed seed so every run produces the same random data.
random.seed(500)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from nltk.stem.porter import PorterStemmer | |
class Tokenizer(object):
    """Tokenizer configuration holding the stop word list."""

    # Standard stop words used by Lucene/Elasticsearch, stored as a set
    # for O(1) membership tests.
    STOP_WORDS = set(
        "a an and are as at be but by for if in into is it "
        "no not of on or such that the their then there these "
        "they this to was will with".split()
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import pytz | |
>>> from pytz import timezone | |
>>> utc = pytz.utc | |
>>> eastern = timezone('US/Eastern') | |
>>> from datetime import datetime
>>> date = datetime.now(utc)
>>> date | |
datetime.datetime(2020, 2, 6, 21, 22, 49, 384018, tzinfo=<UTC>) | |
>>> date.astimezone(eastern) | |
datetime.datetime(2020, 2, 6, 16, 22, 49, 384018, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from datetime import datetime, timedelta | |
>>> date = datetime.now() | |
>>> date | |
datetime.datetime(2020, 2, 6, 14, 49, 14, 277747) | |
>>> date - timedelta(minutes=10) | |
datetime.datetime(2020, 2, 6, 14, 39, 14, 277747) | |
>>> date - timedelta(hours=10) | |
datetime.datetime(2020, 2, 6, 4, 49, 14, 277747) | |
>>> date - timedelta(days=10) | |
datetime.datetime(2020, 1, 27, 14, 49, 14, 277747) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings | |
# Create embeddings model, backed by sentence-transformers & transformers | |
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"}) | |
data = ["US tops 5 million confirmed virus cases", | |
"Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg", | |
"Beijing mobilises invasion craft along coast as Taiwan tensions escalate", | |
"The National Park Service warns against sacrificing slower friends in a bear attack", | |
"Maine man wins $1M from $25 lottery ticket", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an index for the list of text | |
embeddings.index([(uid, text, None) for uid, text in enumerate(data)]) | |
print("%-20s %s" % ("Query", "Best Match")) | |
print("-" * 50) | |
# Run an embeddings search for each query | |
for query in ("feel good story", "climate change", "public health story", "war", "wildlife", | |
"asia", "lucky", "dishonest junk"): | |
# Extract uid of first result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

# Embeddings model, backed by sentence-transformers & transformers.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# Extractor pairing the embeddings index with a SQuAD-distilled model.
extractor = Extractor(embeddings, "distilbert-base-cased-distilled-squad")
OlderNewer