@rdisipio
Last active January 22, 2020 15:32
Script to download abstracts from the arXiv server
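The first script imports clean_text and normalize_text_nltk from a local utils module that is not included in this gist. A minimal stand-in for clean_text, assuming all that is needed is stripping inline LaTeX math, collapsing whitespace and lowercasing (the author's real helper may do more):

# utils.py -- hypothetical minimal stand-in, not the author's original helper
import re

def clean_text(text):
    # drop $...$ math fragments, collapse newlines/whitespace, lowercase
    text = re.sub(r'\$[^$]*\$', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()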
#!/usr/bin/env python
import os, sys
import pickle
import numpy as np
import pandas as pd
import urllib.request
import re
import feedparser
import time
from tqdm import tqdm
from utils import clean_text, normalize_text_nltk
import tensorflow_hub as hub
categories = ['astro-ph', 'hep-ex', 'cs.AI', 'q-bio.BM', 'math.NT']
map_categories = { c:i for i,c in enumerate(categories) }  # category name -> integer id
total_results = 1000
if len(sys.argv) > 1:
    total_results = int(sys.argv[1])
results_per_iteration = total_results//10 if total_results < 1000 else total_results//100
wait_time = 3 # seconds
print("There are {} known categories: {}".format(len(categories), categories))
print("Each batch will contain {} articles".format(results_per_iteration))
base_url = 'http://export.arxiv.org/api/query?'
df = pd.DataFrame(columns=["abstract", "category_txt", "category_id"])
for category in categories:
    print("Processing category {}...".format(category))
    c_id = map_categories[category]
    search_query = "cat:{}".format(category)
    start = 0
    articles_in_batch = []
    for i in tqdm(range(start, total_results, results_per_iteration)):
        query = "search_query={}&start={}&max_results={}".format(search_query, i, results_per_iteration)
        url = base_url + query
        response = urllib.request.urlopen(url)
        feed = feedparser.parse(response)
        for entry in feed.entries:
            abstract = entry.summary
            clean_abstract = clean_text(abstract)
            article = {
                'abstract': clean_abstract,
                'category_txt': category,
                'category_id': c_id,
            }
            articles_in_batch.append(article)
        time.sleep(wait_time)  # pause between requests, as recommended by the arXiv API
    print("Found {} articles for category {}".format(len(articles_in_batch), category))
    df = pd.concat([df, pd.DataFrame(articles_in_batch)], ignore_index=True)
print("Found {} articles".format(df.shape[0]))
print(df.sample(n=10))
print("Creating embeddings...")
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)
X_txt = df.abstract.values
X_embed = embed(X_txt)
print(X_embed.shape)
X_embed = [ np.array(emb) for emb in X_embed]
df['embedding'] = X_embed
print(df.sample(n=10))
with open("arxiv_abstracts.pkl", 'wb') as f_out:
    pickle.dump(df, f_out)
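The script takes the total number of abstracts per category as its only command-line argument, so fetching, say, 5000 per category would look like "python get_abstracts.py 5000" (the gist does not show filenames, so get_abstracts.py is only a placeholder). The resulting pickle can be loaded back as a DataFrame for a quick check:

# quick sanity check of the output file
import pandas as pd
df = pd.read_pickle("arxiv_abstracts.pkl")
print(df.shape, df.columns.tolist())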
#!/usr/bin/env python
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
#########################################
def calc_similarities(X, Y):
    sims = cosine_similarity(X, Y)
    sims = sims.flatten()
    sims = sims[sims < 1.]  # drop self-similarities
    #sims = np.arccos(sims)/np.pi # radians
    #sims = 1. - sims
    return sims
#########################################
if __name__ == '__main__':
    SUBSPACE_DIM = .20  # keep enough PCA components to explain 20% of the variance

    f_name = "arxiv_abstracts.pkl"
    with open(f_name, 'rb') as f:
        df = pickle.load(f)

    print(df.sample(n=10))
    print("Total number of examples: {}".format(df.shape[0]))

    cat_txt = df['category_txt'].drop_duplicates().values
    print("Known categories:", cat_txt)
    cat_ids = df['category_id'].drop_duplicates().values
    n_categories = len(cat_ids)
    print("Number of categories: {}".format(n_categories))

    pca = PCA(n_components=SUBSPACE_DIM)
    X = np.array([emb for emb in df['embedding'].values])
    pca.fit(X)
    X = pca.transform(X)
    pca_dim = X.shape[1]
    X = [list(X[i]) for i in range(X.shape[0])]
    df['pca'] = X
    print(df.head())

    fig, axes = plt.subplots(n_categories, n_categories, sharex=False, sharey=False, figsize=(12, 10))
    # hide the lower triangle: each pair of categories is plotted only once
    for i in range(n_categories):
        for j in range(n_categories):
            if i > j:
                axes[i, j].axis('off')

    all_similarities = []
    y_max = 0.30
    for id1 in range(n_categories):
        X = df.loc[df.category_id == id1]
        X_512 = np.array([x for x in X['embedding'].values])
        X_pca = np.array([x for x in X['pca'].values])
        for id2 in range(id1, n_categories):
            Y = df.loc[df.category_id == id2]
            Y_512 = np.array([y for y in Y['embedding'].values])
            Y_pca = np.array([y for y in Y['pca'].values])
            sims_512 = calc_similarities(X_512, Y_512)
            sims_pca = calc_similarities(X_pca, Y_pca)
            axes[id1, id2].hist(sims_512, 40, histtype='stepfilled',
                                weights=np.ones(len(sims_512)) / len(sims_512), label="512-dim")
            axes[id1, id2].hist(sims_pca, 40, histtype='stepfilled', alpha=0.7,
                                weights=np.ones(len(sims_pca)) / len(sims_pca), color='red',
                                label="PCA ({}-dim)".format(pca_dim))
            axes[id1, id2].legend(loc='upper left', prop={'size': 6})
            axes[id1, id2].set_xlim([-1., 1.])
            axes[id1, id2].set_ylim([0., y_max])
            #axes[id1, id2].set_ylabel("Fraction")
            axes[id1, id2].text(0.1, y_max - 0.03, cat_txt[id1], fontsize=9)
            if id1 != id2:
                axes[id1, id2].text(0.1, y_max - 0.06, cat_txt[id2], fontsize=9)

    plt.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95, hspace=0.5)
    plt.suptitle("Universal Sentence Embeddings - Dimensionality Reduction 512 -> {}".format(pca_dim))
    plt.show()
    fig.savefig(f_name.replace(".pkl", ".png"))
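If you are curious how many components the 20%-variance threshold actually keeps, one extra print after pca.fit(X) in the script above would show it (not in the original gist):

# report the number of retained components and the variance they explain
print("PCA kept {} components explaining {:.1%} of the variance".format(
    pca.n_components_, pca.explained_variance_ratio_.sum()))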
#!/usr/bin/env python
import pickle
from functools import partial
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
#########################################
def USELayer(embed, x):
    return embed(tf.squeeze(tf.cast(x, tf.string)))

def make_model(embed, n_categories, latent_dim=16, embedding_dim=512):
    UniversalEmbedding = partial(USELayer, embed)
    text_in = keras.Input(shape=(1,), dtype=tf.string, name="text_in")
    x = layers.Lambda(UniversalEmbedding, output_shape=(embedding_dim,))(text_in)
    x = layers.Dense(latent_dim, activation='relu')(x)
    x_out = layers.Dense(n_categories, activation='softmax')(x)
    return keras.Model(inputs=text_in, outputs=x_out, name="AbstractClassifier")
#########################################
if __name__ == '__main__':
    LATENT_DIM = 16
    TEST_SIZE = 0.2
    N_EPOCHS = 20
    BATCH_SIZE = 128

    f_name = "arxiv_abstracts.pkl"
    with open(f_name, 'rb') as f:
        df = pickle.load(f)

    categories = list(set(df['category_txt'].values))
    n_categories = len(categories)
    print("There are {} known categories: {}".format(n_categories, categories))

    X_txt = df['abstract'].values
    y = np.array(df['category_id'].values)
    y = to_categorical(y)

    X_txt_train, X_txt_test, y_train, y_test = train_test_split(X_txt, y, test_size=TEST_SIZE)
    print("Training set has {} samples".format(X_txt_train.shape[0]))
    print("Testing set has {} samples".format(X_txt_test.shape[0]))

    # initialize USE embedder
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.load(module_url)

    model = make_model(embed, n_categories=n_categories, latent_dim=LATENT_DIM)
    model.summary()

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['acc'],
    )

    print("Training...")
    callback = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.005)
    model.fit(
        X_txt_train, y_train,
        epochs=N_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        callbacks=[callback],
    )
    print("Done training")

    print("Testing...")
    test_score = model.evaluate(X_txt_test, y_test, verbose=2)
    print("Test loss: {:.4f}, test accuracy: {:.4f}".format(test_score[0], test_score[1]))
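A quick way to try the trained classifier on a new abstract (an illustration, not part of the original gist); the predicted class index is mapped back to a label via the id/name pairs already stored in the DataFrame:

# hypothetical usage example: classify a single new abstract
id_to_cat = dict(zip(df['category_id'], df['category_txt']))
sample = np.array(["We report a measurement of the Higgs boson production cross section in proton-proton collisions."])
probs = model.predict(sample)
print("Predicted category:", id_to_cat[int(np.argmax(probs, axis=-1)[0])])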