GitHub Gists by Sijuade Oguntayo (cydal)
<!DOCTYPE html>
<meta charset="utf-8">
<title>Research Papers Graphing Network</title>
<script src="//d3js.org/d3.v4.min.js"></script>
<style>
text {
  font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif;
}
</style>
from sklearn.feature_extraction.text import TfidfVectorizer
from sknetwork.clustering import Louvain

## Tfidf vectorizer over the relevant abstracts
tfidf_vectorizer_relevant = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
x_relevant_transformed = tfidf_vectorizer_relevant.fit_transform(df_core["abstract"])

## Cosine Similarity: TfidfVectorizer L2-normalizes each row, so the sparse
## dot product of the matrix with its transpose gives the pairwise cosine
## similarity between abstracts
pairwise_similarity = x_relevant_transformed * x_relevant_transformed.T
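
## A minimal sketch (assumed, not in the original gist) of why Louvain is
## imported above: threshold the similarity matrix into a sparse adjacency
## graph and detect communities of related papers. The 0.3 cutoff is an
## illustrative choice, not from the source.
import numpy as np
from scipy import sparse

adjacency = sparse.csr_matrix(pairwise_similarity.multiply(pairwise_similarity > 0.3))
adjacency.setdiag(0)  # drop self-similarity loops
adjacency.eliminate_zeros()

louvain = Louvain()
labels = louvain.fit_transform(adjacency)  # fit_predict in newer scikit-network releases
print("Number of communities:", len(np.unique(labels)))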
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## Plot one word cloud per topic, ten per row
plt.figure(figsize=(40, 40))
j = int(np.ceil(len(labels_words) / 10))  # number of subplot rows; must be an int
for topic_nbr, words in enumerate(labels_words):  # iterate the list directly; enumerating a set gives an unstable topic order
    plt.subplot(j, 10, topic_nbr + 1).set_title(str(topic_nbr))
    wordcloud = WordCloud(background_color="white", contour_color="steelblue")
    wordcloud.generate(words)
    plt.imshow(wordcloud.to_image())
    plt.axis("off")
## Get Article Titles
def get_title(topic_num):
    """Return the titles of all articles assigned to cluster `topic_num`."""
    idxs = np.where(km.labels_ == topic_num)[0]
    titles = [x_train.iloc[idx]["Title"] for idx in idxs]
    return titles
# Print Top 10 words for each cluster, read off the cluster centroids
# mapped back from LSA space to tf-idf feature space
for i, x in enumerate(lsa.inverse_transform(km.cluster_centers_).argsort()[:, ::-1][:, :10]):
    words = [tfidf_vectorizer.get_feature_names()[n] for n in x]
    print("Cluster", i, ":", ", ".join(words))
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

sum_of_sq_distance = []
silhouette_score_list = []
K = range(2, 80)

## Loop through K and calculate silhouette score (loop body reconstructed:
## fit KMeans, record inertia for the elbow method and the silhouette score)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(x_transformed)
    sum_of_sq_distance.append(km.inertia_)
    silhouette_score_list.append(silhouette_score(x_transformed, km.labels_))
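
## A minimal follow-up sketch (not in the original gist): pick the K with
## the highest silhouette score from the sweep above
import numpy as np
best_k = K[int(np.argmax(silhouette_score_list))]
print("Best K by silhouette score:", best_k)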
from sklearn.decomposition import TruncatedSVD

## Sweep the number of LSA components and report retained variance
for i in range(1100, 3000, 100):
    lsa = TruncatedSVD(n_components=i, n_iter=10, random_state=42)
    lsa.fit(x_transformed)
    print("Num components - ", i, " ", lsa.explained_variance_ratio_.sum())

## Refit with the chosen number of components
lsa = TruncatedSVD(n_components=2800, n_iter=10, random_state=42)
lsa.fit(x_transformed)
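
## A minimal sketch (assumed, not in the original): project the tf-idf matrix
## into the reduced LSA space. The earlier lsa.inverse_transform(km.cluster_centers_)
## implies KMeans was fit on a matrix like this one.
x_lsa = lsa.transform(x_transformed)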
## tfidf.py (cydal, last active March 18, 2021)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
x_transformed = tfidf_vectorizer.fit_transform(core_df["cleaned_text"])
## Keyword groups used to flag relevant abstracts
keywords = {
    "online": ['online', 'technology', 'internet', 'web', 'social media',
               'Facebook', 'chat', 'chatroom', 'digital', 'webcam', 'cyber'],
    "children": ['children', 'child', 'minor', 'minors', 'infant', 'infants',
                 'underage', 'under-age', 'kid', 'teenager', 'teenagers', 'adolescent',
                 'adolescents', 'girl', 'girls', 'boy', 'boys'],
    "abuse": ['abuse', 'abusive', 'exploiting', 'exploitation', 'harassment',
              'prostitution', 'groom', 'grooming', 'predator', 'predators',
              'pedophile', 'paedophile', 'maltreatment', 'trafficking', 'violence'],
    "sexual": ['sex', 'sexual', 'pornography', 'pornographic']
}
import pandas as pd
import numpy as np
import nltk
import string
from cleantext import clean

nltk.download('stopwords')

## Normalize an abstract: strip extra spaces, lowercase, drop numbers and punctuation
def clean_text(text):
    return clean(text, all=False, extra_spaces=True, lowercase=True, numbers=True, punct=True)
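
## A minimal usage sketch (assumed): build the `cleaned_text` column that the
## TfidfVectorizer snippet above consumes
core_df["cleaned_text"] = core_df["abstract"].apply(clean_text)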
# Import Libraries
import requests
import pandas as pd
import json
## Define lists to hold article information
titles = []
authors = []
publisher = []
doi = []
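
## A minimal sketch (assumed, not in the original gist): fetch article
## metadata from a paginated JSON API and fill the lists above. The URL,
## field names, and `n_pages` are hypothetical placeholders.
n_pages = 10
for page in range(1, n_pages + 1):
    response = requests.get("https://api.example.org/articles", params={"page": page})
    for article in response.json()["results"]:
        titles.append(article.get("title"))
        authors.append(article.get("authors"))
        publisher.append(article.get("publisher"))
        doi.append(article.get("doi"))

## Assemble into a dataframe for downstream cleaning and vectorization
df = pd.DataFrame({"Title": titles, "authors": authors,
                   "publisher": publisher, "doi": doi})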