This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<title>Research Papers Graphing Network</title> | |
<script src="//d3js.org/d3.v4.min.js"></script> | |
<style> | |
text { | |
font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sknetwork.clustering import Louvain

## TF-IDF vectorisation of the relevant abstracts:
## uni- and bi-grams, capped at the 10,000 highest-scoring features.
tfidf_vectorizer_relevant = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
x_relevant_transformed = tfidf_vectorizer_relevant.fit_transform(df_core["abstract"])

### Cosine similarity: TfidfVectorizer l2-normalises each row by default,
### so the sparse product X * X.T is the pairwise cosine-similarity matrix.
pairwise_similarity = x_relevant_transformed * x_relevant_transformed.T
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from wordcloud import WordCloud

## Draw one word cloud per topic in a 10-column grid.
plt.figure(figsize=(40, 40))
# Rows needed for a 10-column grid. plt.subplot requires integer
# arguments; np.ceil returns a float, which raises in modern matplotlib.
j = int(np.ceil(len(set(labels_words)) / 10))
# NOTE(review): iterating a set gives arbitrary order and drops duplicate
# word strings -- confirm that is intended, otherwise iterate labels_words.
for topic_nbr, words in enumerate(set(labels_words)):
    plt.subplot(j, 10, topic_nbr + 1).set_title(str(topic_nbr))
    wordcloud = WordCloud(background_color="white", contour_color='steelblue')
    wordcloud.generate(words)
    plt.imshow(wordcloud.to_image())
    plt.axis("off")  # hide tick marks around each cloud
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Get Article Title
def get_title(topic_num):
    """Return the titles of all training articles assigned to cluster *topic_num*."""
    member_rows = np.where(km.labels_ == topic_num)[0]
    return [x_train.iloc[row]["Title"] for row in member_rows]
# Print Top 10 words for each cluster.
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; get_feature_names_out() is the supported API.
# Hoisted out of the comprehension so it is called once, not once per word.
feature_names = tfidf_vectorizer.get_feature_names_out()
# Map each cluster centroid back to tf-idf space, then take the indices of
# its 10 largest components (argsort ascending, reversed, first 10).
for i, x in enumerate(lsa.inverse_transform(km.cluster_centers_).argsort()[:, ::-1][:, :10]):
    words = [feature_names[n] for n in x]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cluster import KMeans | |
from sklearn.metrics import silhouette_score | |
# Accumulators for the K-means model-selection sweep: inertia for the
# elbow plot and silhouette scores for each candidate k.
sum_of_sq_distance = []
silhouette_score_list = []
## Loop through K and calculate silhouette score
# Candidate cluster counts to evaluate (2 through 79 inclusive).
K = range(2, 80)
for k in K: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import TruncatedSVD

# Sweep the LSA dimensionality and report cumulative explained variance,
# so a component count can be chosen by inspection of the printout.
for n_comp in range(1100, 3000, 100):
    lsa = TruncatedSVD(n_components=n_comp, n_iter=10, random_state=42)
    lsa.fit(x_transformed)
    print("Num components - ", n_comp, " ", lsa.explained_variance_ratio_.sum())

# 2800 components chosen from the sweep above; refit the final model.
lsa = TruncatedSVD(n_components=2800, n_iter=10, random_state=42)
lsa.fit(x_transformed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the cleaned article text: uni- and bi-grams, keeping only
# the 10,000 highest-tfidf features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
x_transformed = tfidf_vectorizer.fit_transform(core_df["cleaned_text"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keyword groups used to identify relevant papers; each group lists the
# surface forms that count as a hit for that concept.
keywords = {
    "online": ['online', 'technology', 'internet', 'web', 'social media',
               'Facebook', 'chat', 'chatroom', 'digital', 'webcam', 'cyber'],
    "children": ['children', 'child', 'minor', 'minors', 'infant', 'infants',
                 'underage', 'under-age', 'kid', 'teenager', 'teenagers', 'adolescent',
                 'adolescents', 'girl', 'girls', 'boy', 'boys'],
    # 'harassment' added alongside the original misspelling 'harrassment' so
    # correctly spelled text is not missed; the old term is kept so any
    # previously matched documents still match.
    "abuse": ['abuse', 'abusive', 'exploiting', 'exploitation', 'harrassment', 'harassment',
              'prostitution', 'groom', 'grooming', 'predator', 'predators',
              'pedophile', 'paedophile', 'maltreatment', 'trafficking', 'violence'],
    "sexual": ['sex', 'sexual', 'pornography', 'pornographic'],
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import nltk | |
import string | |
from cleantext import clean | |
nltk.download('stopwords') | |
def clean_text(text):
    """Normalise *text*: collapse extra whitespace, lowercase, and strip
    numbers and punctuation via cleantext.clean().

    Bug fix: the original called clean() but discarded its result, so the
    function always returned None.
    """
    # NOTE(review): nltk stopwords are downloaded above but not used here --
    # confirm no further processing (e.g. stopword removal) was truncated
    # from this snippet.
    return clean(text, all=False, extra_spaces=True, lowercase=True, numbers=True, punct=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import Libraries | |
import requests | |
import pandas as pd | |
import json | |
## Define lists to hold article information
# One entry per article will be appended to each list while paging
# through the publications API, keeping the columns aligned by index.
titles, authors, publisher, doi = [], [], [], []
NewerOlder