@estasney · Created February 26, 2018 00:32
Graph Structure - From Corpus, Using SQLAlchemy, CoreNLP, Gensim
# coding: utf-8
# In[2]:
import sys
sys.path.append(r"/home/eric/NLP_Tools/MyTools")
import Resume_Cleaner
cleaner = Resume_Cleaner.CleanerServer()
# Start the CoreNLP server from the Stanford CoreNLP folder:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
DB_ADDRESS = ""  # SQLAlchemy database URL goes here
from sqlalchemy import Column, Integer, ForeignKey, String, create_engine, Float
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import or_
Base = declarative_base()
def has_edge(word1_txt, word2_txt):
    """Get or create both nodes; if an edge already joins them, bump its freq.

    Returns (True, edge) when the edge exists, else (False, [word1, word2]).
    """
    word1 = session.query(Node).filter_by(name=word1_txt).first()
    if not word1:
        word1 = Node(name=word1_txt)
        session.add(word1)
    word2 = session.query(Node).filter_by(name=word2_txt).first()
    if not word2:
        word2 = Node(name=word2_txt)
        session.add(word2)
    edge = (session.query(Edge)
            .filter(or_(Edge.lower_node == word1, Edge.higher_node == word1))
            .filter(or_(Edge.lower_node == word2, Edge.higher_node == word2))
            .first())
    if edge:
        edge.freq += 1
        session.add(edge)
        return True, edge
    else:
        return False, [word1, word2]
class Node(Base):
    __tablename__ = 'node'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    scores = Column(Float)

    def neighbors(self):
        # Edges are undirected: this node may sit on either end, so gather
        # (name, freq) pairs from both backrefs.
        highers = [(x.higher_node.name, x.freq) for x in self.lower_edges]
        lowers = [(x.lower_node.name, x.freq) for x in self.higher_edges]
        return highers + lowers
class Edge(Base):
    __tablename__ = 'edge'
    lower_id = Column(Integer, ForeignKey('node.id'), primary_key=True)
    higher_id = Column(Integer, ForeignKey('node.id'), primary_key=True)
    lower_node = relationship(Node, primaryjoin=lower_id == Node.id,
                              backref='lower_edges')
    higher_node = relationship(Node, primaryjoin=higher_id == Node.id,
                               backref='higher_edges')
    freq = Column(Integer, default=0)

    def __init__(self, nodes: list):
        # Canonical ordering: the node with the smaller id is always
        # lower_node, so each unordered pair maps to exactly one row.
        node_dict = {node.id: node for node in nodes}
        lowest_id = min(node.id for node in nodes)
        highest_id = max(node.id for node in nodes)
        self.lower_node = node_dict[lowest_id]
        self.higher_node = node_dict[highest_id]
engine = create_engine(DB_ADDRESS)
Base.metadata.create_all(engine)
session = sessionmaker(engine)()
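# A quick usage sketch (not in the original gist): has_edge is a get-or-create
# over two Node rows plus an increment on the shared Edge. The tokens
# "python" and "flask" below are made-up examples.
found, result = has_edge("python", "flask")
if not found:
    # has_edge returned the two Node objects; flush so they get ids,
    # then link them with a fresh Edge
    word1, word2 = result
    session.flush()
    new_edge = Edge([word1, word2])
    new_edge.freq = 1
    session.add(new_edge)
session.commit()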
# In[6]:
from gensim.utils import chunkize_serial
from itertools import combinations
import csv
f = CSV_LOCATION  # path to the corpus CSV; constant defined elsewhere
WINDOW_SIZE = 7
def read_csv(f):
    with open(f, "r", newline='') as csv_file:
        reader = csv.reader(csv_file)
        for r in reader:
            yield r[0]

def split_text(text):
    for doc in text:
        yield doc.split()

def generate_chunks(doc, window_size):
    # chunkize_serial yields consecutive, non-overlapping chunks of
    # window_size tokens (not a sliding window)
    return chunkize_serial(doc, window_size, as_numpy=False)

def generate_combos(doc, window_size):
    # every unordered pair of tokens that co-occur within a chunk
    chunks = generate_chunks(doc, window_size)
    for chunk in chunks:
        for word_a, word_b in combinations(chunk, 2):
            yield word_a, word_b
raw_text = read_csv(f)
text = cleaner.stream_clean(raw_text)
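# Toy illustration (my addition): chunkize_serial splits a token list into
# consecutive chunks, and every unordered pair inside a chunk counts as one
# co-occurrence. The sentence below is made up.
demo_doc = "python developer with flask and sqlalchemy experience".split()
for pair in generate_combos(demo_doc, 3):
    print(pair)
# first chunk ('python', 'developer', 'with') yields
# ('python', 'developer'), ('python', 'with'), ('developer', 'with'), ...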
# In[7]:
my_words = set()
for doc in text:
    my_words.update(doc)
my_words = list(my_words)
my_i_word = {i: word for i, word in enumerate(my_words)}
my_word_i = {word: i for i, word in enumerate(my_words)}
# In[8]:
for k, v in my_i_word.items():
    assert my_word_i[v] == k
# In[10]:
def token_lookup(token):
    return my_word_i[token]

def doc2ids_stream(raw_stream):
    # map each co-occurring token pair to a sorted (id, id) tuple so that
    # (a, b) and (b, a) collapse to the same edge key
    cleaned_stream = cleaner.stream_clean(raw_stream)
    for doc in cleaned_stream:
        g = generate_combos(doc, WINDOW_SIZE)
        for wp in g:
            word1, word2 = token_lookup(wp[0]), token_lookup(wp[1])
            yield tuple(sorted((word1, word2)))
# In[11]:
# %%time
word_occur = {}

iddocs = doc2ids_stream(read_csv(f))

# count how often each (lower_id, higher_id) pair co-occurs
for wp in iddocs:
    word_occur[wp] = word_occur.get(wp, 0) + 1
# In[16]:
for k, v in my_i_word.items():
    node = Node(id=k, name=v)
    session.add(node)
session.commit()
# In[ ]:
# %%time
wc_length = len(word_occur)
counter = 0

for k, v in word_occur.items():
    word1 = session.query(Node).get(k[0])
    word2 = session.query(Node).get(k[1])

    new_edge = Edge([word1, word2])
    new_edge.freq = v
    session.add(new_edge)
    counter += 1
    if counter % 10000 == 0:
        print("{} of {}".format(counter, wc_length))

session.commit()
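# Hedged alternative (not in the original): word_occur keys are already
# sorted (lower_id, higher_id) tuples, so a single Core executemany insert
# skips the two SELECTs per pair and is much faster for large vocabularies.
session.execute(
    Edge.__table__.insert(),
    [{"lower_id": lo, "higher_id": hi, "freq": freq}
     for (lo, hi), freq in word_occur.items()]
)
session.commit()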
# In[ ]:
import numpy
from numpy import empty as empty_matrix
from scipy.linalg import eig
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs
nodes = session.query(Node).all()
# In[ ]:
def adj_matrix(node_list):
    # Build a sparse, row-normalized adjacency matrix: row i holds node i's
    # transition probabilities (edge weight / sum of its edge weights).
    row, col, data = [], [], []  # accumulators are local so repeat calls are safe
    length = len(node_list)
    for i in range(length):
        current_node = node_list[i]
        neighbors = current_node.neighbors()
        neighbors_sum = sum(w for wt, w in neighbors)
        for n_node, n_weight in neighbors:
            n_node_index = my_word_i[n_node]
            row.append(i)
            col.append(n_node_index)
            data.append(float(n_weight) / neighbors_sum)
    return csr_matrix((data, (row, col)), shape=(length, length))
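# Quick sanity check (my addition): each row of the matrix should sum to 1.0.
# This assumes every node has at least one neighbor; an isolated node would
# raise ZeroDivisionError inside adj_matrix before this point.
m = adj_matrix(nodes)
assert numpy.allclose(m.sum(axis=1), 1.0)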
def prob_matrix(node_list):
    dimension = len(node_list)
    matrix = empty_matrix((dimension, dimension))
    probability = 1.0 / float(dimension)
    matrix.fill(probability)
    return matrix

def principal_eigenvector(a):
    # scipy.sparse.linalg.eigs requires k < N - 1, so fall back to the dense
    # solver for 1x1 and 2x2 matrices
    if len(a) < 3:
        vals, vecs = eig(a)
        ind = numpy.abs(vals).argmax()
        return vecs[:, ind]
    else:
        vals, vecs = eigs(a, k=1)
        return vecs[:, 0]
def pagerank_scores(node_list, damping=0.85):
    # Damped PageRank: blend the edge-derived transition matrix with a
    # uniform teleport matrix, then take the principal left eigenvector.
    c = adj_matrix(node_list)  # was the global `nodes`; use the parameter
    p = prob_matrix(node_list)
    pagerank_matrix = damping * c.todense() + (1 - damping) * p
    vec = principal_eigenvector(pagerank_matrix.T)
    scores = {}
    for node in node_list:
        scores[node.name] = abs(vec[node.id])
    return scores
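# A minimal sketch (my addition) of what pagerank_scores computes, checked by
# power iteration on a hand-built 3-node row-stochastic matrix. The matrix is
# made up purely for illustration.
toy = numpy.array([[0.0, 1.0, 0.0],
                   [0.5, 0.0, 0.5],
                   [0.0, 1.0, 0.0]])
g = 0.85 * toy + 0.15 / 3.0          # damped transition matrix
v = numpy.ones(3) / 3.0
for _ in range(100):
    v = v @ g                        # left-multiply: v converges to PageRank
v /= v.sum()
# v should be proportional to numpy.abs(principal_eigenvector(g.T))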
# In[ ]:
# %%time
from operator import itemgetter

scores = pagerank_scores(nodes)

sorted_scores = sorted(scores.items(), key=itemgetter(1), reverse=True)
# In[ ]:
for score in sorted_scores:
    node_id = my_word_i[score[0]]
    s = score[1]
    node_obj = session.query(Node).get(node_id)
    node_obj.scores = s
    session.add(node_obj)
session.commit()
# In[ ]:
sorted_scores[:50]
# In[ ]:
import pandas as pd
# Note: edge_freqs is never defined above; a cell is likely missing. A
# hypothetical reconstruction: edge_freqs = [e.freq for e in session.query(Edge).all()]
df = pd.DataFrame(edge_freqs)
# In[ ]:
df.loc[df[0] > 50].describe()
# In[ ]:
from matplotlib import pyplot as plt
# In[ ]:
plt.plot(edge_freqs)
# In[ ]:
session.commit()
# In[ ]:
edges = session.query(Edge).all()
# In[ ]:
edges
# In[ ]:
n = session.query(Node).filter_by(name='java').first()
# In[ ]:
n.neighbors()
# In[ ]:
n.name
# In[ ]:
n.neighbors()