Graph Structure - From Corpus, Using SQLAlchemy, CoreNLP, Gensim
# coding: utf-8

# In[2]:

import sys
sys.path.append(r"/home/eric/NLP_Tools/MyTools")
import Resume_Cleaner
cleaner = Resume_Cleaner.CleanerServer()

# Start the CoreNLP server first, from the Stanford CoreNLP folder:
# "java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer"

DB_ADDRESS = ""  # SQLAlchemy database URL (placeholder)

from sqlalchemy import Column, Integer, ForeignKey, String, create_engine, Float
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import or_

Base = declarative_base()
def has_edge(word1_txt, word2_txt):
    # Look up (or create) the two Node rows, then check whether an Edge
    # already connects them in either direction.
    word1 = session.query(Node).filter_by(name=word1_txt).first()
    if not word1:
        word1 = Node(name=word1_txt)
        session.add(word1)
    word2 = session.query(Node).filter_by(name=word2_txt).first()
    if not word2:
        word2 = Node(name=word2_txt)
        session.add(word2)
    edge = session.query(Edge).filter(or_(Edge.lower_node == word1, Edge.higher_node == word1)) \
                              .filter(or_(Edge.lower_node == word2, Edge.higher_node == word2)).first()
    if edge:
        # Existing edge: bump its co-occurrence count
        edge.freq += 1
        session.add(edge)
        return True, edge
    else:
        return False, [word1, word2]
class Node(Base):
    __tablename__ = 'node'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    scores = Column(Float)

    def neighbors(self):
        # Gather (neighbor name, edge frequency) pairs from edges in both directions
        highers = [(x.higher_node.name, x.freq) for x in self.lower_edges]
        lowers = [(x.lower_node.name, x.freq) for x in self.higher_edges]
        return highers + lowers
class Edge(Base):
    __tablename__ = 'edge'
    lower_id = Column(
        Integer,
        ForeignKey('node.id'),
        primary_key=True)
    higher_id = Column(
        Integer,
        ForeignKey('node.id'),
        primary_key=True)
    lower_node = relationship(
        Node,
        primaryjoin=lower_id == Node.id,
        backref='lower_edges')
    higher_node = relationship(
        Node,
        primaryjoin=higher_id == Node.id,
        backref='higher_edges')
    freq = Column(Integer, default=0)

    def __init__(self, nodes: list):
        # Store the pair in canonical order: the node with the smaller id
        # becomes lower_node, the one with the larger id becomes higher_node
        node_dict = {node.id: node for node in nodes}
        lowest_id, highest_id = min(node_dict), max(node_dict)
        n1, n2 = node_dict[lowest_id], node_dict[highest_id]
        self.lower_node = n1
        self.higher_node = n2


engine = create_engine(DB_ADDRESS)
Base.metadata.create_all(engine)
session = sessionmaker(engine)()
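# A minimal usage sketch of the model (names are illustrative; assumes
# DB_ADDRESS points at a throwaway database such as "sqlite://"):
#
#     n1, n2 = Node(id=0, name="python"), Node(id=1, name="java")
#     session.add_all([n1, n2])
#     e = Edge([n1, n2])
#     e.freq = 3
#     session.add(e)
#     session.commit()
#     n1.neighbors()  # -> [('java', 3)]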
# In[6]:

from gensim.utils import chunkize_serial
from itertools import combinations
import csv

f = CSV_LOCATION  # path to the corpus CSV (placeholder)
WINDOW_SIZE = 7

def read_csv(f):
    # Yield the first column of each row as one raw document
    with open(f, "r", newline='') as csv_file:
        reader = csv.reader(csv_file)
        for r in reader:
            yield r[0]

def split_text(text):
    for doc in text:
        yield doc.split()

def generate_chunks(doc, window_size):
    return chunkize_serial(doc, window_size, as_numpy=False)

def generate_combos(doc, window_size):
    # Split a tokenized doc into fixed-size windows and yield every
    # unordered pair of tokens that co-occur within a window
    chunks = generate_chunks(doc, window_size)
    for chunk in chunks:
        for word_a, word_b in combinations(chunk, 2):
            yield word_a, word_b

raw_text = read_csv(f)
text = cleaner.stream_clean(raw_text)
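# For illustration: with a window of 3, a toy doc ["a", "b", "c", "d"] is
# chunked into ["a", "b", "c"] and ["d"], so generate_combos yields only the
# pairs from the first chunk:
#
#     list(generate_combos(["a", "b", "c", "d"], 3))
#     # -> [('a', 'b'), ('a', 'c'), ('b', 'c')]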
# In[7]:

my_words = set()
for doc in text:
    my_words.update(set(doc))
my_words = list(my_words)
my_i_word = {i: word for i, word in enumerate(my_words)}
my_word_i = {word: i for i, word in enumerate(my_words)}

# In[8]:

# Sanity check: the two lookup dicts are inverses of one another
for k, v in my_i_word.items():
    assert my_word_i[v] == k
# In[10]:

def token_lookup(token):
    return my_word_i[token]

def doc2ids_stream(raw_stream):
    # Map every co-occurring word pair to a sorted tuple of vocabulary ids,
    # so (a, b) and (b, a) count as the same pair
    cleaned_stream = cleaner.stream_clean(raw_stream)
    for doc in cleaned_stream:
        g = generate_combos(doc, WINDOW_SIZE)
        for wp in g:
            word1, word2 = token_lookup(wp[0]), token_lookup(wp[1])
            sp = tuple(sorted((word1, word2)))
            yield sp
# In[11]:

# %%time
# Count how often each (lower_id, higher_id) pair co-occurs across the corpus
word_occur = {}

iddocs = doc2ids_stream(read_csv(f))

for wp in iddocs:
    cc = word_occur.get(wp, 0)
    nc = cc + 1
    word_occur[wp] = nc
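# The dict.get counting above is equivalent to using collections.Counter;
# a minimal alternative sketch (Counter is not imported in the original cells):
#
#     from collections import Counter
#     word_occur = Counter(doc2ids_stream(read_csv(f)))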
# In[16]:

# Persist one Node per vocabulary entry, keyed by its vocabulary index
for k, v in my_i_word.items():
    node = Node(id=k, name=v)
    session.add(node)
session.commit()

# In[ ]:

# %%time
# Persist one Edge per co-occurring pair, weighted by its count
wc_length = len(word_occur.items())
counter = 0

for k, v in word_occur.items():
    word1 = session.query(Node).get(k[0])
    word2 = session.query(Node).get(k[1])

    new_edge = Edge([word1, word2])
    new_edge.freq = v
    session.add(new_edge)
    counter += 1
    if counter % 10000 == 0:
        print("{} of {}".format(counter, wc_length))

session.commit()
# In[ ]:

import numpy
from numpy import empty as empty_matrix
from scipy.linalg import eig
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs

nodes = session.query(Node).all()
length = len(nodes)
# In[ ]:

def adj_matrix(node_list):
    # Build a sparse, row-normalized adjacency matrix: row i is a node's
    # index, each column is a neighbor's index, and each entry is that
    # edge's share of the node's total edge weight
    row = []
    col = []
    data = []
    length = len(node_list)
    for i in range(length):
        current_node = node_list[i]
        neighbors = current_node.neighbors()
        neighbors_sum = sum([w for wt, w in neighbors])
        for n_data in neighbors:
            n_node, n_weight = n_data[0], n_data[1]
            n_node_index = my_word_i[n_node]
            edge_weight = float(n_weight)
            row.append(i)
            col.append(n_node_index)
            data.append(edge_weight / neighbors_sum)
    return csr_matrix((data, (row, col)), shape=(length, length))

def prob_matrix(node_list):
    # Uniform "teleport" matrix: every entry is 1 / N
    dimension = len(node_list)
    matrix = empty_matrix((dimension, dimension))
    probability = 1.0 / float(dimension)
    matrix.fill(probability)
    return matrix

def principal_eigenvector(a):
    # scipy.sparse.linalg.eigs requires k < N - 1, so fall back to the
    # dense solver for very small matrices
    if len(a) < 3:
        vals, vecs = eig(a)
        ind = numpy.abs(vals).argmax()
        return vecs[:, ind]
    else:
        vals, vecs = eigs(a, k=1)
        return vecs[:, 0]

def pagerank_scores(node_list, damping=0.85):
    # Google-matrix form of PageRank: damping * adjacency + (1 - damping) * teleport;
    # the principal eigenvector of its transpose gives one score per node
    c = adj_matrix(node_list)
    p = prob_matrix(node_list)
    pagerank_matrix = damping * c.todense() + (1 - damping) * p
    vec = principal_eigenvector(pagerank_matrix.T)
    scores = {}
    for node in node_list:
        score = abs(vec[node.id])
        scores[node.name] = score
    return scores
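# A small sanity-check sketch of the eigenvector step on a symmetric 2-node
# matrix (illustrative only, separate from the corpus pipeline):
#
#     toy = numpy.array([[0.0, 1.0],
#                        [1.0, 0.0]])
#     toy_pr = 0.85 * toy + 0.15 * numpy.full((2, 2), 0.5)
#     principal_eigenvector(toy_pr.T)
#     # -> a vector proportional to [0.707, 0.707]; both nodes score the
#     #    same because the toy graph is symmetric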
# In[ ]:

# %%time
from operator import itemgetter

scores = pagerank_scores(nodes)

sorted_scores = sorted(list(scores.items()), key=itemgetter(1), reverse=True)

# In[ ]:

# Write each node's PageRank score back to its database row
for score in sorted_scores:
    node_id = my_word_i[score[0]]
    s = score[1]
    node_obj = session.query(Node).get(node_id)
    node_obj.scores = s
    session.add(node_obj)
session.commit()
# In[ ]:

sorted_scores[:50]

# In[ ]:

import pandas as pd
# Assumed definition of edge_freqs (used below but not set in the original
# cells): the list of every edge's co-occurrence count
edge_freqs = [e.freq for e in session.query(Edge).all()]
df = pd.DataFrame(edge_freqs)

# In[ ]:

df.loc[df[0] > 50].describe()

# In[ ]:

from matplotlib import pyplot as plt

# In[ ]:

plt.plot(edge_freqs)

# In[ ]:

session.commit()
# In[ ]:

edges = session.query(Edge).all()

# In[ ]:

edges

# In[ ]:

# Inspect the neighborhood of a single term
n = session.query(Node).filter_by(name='java').first()

# In[ ]:

n.neighbors()

# In[ ]:

n.name

# In[ ]:

n.neighbors()
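# To rank a term's strongest co-occurrences, the neighbor list can be sorted
# by frequency; a small sketch reusing the itemgetter imported above:
#
#     sorted(n.neighbors(), key=itemgetter(1), reverse=True)[:10]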