Text mining example in Python, a gist by @fnielsen (last active April 29, 2020)
# $Id: Nielsen2012Python_case.py,v 1.2 2012/09/02 16:55:25 fn Exp $
# Define a url as a Python string (note we are only getting 100 documents)
url = "http://wikilit.referata.com/" + \
"wiki/Special:Ask/" + \
"-5B-5BCategory:Publications-5D-5D/" + \
"-3FHas-20author%3DAuthor(s)/-3FYear/" + \
"-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/" + \
"-3FHas-20domain%3DDomain(s)/" + \
"format%3D-20csv/limit%3D-20100/offset%3D0"
# Import the 'urllib' module for Web page retrieval
from urllib import urlopen
# Get help on how to use the module
help('urllib')
# Get and read the web page
doc = urlopen(url).read() # Object from urlopen has read function
# Show the first 1000 characters
print(doc[:1000])
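# Note: this gist is written for Python 2. Under Python 3 the import above
# would presumably become 'from urllib.request import urlopen' and the bytes
# returned by read() would need decoding, e.g. doc.decode('utf-8'). This is
# only an aside, not part of the original example.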
# Import a CSV reader/writer library.
# Note: usually you will have all the imports at the top of the Python code.
import csv
web = urlopen(url)
# 'web' is now a file-like handle
lines = csv.reader(web, delimiter=',', quotechar='"')
# 'lines' is now an object that can be iterated over
# Iterate over 'lines'
for line in lines:
    print(line)
# Each row is of a Python 'list' type
isinstance(line, list) == True
# Or
type(line) == list
# Semantic MediaWiki can also export JSON instead of CSV
url_json = "http://wikilit.referata.com/" + \
"wiki/Special:Ask/" + \
"-5B-5BCategory:Publications-5D-5D/" + \
"-3FHas-20author/-3FYear/" + \
"-3FPublished-20in/-3FAbstract/-3FHas-20topic)/" + \
"-3FHas-20domain/" + \
"format%3D-20json"
# Python module for JSON reading (the standard library 'json' module would also work)
import simplejson as json
# Read JSON into a Python structure
response = json.load(urlopen(url_json))
# 'response' is now a hash/dictionary
response.keys()
# Result: ['rows', 'results', 'printrequests']
# response['printrequests'] is a list; map iterates over it
columns = map(lambda item: item['label'], response['printrequests'])
# gives ['', 'Has author', 'Year', 'Published in', 'Abstract',
# 'Has topic)', 'Has domain']
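# The same column labels can also be extracted with a list comprehension,
# which is equivalent to the map/lambda expression above:
columns = [item['label'] for item in response['printrequests']]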
# Reread CSV
lines = csv.reader(urlopen(url), delimiter=',', quotechar='"')
# Iterate over 'lines' and insert them into a list of dictionaries
header = []
papers = []
for row in lines:  # csv module lacks unicode support!
    line = [unicode(cell, 'utf-8') for cell in row]
    if not header:  # Read the first line as header
        header = line
        continue
    papers.append(dict(zip(header, line)))
# 'papers' is now a list of dictionaries
# To get the first abstract:
papers[0]['Abstract']
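# The CSV header labels are now the dictionary keys; based on the query URL
# above they should include '' (the unnamed title column), 'Author(s)',
# 'Year', 'Published in', 'Abstract', 'Topic(s)' and 'Domain(s)':
papers[0].keys()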
# Get some natural language processing tools
import nltk
# Get words from first abstract
nltk.word_tokenize(papers[0]['Abstract'])
# Result: [u'Accounts', u'of', u'open', u'source', ...
# Lowercase the words. The 'string' module is part of the Python standard library
import string
map(string.lower, nltk.word_tokenize(papers[0]['Abstract']))
# Result: [u'accounts', u'of', u'open', u'source'
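# The same lowercasing can be written without the 'string' module by calling
# the 'lower' string method in a list comprehension:
[word.lower() for word in nltk.word_tokenize(papers[0]['Abstract'])]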
# Now for all papers
for paper in papers:
    words = map(string.lower, nltk.word_tokenize(paper['Abstract']))
    paper.update({'words': words})
# Double (nested) list comprehension collecting the words from all papers
all_words = [ word for paper in papers for word in paper['words'] ]
len(all_words)
# Result: 17059
# Unique words
len(set(all_words))
# Result: 3484
# Count the occurrences of all words
wordcounts = dict([ [t, all_words.count(t)] for t in set(all_words) ])
# Another way
wordcounts = {}
for term in all_words:
    wordcounts[term] = wordcounts.get(term, 0) + 1
# Change the ordering of value and key for sorting
items = [(v, k) for k, v in wordcounts.items()]
for count, word in sorted(items, reverse=True)[:5]:
    print("%5d %s" % (count, word))
# 913 the
# 706 of
# 658 ,
# 507 and
# 433 to
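# A more compact alternative for counting and ranking, available from
# Python 2.7 onwards, is collections.Counter (shown here as an aside;
# 'wordcounts' from above is kept for the following steps):
from collections import Counter
Counter(all_words).most_common(5)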
# Filter out common words
import nltk.corpus
stopwords = nltk.corpus.stopwords.words('english')
terms = {}
for word, count in wordcounts.iteritems():
    if count > 2 and word not in stopwords and word.isalpha():
        terms[word] = count
# Change the ordering of value and key for sorting
items = [(v, k) for k, v in terms.items()]
for count, word in sorted(items, reverse=True)[:5]:
    print("%5d %s" % (count, word))
# 213 wikipedia
# 64 knowledge
# 64 article
# 54 information
# 50 articles
# Wikipedia is the main topic of all the papers, so remove that word
terms.pop('wikipedia')
# Convert the dictionary to a list of its keys (the unique terms)
terms = list(terms)
# Import the numerical module (NumPy)
import numpy as np
# Construct a bag-of-words matrix
M = np.asmatrix(np.zeros([len(papers), len(terms)]))
for n, paper in enumerate(papers):
    for m, term in enumerate(terms):
        M[n, m] = paper['words'].count(term)
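# As an aside, a similar bag-of-words matrix could be built with
# scikit-learn's CountVectorizer, if scikit-learn is installed; this is only
# a sketch, and the counts may differ slightly because of its tokenization:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(vocabulary=terms, lowercase=True)
# M_alt = vectorizer.fit_transform([paper['Abstract'] for paper in papers])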
# Define a topic mining function (non-negative matrix factorization)
def nmf(M, components=5, iterations=5000):
    # Initialize two random matrices
    W = np.asmatrix(np.random.random([M.shape[0], components]))
    H = np.asmatrix(np.random.random([components, M.shape[1]]))
    for n in range(0, iterations):
        # Multiplicative update rules (the small constant avoids division by zero)
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
        print("%d/%d" % (n, iterations))  # The 'logging' module would be better suited here
    return (W, H)
# Perform the actual computation
W, H = nmf(M, iterations=50, components=3)
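# As an alternative to the hand-written function, scikit-learn also provides
# an NMF implementation (again only a sketch, assuming scikit-learn is
# available):
# from sklearn.decomposition import NMF
# model = NMF(n_components=3, max_iter=50)
# W_alt = model.fit_transform(np.asarray(M))
# H_alt = model.components_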
# Show the results: top terms and top papers for each component
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component,))
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([terms[i] for i in indices[:6]]))
    print("-")
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([papers[i][''] for i in indices[:5]]))
results = """
================================================================================
COMPONENT 0:
knowledge - article - information - articles - approach - use
-
Constructing commons in the cultural environment
A systemic and cognitive view on collaborative knowledge building with wikis
Academics and Wikipedia: reframing Web 2.0+as a disruptor of traditional academic power-knowledge arrangements
Addressing gaps in knowledge while reading
Contextual retrieval of single Wikipedia articles to support the reading of academic abstracts
================================================================================
COMPONENT 1:
web - media - users - content - production - sites
-
A cultural and political economy of Web 2.0
Academics and Wikipedia: reframing Web 2.0+as a disruptor of traditional academic power-knowledge arrangements
Applications of semantic web methodologies and techniques to social networks and social websites
Classifying tags using open content resources
A Wikipedia matching approach to contextual advertising
================================================================================
COMPONENT 2:
question - classification - answer - systems - answering - method
-
A semantic approach for question classification using WordNet and Wikipedia
A comparison of World Wide Web resources for identifying medical information
Adaptive indexing for content-based search in P2P systems
BinRank: scaling dynamic authority-based search using materialized subgraphs
Classifying tags using open content resources
"""
# Import a graph library and plotting library
import networkx as nx
import matplotlib.pyplot as plt
# Generate coauthor graph
coauthor_graph = nx.Graph()
for paper in papers:
    # Split the author field and strip surrounding whitespace from the names
    coauthors = [author.strip() for author in paper['Author(s)'].split(',')]
    # Add an edge for every distinct pair of coauthors
    for n in range(len(coauthors) - 1):
        for m in range(n + 1, len(coauthors)):
            coauthor_graph.add_edge(coauthors[n], coauthors[m])
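# A quick check of the size of the coauthor graph before plotting:
coauthor_graph.number_of_nodes()
coauthor_graph.number_of_edges()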
# Extract the connected components as subgraphs
# (in older NetworkX versions this returns a list, largest component first)
author_communities = nx.connected_component_subgraphs(coauthor_graph)
# Plot the largest component
nx.draw(author_communities[0])
plt.show()
# plt.savefig("coauthorgraph.png")
@jtrenzel commented Jun 6, 2018:
Should this code run as-written? Or is there a certain webpage or csv file that needs to be opened prior to running?
