Skip to content

Instantly share code, notes, and snippets.

from itertools import islice

# Preview the first ten lines of the abstracts file as a quick sanity check.
# The path is a raw string: in an ordinary literal "\U" starts a Unicode
# escape, which is a SyntaxError on Python 3.
with open(r"C:\Users\graingec\spillovers\abstracts\abstracts.txt", 'rb') as f:
    # islice stops after ten lines. The original reset its counter for every
    # line it read, so the counter never limited the output to ten lines.
    for line in islice(f, 10):
        print(line)
import sys
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
import re, string
pattern=re.compile(r'[^a-zA-Z ]')
def clean(x):
x = x.replace('<image>','')
x = pattern.sub('',x.lower())
x = x.replace('\r','')
Determining the novelty of patents using topic models
===
The novelty measure builds on work by Kaplan & Vakili (2013), who use topic models to find 'breakthrough technologies'.
The novelty measure ($\lambda$) for each patent $p$ in each time period $y$ is the sum of the novelty scores ($\gamma$) in that time period over every topic $t$ whose proportion in the patent is over the cutoff score $c$. It is found by a simple algorithm:
1. For each topic-period, count the patents whose topic proportion is at or above the threshold $c$ (where $\beta_{pt}$ is the proportion of topic $t$ in the distribution of topics over patent $p$): $$\theta_{ty}=\sum_{p} x_{pt} \text{ where } x_{pt} = \begin{cases} 1 & \text{if } \beta_{pt} \ge c \\ 0 & \text{if } \beta_{pt} < c \end{cases}$$ The sum runs over the patents $p$ in period $y$.
3. To find the novelty score for each topic-period ($\gamma_{ty}$), find the first period in which $\theta_{ty}\ge 1$ ($y_{init}$) and set $\gamma_{ty}$ to 1, find the period of full diffusion ($y_{\text{max}[\theta_{t}]}$) and set $\gamma_{ty
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
pattern=re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
<body style='text-transform:none;'>
The Colombian Coffee Company:<br>
We are a social enterprise ethically committed to supporting coffee-growing communities in Colombia through direct trade and single-origin coffee sales, art, and photography projects.<br>
<br>
The Job:<br>
Once you understand our ethical initiative, you will:<br>
- Prepare premium coffee drinks<br>
- Make sure that everything runs smoothly at the bar<br>
- Provide excellent customer service to all of the lovely festival-goers!<br>
<br>
# Imports and housekeeping
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from gensim import corpora, models, similarities
import numpy as np
import matplotlib.pyplot as plt
# Define KL functions
def kl(p,q):
import os
from subprocess import Popen

# Start the Pyro4 name server, one gensim LDA worker and the LDA dispatcher
# as background processes.
#
# subprocess runs ONE command per call: given a list of five shell lines,
# call() treats the first string as the program name and the rest as its
# arguments, so none of the commands is started. `export` and a trailing `&`
# are shell syntax, so the variables are set through os.environ (child
# processes inherit them) and Popen is used because it does not wait for the
# child to finish.
os.environ['PYRO_SERIALIZERS_ACCEPTED'] = 'pickle'
os.environ['PYRO_SERIALIZER'] = 'pickle'

background_commands = [
    ['python', '-m', 'Pyro4.naming', '-n', '0.0.0.0'],
    ['python', '-m', 'gensim.models.lda_worker'],
    ['python', '-m', 'gensim.models.lda_dispatcher'],
]

# NOTE(review): the processes are launched back to back, in the original
# order; add a short wait if the worker needs the name server to be
# listening first — confirm.
background_processes = [Popen(command) for command in background_commands]
# Imports and housekeeping
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from gensim import corpora, models, similarities
import numpy as np
import matplotlib.pyplot as plt
from subprocess import call
# Initialise distributed workers
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
pattern=re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
kl = []
num = range(0,25000,10)
for i in num:
lda = models.ldamodel.LdaModel(corpus=my_corpus,
id2word=dictionary,num_topics=i,distributed=True)
#Topic-word matrix
m1 = lda.expElogbeta
U,s,V = np.linalg.svd(p)
cm1 = s
#Document-topic matrix