Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created January 31, 2015 14:35
Show Gist options
  • Save cigrainger/435b4ec9b67ef7a96b65 to your computer and use it in GitHub Desktop.
Save cigrainger/435b4ec9b67ef7a96b65 to your computer and use it in GitHub Desktop.
import pandas as pd
import re, string
import numpy as np
from scipy.spatial.distance import pdist, squareform, euclidean
def firmmeans(data, year, key):
    """Return pairwise Euclidean distances between firms' mean feature vectors.

    Rows of ``data`` are restricted to firms that ``key`` lists for ``year``
    and to observations dated ``year`` or earlier. The remaining feature
    columns (everything except ``'year'``, ``'appln_id'`` and ``'bvdid'``)
    are averaged per firm, and the Euclidean distance between every pair of
    firm averages is computed.

    Args:
        data: DataFrame with a ``'bvdid'`` column, a ``'year'`` column whose
            values are convertible to ``int``, an ``'appln_id'`` column and
            the feature columns to average.
        year: Cut-off year; also selects which firms of ``key`` are used.
        key: DataFrame with ``'bvdid'`` and ``'year'`` columns.

    Returns:
        A square DataFrame indexed by ``'bvdid'`` with one column per firm
        (firms sorted by id), holding the distance between each pair. When
        no row survives the filters, an empty DataFrame whose index is named
        ``'bvdid'`` is returned.

    Raises:
        ValueError: If a remaining ``'year'`` value cannot be cast to int.
    """
    # Firms that the key table associates with the requested year.
    active_firms = key.loc[key['year'] == year, 'bvdid'].tolist()

    # Work on a copy so the dtype conversion never writes into (a view of)
    # the caller's frame.
    rows = data[data['bvdid'].isin(active_firms)].copy()
    rows['year'] = rows['year'].astype(int)
    rows = rows[rows['year'] <= year]

    # Without this guard squareform() would hand back a 1x1 matrix for zero
    # firms and the labelled DataFrame construction below would fail.
    if rows.empty:
        return pd.DataFrame(index=pd.Index([], name='bvdid'))

    feature_cols = [col for col in rows.columns
                    if col not in ('year', 'appln_id', 'bvdid')]
    # numeric_only keeps the averaging restricted to numeric columns instead
    # of raising on any non-numeric column that came along in a merge.
    means = rows.groupby('bvdid')[feature_cols].mean(numeric_only=True)
    firms = means.index.tolist()

    # DataFrame.as_matrix() no longer exists in current pandas; to_numpy()
    # is its replacement.
    condensed = pdist(means.to_numpy(dtype=float), metric='euclidean')
    distances = squareform(condensed)
    return pd.DataFrame(distances,
                        index=pd.Index(firms, name='bvdid'),
                        columns=firms)
def _rename_column_at(frame, position, new_name):
    """Rename the column at ``position`` of ``frame`` in place and return it."""
    # Build a fresh label list rather than editing ``columns.values``, which
    # would modify the Index's backing array behind pandas' back.
    labels = list(frame.columns)
    labels[position] = new_name
    frame.columns = labels
    return frame


def main():
    """Build one firm-to-firm distance CSV per year found in ``bvd_small.csv``.

    Reads ``bvd_small.csv``, ``labelledpredictions.txt``,
    ``bvdid_patents_link.csv`` and ``appln_date.csv`` from the working
    directory, joins the patent rows with their year and firm id, and writes
    ``firmtechdist<year>.csv`` for every distinct year via ``firmmeans``.
    """
    # Strips every run of non-word characters and underscores from firm ids.
    pattern = re.compile(r'[\W_]+')

    # Copy the two-column selection so later column assignments do not
    # target a slice of the frame returned by read_csv.
    df = pd.read_csv('bvd_small.csv')[['bvd_id', 'year']].copy()
    patents = pd.read_csv('labelledpredictions.txt', header=None)
    link = pd.read_csv('bvdid_patents_link.csv')
    date = pd.read_csv('appln_date.csv')

    # Columns are identified by position, as the input files dictate.
    _rename_column_at(df, 0, 'bvdid')
    _rename_column_at(patents, 0, 'appln_id')
    _rename_column_at(link, 0, 'bvdid')

    link['bvdid'] = [pattern.sub('', x) for x in link['bvdid']]
    df['bvdid'] = [pattern.sub('', x) for x in df['bvdid']]

    # Keep only the last four characters of each date string; that column
    # position is then exposed as 'year'.
    # NOTE(review): assumes column 1 of appln_date.csv is 'appln_date' and
    # that the dates end in a four-digit year - confirm against the data.
    date['appln_date'] = [x[-4:] for x in date['appln_date']]
    _rename_column_at(date, 1, 'year')

    # Restrict the link table to firms present in the firm/year table.
    link = link[link['bvdid'].isin(df['bvdid'].unique())]

    patents = patents.dropna()
    patents['appln_id'] = patents['appln_id'].astype(int)
    patents = patents[patents['appln_id'].isin(link['appln_id'])]

    data = patents.merge(date, on='appln_id', how='left')
    data = data.merge(link, on='appln_id', how='left')

    for year in df['year'].unique().tolist():
        distances = firmmeans(data, year, df)
        # DataFrame.to_csv takes the destination as its first argument
        # (path_or_buf); it has no ``path`` keyword.
        distances.to_csv('firmtechdist' + str(year) + '.csv')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment