Skip to content

Instantly share code, notes, and snippets.

@mazieres
Created December 8, 2014 11:34
Show Gist options
  • Save mazieres/572449eab5cb69bf673d to your computer and use it in GitHub Desktop.
Save mazieres/572449eab5cb69bf673d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# by @mazieres for cortext.fr
import sqlite3
import sys
import os
from collections import defaultdict
# PATH to the DB downloaded from cortext
PATH_TO_DB = 'machine_learning.db'
# PATH to the original WoS file uploaded to cortext
PATH_TO_TXT = 'machine_learning.txt'

# Connect to DB
c = sqlite3.connect(PATH_TO_DB)
cursor = c.cursor()

# List tables and keep only the "projection_*" ones
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
all_tables = cursor.fetchall()
proj_tables = [x[0] for x in all_tables if x[0].startswith("projection_")]
if not proj_tables:
    c.close()
    sys.exit("No projection_* table found in {0}".format(PATH_TO_DB))

# Menu of choices: position in proj_tables -> table name.
# Built unconditionally so the lookup below also works when there is a
# single projection and no question is asked.
ask = dict(enumerate(proj_tables))

# If several projections, ask for which to use
if len(proj_tables) > 1:
    try:
        read_answer = raw_input  # Python 2
    except NameError:
        read_answer = input  # Python 3
    ans = read_answer(ask)
else:
    ans = 0

try:
    proj_table = ask[int(ans)]
except (ValueError, KeyError):
    c.close()
    sys.exit("Invalid projection choice: {0!r}".format(ans))

# Get list of references IDs per clusters.
# A table name cannot be passed as a bound SQL parameter, so it is inserted
# as a double-quoted identifier with embedded quotes doubled.
cursor.execute('SELECT * FROM "{0}";'.format(proj_table.replace('"', '""')))
all_in_clusters = cursor.fetchall()
# Everything needed is in memory now; release the database.
c.close()

# cluster id -> list of paper ids belonging to that cluster.
# Each row is unpacked into exactly five fields; only the second (paper id)
# and the fifth (cluster id) are used.
res_index = defaultdict(list)
for entry in all_in_clusters:
    _, paper_id, _, _, cluster_id = entry
    res_index[cluster_id].append(paper_id)
# Load reference original content: the whole file is read and cut into
# records on blank lines ("\n\n").
# The `with` block closes the file on every path. A separate `finally` that
# closes the handle would itself fail with NameError when open() raises,
# because the handle name is never bound in that case.
try:
    with open(PATH_TO_TXT) as isi_file:
        isi_refs = isi_file.read().split("\n\n")
except (IOError, OSError) as e:
    # Report the problem and stop with a non-zero exit status.
    print(e)
    sys.exit(1)
# Fill cluster with content: cluster id -> list of raw reference records.
res_content = defaultdict(list)
for k, v in res_index.items():
    for ref_id in v:
        # The id is used as a 1-based position in isi_refs.
        # NOTE(review): an id of 0 would silently select the last record
        # (index -1) -- confirm that ids always start at 1.
        res_content[k].append(isi_refs[ref_id - 1])

# Save each cluster content to file
out_dir = 'clusters_dbs'
try:
    os.mkdir(out_dir)
except OSError:
    # Re-running the script is fine when the directory is already there;
    # any other failure (permissions, a plain file in the way) is re-raised.
    if not os.path.isdir(out_dir):
        raise

for k, v in res_content.items():
    # format() accepts integer as well as string cluster ids, which plain
    # string concatenation does not.
    out_path = os.path.join(out_dir, '{0}.txt'.format(k))
    # Text mode mirrors the text-mode read of the input file; binary mode
    # rejects str on Python 3.
    with open(out_path, 'w') as o:
        o.write('\n\n'.join(v))

print("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment