Skip to content

Instantly share code, notes, and snippets.

@selenamarie
Created March 20, 2011 05:40
Show Gist options
  • Save selenamarie/878123 to your computer and use it in GitHub Desktop.
Save selenamarie/878123 to your computer and use it in GitHub Desktop.
and is formatted like crap. also, other things that suck.
#!/usr/bin/python -u
"""
Quickie demo of FTS - looking for instances of monkey and friend in the IMDB titles list
Have INSERT and COPY subroutines. Timing only for FTS portion of demo.
"""
import re
import psycopg2
from StringIO import StringIO
from timeit import Timer
"""
Connect to the database
"""
dsn = "" # quick hack to use same database/username as person running script
conn = psycopg2.connect(dsn)
conn.set_client_encoding('LATIN1')
conn.commit()
c = conn.cursor()
"""
Grab file and return a dict of movies
movie list file from: ftp://ftp.sunet.se/pub/tv+movies/imdb/movies.list.gz
"""
def load_movies():
print "Loading movies"
movies = open('movies.list')
trips = 1
potential = {}
for line in movies:
trips = trips + 1
if trips % 1000 == 0:
print "#",
row = ()
scrubbed = ''
try:
scrubbed = re.sub('\t+', '\t', line)
scrubbed = re.sub('( \(\d{4}\))', '', scrubbed)
row = re.search("(.*)\t+(.*)", scrubbed)
potential[trips] = row.groups(0)
except:
pass
print
return potential
def cleanup():
print "Cleaning up imdb_movies table"
c.execute("DROP TABLE IF EXISTS imdb_movies");
def create_table():
print "Creating imdb_movies table"
c.execute("CREATE TABLE imdb_movies (title TEXT, release_year TEXT)")
conn.commit()
def import_data_copy(movielist):
print "Importing imdb movie data into database"
string = ''
for num,movie in movielist.items():
title = re.sub('\t', ' ', movie[0])
year = re.sub('\t', ' ', movie[1])
string = string + title + "\t" + year + "\n"
#c.execute("INSERT INTO imdb_movies (title, release_year) VALUES(%s, %s)", (movie[0], movie[1]))
if num % 1000 == 0:
f = StringIO(string)
c.copy_from(f, 'imdb_movies', columns=('title', 'release_year'))
conn.commit()
string = ''
print "#",
f = StringIO(string)
c.copy_from(f, 'imdb_movies', columns=('title', 'release_year'))
conn.commit()
print
def import_data_insert(movielist):
print "Importing imdb movie data into database"
for num,movie in movielist.items():
c.execute("INSERT INTO imdb_movies (title, release_year) VALUES(%s, %s)", (movie[0], movie[1]))
if num % 1000 == 0:
conn.commit()
print "#",
print
"""
Demo of Full Text Search
"""
def count_monkey():
c.execute("SELECT count(title) as numtitles from imdb_movies WHERE to_tsvector('english', title) @@ to_tsquery('english', 'monkey & friend')")
print c.fetchone()[0]
def create_gin_index():
c.execute("CREATE INDEX title_idx ON imdb_movies USING gin(to_tsvector('english', title))");
if __name__=='__main__':
dsn=""
movielist = load_movies()
cleanup()
create_table()
import_data_copy(movielist)
t = Timer("count_monkey()", "from __main__ import count_monkey")
print "Running search without index on titles"
try:
print "Search took: %.4f seconds" % t.timeit(number=1)
except:
t.print_exc()
print "Creating GIN index"
x = Timer("create_gin_index()", "from __main__ import create_gin_index")
try:
print "Create took: %.4f seconds" % x.timeit(number=1)
except:
t.print_exc()
print "Running search with GIN index on titles"
try:
print "Search took: %.4f seconds" % t.timeit(number=1)
except:
t.print_exc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment