Created
March 20, 2011 05:40
-
-
Save selenamarie/878123 to your computer and use it in GitHub Desktop.
and is formatted like crap. also, other things that suck.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -u | |
""" | |
Quickie demo of FTS - looking for instances of monkey and friend in the IMDB titles list | |
Have INSERT and COPY subroutines. Timing only for FTS portion of demo. | |
""" | |
import re | |
import psycopg2 | |
from StringIO import StringIO | |
from timeit import Timer | |
""" | |
Connect to the database | |
""" | |
dsn = "" # quick hack to use same database/username as person running script | |
conn = psycopg2.connect(dsn) | |
conn.set_client_encoding('LATIN1') | |
conn.commit() | |
c = conn.cursor() | |
""" | |
Grab file and return a dict of movies | |
movie list file from: ftp://ftp.sunet.se/pub/tv+movies/imdb/movies.list.gz | |
""" | |
def load_movies(): | |
print "Loading movies" | |
movies = open('movies.list') | |
trips = 1 | |
potential = {} | |
for line in movies: | |
trips = trips + 1 | |
if trips % 1000 == 0: | |
print "#", | |
row = () | |
scrubbed = '' | |
try: | |
scrubbed = re.sub('\t+', '\t', line) | |
scrubbed = re.sub('( \(\d{4}\))', '', scrubbed) | |
row = re.search("(.*)\t+(.*)", scrubbed) | |
potential[trips] = row.groups(0) | |
except: | |
pass | |
return potential | |
def cleanup(): | |
print "Cleaning up imdb_movies table" | |
c.execute("DROP TABLE IF EXISTS imdb_movies"); | |
def create_table(): | |
print "Creating imdb_movies table" | |
c.execute("CREATE TABLE imdb_movies (title TEXT, release_year TEXT)") | |
conn.commit() | |
def import_data_copy(movielist): | |
print "Importing imdb movie data into database" | |
string = '' | |
for num,movie in movielist.items(): | |
title = re.sub('\t', ' ', movie[0]) | |
year = re.sub('\t', ' ', movie[1]) | |
string = string + title + "\t" + year + "\n" | |
#c.execute("INSERT INTO imdb_movies (title, release_year) VALUES(%s, %s)", (movie[0], movie[1])) | |
if num % 1000 == 0: | |
f = StringIO(string) | |
c.copy_from(f, 'imdb_movies', columns=('title', 'release_year')) | |
conn.commit() | |
string = '' | |
print "#", | |
f = StringIO(string) | |
c.copy_from(f, 'imdb_movies', columns=('title', 'release_year')) | |
conn.commit() | |
def import_data_insert(movielist): | |
print "Importing imdb movie data into database" | |
for num,movie in movielist.items(): | |
c.execute("INSERT INTO imdb_movies (title, release_year) VALUES(%s, %s)", (movie[0], movie[1])) | |
if num % 1000 == 0: | |
conn.commit() | |
print "#", | |
""" | |
Demo of Full Text Search | |
""" | |
def count_monkey(): | |
c.execute("SELECT count(title) as numtitles from imdb_movies WHERE to_tsvector('english', title) @@ to_tsquery('english', 'monkey & friend')") | |
print c.fetchone()[0] | |
def create_gin_index(): | |
c.execute("CREATE INDEX title_idx ON imdb_movies USING gin(to_tsvector('english', title))"); | |
if __name__=='__main__': | |
dsn="" | |
movielist = load_movies() | |
cleanup() | |
create_table() | |
import_data_copy(movielist) | |
t = Timer("count_monkey()", "from __main__ import count_monkey") | |
print "Running search without index on titles" | |
try: | |
print "Search took: %.4f seconds" % t.timeit(number=1) | |
except: | |
t.print_exc() | |
print "Creating GIN index" | |
x = Timer("create_gin_index()", "from __main__ import create_gin_index") | |
try: | |
print "Create took: %.4f seconds" % x.timeit(number=1) | |
except: | |
t.print_exc() | |
print "Running search with GIN index on titles" | |
try: | |
print "Search took: %.4f seconds" % t.timeit(number=1) | |
except: | |
t.print_exc() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment