Created
October 12, 2019 12:33
-
-
Save hannes/2cf7f96f4150fc88d38d0b6022f215d6 to your computer and use it in GitHub Desktop.
Create and query a local search engine for PirateBay torrents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sqlite3 # yay SQLite! | |
import base64 | |
import os | |
import sys | |
if sys.version_info<(3,6,0): | |
print("You need python 3.6 or later to run this script.") | |
exit(-1) | |
if (len(sys.argv) < 2): | |
print("Usage: torrent.py your search term") | |
print("Boolean operators (AND OR NOT) should work.") | |
exit(-1) | |
q = ' '.join(sys.argv[1:]).replace('\'', ' ') | |
fname = "torrent_dump_full.csv.gz" | |
if not os.path.isfile(fname): | |
print("Missing file 'torrent_dump_full.csv.gz'.") | |
print("Download from https://thepiratebay.org/static/dump/csv/torrent_dump_full.csv.gz") | |
exit(-1) | |
mtime = int(os.path.getmtime(fname)) | |
con = sqlite3.connect('torrents.db') | |
cur = con.cursor() | |
# check if the db has fts5 | |
available_pragmas = cur.execute('PRAGMA compile_options').fetchall() | |
if not ('ENABLE_FTS5',) in available_pragmas: | |
raise "no fts5 extension" | |
# check a bunch of conditions whether we need to recompute the index | |
table_cnt = cur.execute("SELECT COUNT(*) FROM sqlite_master \ | |
WHERE type='table' AND name IN('torrent_index', 'last_updated')").fetchone()[0] | |
recompute = False | |
if (table_cnt == 2): | |
old_mtime = cur.execute("SELECT mtime FROM last_updated").fetchone()[0] | |
if (old_mtime < mtime): | |
recompute = True | |
else: | |
recompute = True | |
if recompute: | |
print("Re-creating index, please stand by...") | |
import pandas | |
df = pandas.read_csv(fname, sep=";", header=0, quotechar='"', escapechar="\\", | |
names=['added', 'hash', 'name', 'size'], usecols=[1, 2, 3]) | |
cur.execute("BEGIN") | |
cur.execute("DROP TABLE IF EXISTS torrent_index") | |
cur.execute("CREATE VIRTUAL TABLE torrent_index USING FTS5(hash, name, size)") | |
# hack alert, use to_sql to directly add to fulltext index | |
df.to_sql("torrent_index", con, index=False, if_exists='append') | |
cur.execute("DROP TABLE IF EXISTS last_updated") | |
cur.execute("CREATE TABLE last_updated (mtime INTEGER)") | |
cur.execute("INSERT INTO last_updated VALUES (%d)" % mtime) | |
cur.execute("COMMIT") # woo ho | |
def sizeof_fmt(num, suffix='B'): | |
for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: | |
if abs(num) < 1024.0: | |
return "%3.1f%s%s" % (num, unit, suffix) | |
num /= 1024.0 | |
return "%.1f%s%s" % (num, 'Yi', suffix) | |
# this is the magic, a query that matches the cmdline params against the fulltext index | |
# yes this should be a prepared statement, but thats not supported | |
cur.execute("SELECT hash, name, size FROM torrent_index \ | |
WHERE torrent_index MATCH '\"name\" : %s' ORDER BY rank LIMIT 100" % q) | |
for row in cur: | |
print("%-64s\t%.10s\tmagnet:?xt=urn:btih:%.40s" % | |
(row[1], sizeof_fmt(row[2]).rjust(10), base64.b64decode(row[0]).hex())) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment