Created
June 25, 2016 14:03
-
-
Save gyu-don/65be8d273502c0ed7ce682f835e1680b to your computer and use it in GitHub Desktop.
wp_hops Part-1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import getpass | |
import mysql.connector | |
class WordNotFoundError(Exception):
    """Error carrying a word (page title) that could not be found.

    The offending word is kept on the ``word`` attribute and is used to
    build the human-readable message.
    """

    def __init__(self, word):
        self.word = word

    def __str__(self):
        # Plain string concatenation semantics: a non-string word is an error.
        pieces = (self.word, " was not found.")
        return "".join(pieces)
class PageIdNotFoundError(Exception):
    """Error carrying a page id for which no page row could be found.

    The offending id is kept on the ``pid`` attribute and is used to build
    the human-readable message.
    """

    def __init__(self, pid):
        self.pid = pid

    def __str__(self):
        # The id may be an integer (it is used as a SQL parameter for the
        # page_id column), so format it instead of concatenating, which
        # would raise TypeError for non-string values.
        return "{} was not found.".format(self.pid)
class LinkNotFoundError(Exception):
    """Error whose text is the caller-supplied message, stored on ``msg``."""

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # The message is returned verbatim, without any decoration.
        message = self.msg
        return message
def get_pageid(c, w):
    """Return the page id of the namespace-0 page titled ``w``.

    Args:
        c: cursor object providing ``execute`` and ``fetchone``.
        w: page title to look up.

    Returns:
        The first column of the first row returned by the query.

    Raises:
        WordNotFoundError: if the query returns no row.
    """
    query = "SELECT page_id FROM page WHERE page_namespace=0 AND page_title=%s"
    c.execute(query, (w,))
    row = c.fetchone()
    if row is not None:
        return row[0]
    raise WordNotFoundError(w)
def get_title(c, pid):
    """Return the title of the page whose id is ``pid``.

    Args:
        c: cursor object providing ``execute`` and ``fetchone``.
        pid: page id to look up.

    Returns:
        The first column of the matching row, decoded as UTF-8.

    Raises:
        PageIdNotFoundError: if the query returns no row.
    """
    c.execute("SELECT page_title FROM page WHERE page_id=%s", (pid,))
    row = c.fetchone()
    if row is not None:
        # The stored title is decoded here, so it must be a bytes-like value.
        raw_title = row[0]
        return raw_title.decode("utf-8")
    raise PageIdNotFoundError(pid)
def get_linkfrom(c, w):
    """Return the ids of namespace-0 pages that link to the page titled ``w``.

    Args:
        c: cursor object providing ``execute`` and ``fetchall``.
        w: title of the link target (namespace 0).

    Returns:
        A list with the first column of every returned row; an empty list
        when the query yields nothing.
    """
    sql = "SELECT pl_from FROM pagelinks WHERE pl_from_namespace=0 AND pl_namespace=0 AND pl_title=%s"
    c.execute(sql, (w,))
    rows = c.fetchall()
    if not rows:
        return []
    return [row[0] for row in rows]
def wp_hops(c, w_from, w_to):
    """Find a chain of page links leading from ``w_from`` to ``w_to``.

    The search runs breadth-first backwards from ``w_to``: for every title
    in the current frontier it fetches the pages linking to it, and stops
    as soon as the page ``w_from`` is among them. Because the frontier is
    expanded one level at a time, the first chain found uses the fewest
    links.

    Args:
        c: database cursor handed to get_pageid / get_linkfrom / get_title.
        w_from: title of the page the chain starts at.
        w_to: title of the page the chain ends at.

    Returns:
        A list of page titles beginning with ``w_from`` and ending with
        ``w_to``, each page linking to the next one.

    Raises:
        WordNotFoundError: if ``w_from`` or ``w_to`` is not a known page.
        LinkNotFoundError: if no chain of links connects the two pages.
    """
    # Both lookups raise WordNotFoundError for unknown titles.
    w_to_pid = get_pageid(c, w_to)
    target = get_pageid(c, w_from)

    title_list = [w_to]   # current search frontier (titles)
    links = {}            # title -> title it links to on the way to w_to
    # Seed the visited set with the destination so it is never re-queued
    # when some page on the frontier is itself linked from the destination.
    pids = {w_to_pid}
    n_link = 0  # for debug purpose.

    # An empty frontier means every reachable page has been explored;
    # looping on it forever would never produce a result.
    while title_list:
        next_title_list = []
        for title in title_list:
            print(n_link, title)
            linkfrom = get_linkfrom(c, title)
            if target in linkfrom:
                # Walk the recorded links forward until the destination.
                result = [w_from, title]
                t = title
                while t != w_to:
                    t = links[t]
                    result.append(t)
                return result
            for lf in linkfrom:
                if lf in pids:
                    continue
                # Mark as visited before the lookup so an id without a
                # page row is not queried again on every later occurrence.
                pids.add(lf)
                try:
                    t = get_title(c, lf)
                except PageIdNotFoundError:
                    # Link from a page id that has no page row; skip it.
                    continue
                links[t] = title
                next_title_list.append(t)
        title_list = next_title_list
        n_link += 1

    raise LinkNotFoundError(
        "No link path from {} to {} was found.".format(w_from, w_to)
    )
if __name__ == "__main__":
    # Usage: script USER [PASSWORD] WORD_FROM WORD_TO
    # When PASSWORD is omitted it is prompted for interactively, which
    # avoids exposing it on the command line.
    if len(sys.argv) == 4:
        user, w_from, w_to = sys.argv[1:]
        pw = getpass.getpass()
    elif len(sys.argv) == 5:
        user, pw, w_from, w_to = sys.argv[1:]
    else:
        # Without this branch the names below would be undefined and the
        # script would die with a NameError instead of a helpful message.
        sys.exit(
            "usage: {} USER [PASSWORD] WORD_FROM WORD_TO".format(sys.argv[0])
        )

    conn = mysql.connector.Connect(user=user, password=pw, db="jawiki", charset="utf8")
    try:
        c = conn.cursor()
        try:
            print(wp_hops(c, w_from, w_to))
        except (WordNotFoundError, LinkNotFoundError) as err:
            # Expected lookup failures: report the message, exit non-zero.
            sys.exit(str(err))
        finally:
            c.close()
    finally:
        # Always release the connection, even when the search fails.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment