Created
December 19, 2012 11:41
-
-
Save morrah/4336117 to your computer and use it in GitHub Desktop.
2ch.hk parser that exposes the parsed board output as Thread and Post classes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: UTF-8 -*- | |
import re, urllib, urllib2 | |
from cookielib import CookieJar, DefaultCookiePolicy | |
class WebPage:
    """A web page fetched over HTTP, with cookies kept in a shared jar.

    The page is not downloaded on construction; call ``download()`` to fetch
    it and ``get_source()`` to read the last fetched body.
    """

    # Browser-like request headers used when the caller supplies none.
    DEFAULT_HEADERS = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13",
        "Connection": "Keep-Alive", "Keep-Alive": "115"
    }
    # Defined on the class, so all WebPage instances (and subclasses that do
    # not override it) share one cookie jar.
    cj = CookieJar(DefaultCookiePolicy(rfc2965=True, strict_ns_domain=DefaultCookiePolicy.DomainStrict))
    _source = None
    _headers = None
    _url = None

    def __init__(self, url, headers=None):
        """Remember the target URL and request headers.

        url     -- address passed to urllib2.Request by download().
        headers -- dict of request headers; DEFAULT_HEADERS when None.
        """
        # Previously a caller-supplied dict was dropped and _headers stayed
        # None; honour it now.
        self._headers = self.DEFAULT_HEADERS if headers is None else headers
        self._url = url

    def clear_cookies(self):
        """Remove every cookie from the shared jar."""
        # CookieJar.clear() with no arguments removes all cookies, session
        # cookies included, so a separate clear_session_cookies() is not needed.
        self.cj.clear()

    def download(self):
        """Fetch the page and return its body, or None if the request failed.

        Cookies from the shared jar are sent with the request and cookies from
        the response are stored back into it. On any exception a message is
        printed and None is returned (best-effort behaviour kept from the
        original); the previously stored source is left untouched.
        """
        req = urllib2.Request(self._url, None, self._headers)
        self.cj.add_cookie_header(req)
        try:
            response = urllib2.urlopen(req)
            try:
                self.cj.extract_cookies(response, req)
                self._source = response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
        except Exception as err:
            print('[!] Exception: ' + str(err))
            return None
        return self._source

    def get_source(self):
        """Return the body stored by the last successful download(), or None."""
        return self._source

    def get_url(self):
        """Return the URL given at construction."""
        return self._url
class Post():
    """Value holder for one parsed post.

    All six fields are stored exactly as passed in and exposed through
    read-only accessor methods.
    """

    def __init__(self, id, type, name, posttime, subject, postmessage):
        # `id` and `type` shadow builtins, but the parameter names are part of
        # the constructor's interface and are therefore left alone.
        self._id, self._type, self._name = id, type, name
        self._posttime, self._subject = posttime, subject
        self._postmessage = postmessage

    # --- identification -------------------------------------------------

    def get_id(self):
        """Return the post id given at construction."""
        return self._id

    def get_type(self):
        """Return the post type given at construction."""
        return self._type

    # --- header fields --------------------------------------------------

    def get_name(self):
        """Return the poster name given at construction."""
        return self._name

    def get_posttime(self):
        """Return the post time given at construction."""
        return self._posttime

    def get_subject(self):
        """Return the subject given at construction."""
        return self._subject

    # --- body -----------------------------------------------------------

    def get_postmessage(self):
        """Return the message body given at construction."""
        return self._postmessage
class Thread():
    """A thread: its id, the offset where it was matched, and its posts.

    Posts are kept in insertion order in an internal list.
    """

    def __init__(self, thread_id, matchPos):
        # Start with no posts; they are attached later via add_post().
        self._posts = []
        self._matchPos = matchPos
        self._thread_id = thread_id

    def add_post(self, Post):
        """Append one post object to the end of this thread's post list."""
        # The parameter name shadows the Post class inside this method only;
        # it is kept because it is visible to keyword callers.
        self._posts.extend((Post,))

    def get_post(self, index):
        """Return the post at `index` (ordinary list indexing rules apply)."""
        posts = self._posts
        return posts[index]

    def get_post_all(self):
        """Return the internal list of posts (the list itself, not a copy)."""
        return self._posts

    def get_thread_id(self):
        """Return the thread id given at construction."""
        return self._thread_id

    def get_match_pos(self):
        """Return the match offset given at construction."""
        return self._matchPos

    def get_post_count(self):
        """Return how many posts have been added so far."""
        count = len(self._posts)
        return count
class BoardPage(WebPage):
    """A board or thread page that is downloaded and parsed on construction.

    After construction ``get_thread_all()`` returns a list of Thread objects,
    each filled with the Post objects found between that thread's start and
    the start of the next thread (or the end of the page).
    """

    # Marks the start of a thread and captures its numeric id.
    THREAD_PATTERN = r'<div id="thread_(?P<thread_id>\d*?)" class="thread">.*?'

    # Matches one post table and captures id, type, name, subject, time and
    # message body. The opening post appears to be repeated further down the
    # page in this same format, so no separate opening-post pattern is used.
    POST_PATTERN = (
        r'<table id="post_(?P<post_id>\d*?)" class="(?P<type>post)">'
        r'.*?<span class="name">(?P<post_name>.*?)</span>'
        r'.*?<span class="subject">(?P<post_subject>.*?)</span>'
        r'.*?<span class="posttime">(?P<post_posttime>.*?)</span>'
        r'.*?<blockquote id="m\d*?" class="postMessage">(?P<post_postmessage>.*?)</blockquote>'
    )

    _threads = []

    def __init__(self, url, headers=None):
        """Download `url` and parse it into threads and posts.

        url     -- page address.
        headers -- dict of request headers; DEFAULT_HEADERS when None.

        If the download fails, the page simply has no threads.
        """
        # Previously a caller-supplied dict was dropped; honour it now.
        self._headers = self.DEFAULT_HEADERS if headers is None else headers
        self._url = url
        # Per-instance list so a failed download never exposes the shared
        # class-level default.
        self._threads = []
        source = self.download()
        # download() returns None on failure; parsing None would raise.
        if source is not None:
            self._parse_source(source)

    def get_thread_all(self):
        """Return the list of parsed Thread objects."""
        return self._threads

    def _parse_source(self, source):
        """Populate self._threads from the page `source`."""
        threads = self._parse_threads(self.THREAD_PATTERN, source)
        self._threads = self._parse_posts(self.POST_PATTERN, threads, source)

    def _parse_threads(self, thread_pattern, source):
        """Return [Thread(thread_id, match_pos)] for each pattern match.

        thread_pattern -- regular expression with a `thread_id` group.
        source         -- page text to scan.
        """
        pattern = re.compile(thread_pattern)
        return [Thread(match.group('thread_id'), match.start())
                for match in pattern.finditer(source)]

    def _parse_posts(self, post_pattern, threads, source):
        """Attach the posts found in `source` to the matching threads.

        post_pattern -- regular expression with the groups post_id, type,
                        post_name, post_posttime, post_subject and
                        post_postmessage.
        threads      -- Thread objects in page order, as returned by
                        _parse_threads(); they are modified in place.
        source       -- page text to scan.

        Returns the same `threads` list.
        """
        pattern = re.compile(post_pattern, re.DOTALL)
        # Each thread's region ends where the next thread starts; the last
        # one runs to the end of the page.
        end_positions = [thread.get_match_pos() for thread in threads[1:]]
        end_positions.append(len(source))
        for thread, e_pos in zip(threads, end_positions):
            # Read the start offset from the argument, not from self._threads,
            # so the method works on whatever list it is given.
            s_pos = thread.get_match_pos()
            for match in pattern.finditer(source, s_pos, e_pos):
                thread.add_post(Post(match.group('post_id'),
                                     match.group('type'),
                                     match.group('post_name'),
                                     match.group('post_posttime'),
                                     match.group('post_subject'),
                                     match.group('post_postmessage')))
        return threads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: UTF-8 -*- | |
from huikach_parser import * | |
def _dump_board(url, path):
    """Download and parse `url`, then write a text summary to `path`.

    One header line is written per thread, followed by two lines per post
    (post header and message body). The output file is overwritten.
    """
    board = BoardPage(url)
    # `with` closes the file on exit. The original ended with
    # `dump_page.close` (no parentheses), which never called close().
    with open(path, 'w') as dump_page:
        for thrd in board.get_thread_all():
            dump_page.write( 'thread id%s with %s posts inside\n' % (thrd.get_thread_id(), thrd.get_post_count()) )
            for pst in thrd.get_post_all():
                dump_page.write( ' > post id%s by %s - %s %s\n' % (pst.get_id(), pst.get_name(), pst.get_subject(), pst.get_posttime()) )
                dump_page.write( ' > %s\n' % (pst.get_postmessage()) )


# Dump the first page of a board, then a single thread page.
_dump_board('http://2ch.hk/b', 'b.txt')
_dump_board('http://2ch.hk/pr/res/221414.html', '221414.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment