# Source: GitHub gist chengluyu/745c3be37a40799d2928382a8aeb9e6d, created December 16, 2016 10:44.
# coding: utf-8
import json
import re
from time import sleep

import MySQLdb
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq
from requests import request
# Root URL of the scraped site; site-relative paths are appended to it.
BASE = 'http://www.tmvan.com'
class SimpleIterator:
    """Cursor over a sequence of strings that hands each one back stripped.

    ``next()`` signals exhaustion by returning ``None`` rather than raising,
    so the object is also made iterable through ``__iter__`` for use in
    ``for`` loops and ``list()``.
    """

    def __init__(self, elems):
        """Start the cursor at the first element.

        Args:
            elems: An indexable, sized sequence of strings.
        """
        self.elems = elems
        self.index = 0

    def next(self):
        """Return the next element with surrounding whitespace removed.

        Returns:
            The stripped string, or ``None`` once every element was consumed.
        """
        if self.index >= len(self.elems):
            return None
        save = self.elems[self.index]
        self.index += 1
        return save.strip()

    def __iter__(self):
        """Yield the remaining elements; shares its position with ``next()``."""
        while True:
            item = self.next()
            if item is None:
                return
            yield item
class ItemListBuilder:
    """Incrementally assemble the nested dict describing one scraped page.

    The finished structure has three levels: the page itself (``id``,
    ``title``), its groups (opened by :meth:`add`), and the items inside each
    group (opened by :meth:`text`).  Comments and references that arrive
    before any group exists are attached to the page level instead.
    """

    def __init__(self, id, title):
        """Create an empty builder.

        Args:
            id: Identifier stored on the page-level dict.
            title: Page title; also used as the prefix of every group id.
        """
        self.result = []      # finished groups, in arrival order
        self.current = None   # group being filled; None before the first one
        self.item = None      # item being filled inside ``current``
        self.id = id
        self.title = title
        self.comment_for_parent = None
        self.reference_for_parent = None

    def end_up_item(self):
        """Move the pending item, if any, into the current group's children."""
        if self.item is not None:
            self.current['children'].append(self.item)
            self.item = None

    def end_up_child(self):
        """Move the current group, with its pending item, into the result."""
        if self.current is not None:
            self.end_up_item()
            self.result.append(self.current)
            # Reset so that flushing twice (e.g. calling get() again) cannot
            # append the same group a second time.
            self.current = None

    def add(self, id):
        """Close the current group and open a new one.

        Args:
            id: Suffix appended to the page title to form the group id.
        """
        self.end_up_child()
        self.current = {'id': self.title + id, 'children': []}

    def text(self, text):
        """Start a new item whose ``content`` is *text* split on commas.

        If no group is open yet, an anonymous group (without an ``id`` key)
        is opened to hold the item.
        """
        if self.current is None:
            self.current = {'children': []}
        self.end_up_item()
        self.item = {'content': text.split(',')}

    def reference(self, text):
        """Attach a reference to the pending item, or to the page.

        Before any group exists the raw *text* is stored for the page level.
        Otherwise *text* is split on commas, each part is stripped, and the
        list is stored on the pending item.
        """
        if self.current is None:
            self.reference_for_parent = text
            return
        if self.item is None:
            # A reference can arrive right after add() with no text in
            # between; open an item so there is something to attach it to.
            self.item = {}
        # A real list (not a lazy map object) keeps the result
        # JSON-serialisable on Python 3 as well as Python 2.
        self.item['reference'] = [part.strip() for part in text.split(',')]

    def comment(self, text):
        """Attach *text* as a comment to the current group, or to the page.

        Repeated comments on the same group are concatenated with no
        separator.  A comment arriving before any group exists replaces the
        page-level comment.
        """
        if self.current is None:
            self.comment_for_parent = text
            return
        if self.current.get('comment') is None:
            self.current['comment'] = text
        else:
            self.current['comment'] += text

    def get(self):
        """Flush pending state and return the page-level dict.

        Returns:
            A dict with ``id``, ``title`` and ``children`` (the groups), plus
            ``comment`` / ``reference`` when page-level values were recorded.
        """
        self.end_up_item()
        self.end_up_child()
        obj = {
            'id': self.id,
            'title': self.title,
            'children': self.result,
        }
        if self.comment_for_parent is not None:
            obj['comment'] = self.comment_for_parent
        if self.reference_for_parent is not None:
            obj['reference'] = self.reference_for_parent
        return obj
def download(url, timeout=30):
    """Download the content of a URL as text.

    Args:
        url: Absolute URL to fetch with GET.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The decoded response body with every non-breaking space (U+00A0)
        replaced by a regular space.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return request('GET', url, timeout=timeout).text.replace(u'\xa0', u' ')
def fetch_comment(index, timeout=30):
    """POST a class number to the site's goodsclass.ashx endpoint.

    The request body is a hand-built multipart/form-data document carrying a
    single ``goodsclassid`` field; its boundary must match the one announced
    in the ``content-type`` header.

    Args:
        index: Integer class number sent as ``goodsclassid``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    URL = BASE + '/Script/goodsclass.ashx'
    payload = (
        "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\n"
        "Content-Disposition: form-data; name=\"goodsclassid\"\r\n"
        "\r\n"
        "%d\r\n"
        "------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    ) % index
    headers2 = {
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'accept': "text/html, */*; q=0.01",
        'cache-control': "no-cache",
        # NOTE(review): presumably left over from a Postman-generated
        # snippet; kept unchanged because the server's reliance on it is
        # unknown.
        'postman-token': "f0d491b7-14e7-728c-a17b-a212ce38272c",
    }
    return request("POST", URL, data=payload, headers=headers2, timeout=timeout).text
def process_text_node(text):
    """Classify one text fragment taken from a page body.

    The fragment is stripped and then tested against these shapes, in order:

    * ``heading`` - a parenthesised label of one to three characters followed
      by the heading text; ASCII and full-width parentheses are both accepted.
    * ``ref``     - text starting with the reference mark ``※``.
    * ``comment`` - text starting with ``注`` and a colon (ASCII or full-width).
    * ``list``    - text starting with digits and a dot; the whole text is kept.
    * ``text``    - anything else.

    Args:
        text: Raw text of the node.

    Returns:
        ``None`` for blank input, otherwise a dict with ``type`` and ``text``
        keys (plus ``name``, the label without parentheses, for headings).
    """
    stripped = text.strip()
    if stripped == '':
        return None

    # The label must sit inside literal parentheses.  Written as a bare
    # capture group, the pattern matched every non-empty string and made all
    # branches below unreachable.
    m = re.match(u'^[(（](.{1,3})[)）](.*)$', stripped)
    if m is not None:
        return {
            'type': 'heading',
            'name': m.group(1),
            'text': m.group(2),
        }

    m = re.match(u'^※(.*)$', stripped)
    if m is not None:
        return {
            'type': 'ref',
            'text': m.group(1),
        }

    m = re.match(u'^注[:：](.*)$', stripped)
    if m is not None:
        return {
            'type': 'comment',
            'text': m.group(1),
        }

    # Numbered entries keep their number, so the full text is returned.
    if re.match(r'^\d+\.(.*)$', stripped) is not None:
        return {
            'type': 'list',
            'text': stripped,
        }

    return {
        'type': 'text',
        'text': stripped,
    }
def page(url, id, title):
    """Scrape a single detail page into a nested dict.

    The direct text children of the ``.content1`` element are classified with
    ``process_text_node`` and fed to an ``ItemListBuilder``.  The text of the
    page's note box (the ``div`` with the inline 890px style) is then appended
    to the page-level comment.

    Args:
        url: Site-relative URL of the page; it is appended to ``BASE``.
        id: Identifier stored on the returned dict.
        title: Page title, stored on the result and used as group-id prefix.

    Returns:
        The dict produced by ``ItemListBuilder.get()``, possibly with an
        added or extended ``comment`` key.

    Raises:
        Exception: If ``process_text_node`` returns an unknown ``type``.
    """
    q = pq(download(BASE + url))

    # Accept both text types: on Python 2, lxml returns non-ASCII text as
    # ``unicode``, which a bare ``str`` check would silently discard.  On
    # Python 3 the tuple is simply (str, str).
    text_types = (str, type(u''))

    builder = ItemListBuilder(id, title)
    for node in q('.content1').contents():
        if not isinstance(node, text_types):
            continue  # element children carry no text of their own here
        obj = process_text_node(node)
        if obj is None:
            continue  # whitespace-only fragment
        kind = obj['type']
        if kind == 'heading':
            builder.add(obj['name'])
            builder.text(obj['text'])
        elif kind == 'text':
            builder.text(obj['text'])
        elif kind == 'ref':
            builder.reference(obj['text'])
        elif kind in ('list', 'comment'):
            # Numbered list entries are recorded as comments too.
            builder.comment(obj['text'])
        else:
            raise Exception('This will never happen')

    result = builder.get()
    comment = q('div[style="float:left; width:890px;"]').text().strip()
    if comment:
        if result.get('comment') is not None:
            result['comment'] += '\n' + comment
        else:
            result['comment'] = comment
    return result
def content(limit=1):
    """Scrape the class index page and the detail pages it links to.

    Each ``div.one`` on the index page is a section: its ``div.left`` holds
    the title and summary, its ``div.right`` holds links to detail pages.
    Progress is printed to stdout while scraping.

    Args:
        limit: Maximum number of sections to process.  The default of 1
            processes only the first section; pass ``None`` to process all
            of them.

    Returns:
        A list of section dicts with the keys ``id``, ``title``, ``content``,
        ``comment`` and ``children`` (one ``page()`` result per link).
    """
    tree = bs(download(BASE + '/tool/goodsclass.aspx'), 'html.parser')
    sections = []
    for index, section in enumerate(tree.find_all('div', {'class': 'one'})):
        if limit is not None and index >= limit:
            break
        section_no = index + 1

        # Metadata of this section.
        left = section.find('div', {'class': 'left'})
        title = left.find('div', {'class': 'title'}).get_text().strip()
        summary = left.find('div', {'class': 'content1'}).get_text().strip()
        print(u'----------------- {} -----------------'.format(title))

        # One detail page per link in the right-hand column.
        right = section.find('div', {'class': 'right'})
        parts = []
        for sub_index, link in enumerate(right.find_all('a', {'target': '_blank'})):
            sub_no = sub_index + 1
            sub_title = link.get_text()
            print(u'{} {}'.format(sub_no, sub_title))
            parts.append(page(
                link.get('href'),
                id='{:02d}{:02d}'.format(section_no, sub_no),
                title=sub_title,
            ))
            sleep(0.01)  # brief pause between page requests

        sections.append({
            'id': '{:02d}'.format(section_no),
            'title': title,
            'content': summary,
            'comment': fetch_comment(section_no),
            'children': parts,
        })
    return sections
def db_dump(node, parent_id=''):
    """Print a scraped tree depth-first, one line per node.

    Each line shows the node's id, title and number of children.  Nodes
    produced by ``ItemListBuilder`` may lack ``title``, ``id`` or
    ``children``; a missing id or title is printed as ``-`` and missing
    children count as zero, instead of raising ``KeyError``.

    Nothing is written to a database here; output goes to stdout only.

    Args:
        node: Dict that may carry ``id``, ``title`` and ``children`` keys.
        parent_id: Id of the enclosing node.  It is passed down the recursion
            but not otherwise used.
    """
    children = node.get('children', [])
    print(u'{} {} {}'.format(node.get('id', u'-'), node.get('title', u'-'), len(children)))
    for child in children:
        db_dump(child, node.get('id', ''))
def db_dump_all(elems):
    """Call ``db_dump`` on every top-level node of *elems*, in order."""
    for node in elems:
        db_dump(node)
# for debug use only
def inspect(obj):
    """Print *obj* as JSON with a 2-space indent and non-ASCII escaped.

    The call form ``print(...)`` with a single argument behaves the same on
    Python 2 and Python 3.
    """
    print(json.dumps(obj, indent=2, ensure_ascii=True))
# Run the scrape only when executed as a script, so importing this module
# (e.g. to reuse the parsers) does not trigger network requests.
if __name__ == '__main__':
    db_dump_all(content())
    # Debug a single page instead (page() needs an id and a title as well):
    # inspect(page('/tool/goodsclassitem.aspx?goodsclassid=47', id='0101', title='debug'))
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")