Created
May 7, 2012 05:54
-
-
Save vadimii/2626164 to your computer and use it in GitHub Desktop.
Extract domain structure of lomonosov-fund.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8
import urllib2 | |
import string | |
import os.path | |
from lxml.html import fromstring | |
from lxml import etree | |
from datetime import datetime | |
# Root URL of the encyclopedia. The front page lives here and a category
# page is addressed as base_url + ':' + <category id>.
base_url = 'http://www.lomonosov-fund.ru/enc/ru/encyclopedia'
# Path prefix (including the trailing slash) for locally cached pages.
local_cache = 'cache/'
def load_main():
    """Download the encyclopedia front page into the cache as ``main.html``."""
    load_to_cache(base_url, local_cache + 'main.html')
def load_category(cid):
    """Download the page of category *cid* into the cache as ``<cid>.html``.

    The page URL is the base URL followed by a colon and the category id.
    """
    load_to_cache(
        ':'.join((base_url, cid)),
        ''.join((local_cache, cid, '.html')),
    )
def load_to_cache(url, localfn):
    """Fetch *url* and store its body, re-encoded as UTF-8, in *localfn*.

    The response body is decoded as windows-1251 and written back as UTF-8
    bytes.  The directory part of *localfn* is created when it is missing.

    url     -- address of the page to download.
    localfn -- path of the file to (over)write.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read().decode('windows-1251')
    finally:
        # The response is not a context manager in Python 2; close it
        # explicitly so the connection is released even if decoding fails.
        response.close()

    cache_dir = os.path.dirname(localfn)
    if cache_dir and not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    # Binary mode: encoded bytes are written, so no newline translation
    # must be applied to them.
    with open(localfn, 'wb') as store:
        store.write(content.encode('utf-8'))
def parse_main():
    """Parse the cached front page into the top levels of the category tree.

    The front page is downloaded first when it is not yet in the cache.

    Returns a dict keyed by the ``(id, title, description)`` tuples of the
    three root categories.  Each value is a dict keyed by level-2 category
    tuples whose values are lists of level-3 category tuples (always with
    an empty description).

    Raises IndexError when the page has no ``#catalog`` element or fewer
    than three ``.level-1-panel`` elements.
    """
    localfn = local_cache + 'main.html'
    if not os.path.exists(localfn):
        # load_main() takes no arguments; the former call passed an
        # undefined name and raised NameError whenever the cache was cold.
        load_main()
    with open(localfn, 'r') as store:
        content = store.read().decode('utf-8')

    doc = fromstring(content)
    catalog = doc.cssselect('#catalog')[0]
    panels = catalog.cssselect('.level-1-panel')
    # Ids and titles of the root categories are hard-coded here; the panels
    # are matched to them purely by position.
    root_panels = {
        ('017', u'Наука', ''): panels[0],
        ('018', u'Искусство, культура и религия', ''): panels[1],
        ('019', u'Современная Россия', ''): panels[2],
    }

    def parse_panel(panel):
        """Return {level-2 tuple: [level-3 tuples]} for one root panel."""
        l2cats = {}
        for l2e in panel.cssselect('p > big > a'):
            # a -> big -> p: the paragraph wrapping the level-2 link.
            parent = l2e.getparent().getparent()
            # The category id is whatever follows the last ':' of the href.
            l2id = l2e.get('href').rsplit(':', 1)[-1]
            l2title = l2e.text.strip()
            l2desc = parent.text_content().strip()
            # Level-3 links are looked up in the element following the
            # paragraph.
            level3s = parent.getnext().cssselect('ul > li > a')
            l2k = (l2id, l2title, l2desc)
            l2cats[l2k] = []
            for l3e in level3s:
                l3title = l3e.text_content().strip()
                l3id = l3e.get('href').rsplit(':', 1)[-1]
                l2cats[l2k].append((l3id, l3title, ''))
        return l2cats

    rootcats = {}
    for key, panel in root_panels.iteritems():
        rootcats[key] = parse_panel(panel)
    return rootcats
def parse_content(cid):
    """Parse the cached page of category *cid*.

    The page is downloaded first when it is not yet in the cache.

    Returns a ``(subcats, relcats)`` pair of generators.  Each yields
    ``(id, title, description)`` tuples: *subcats* for the links of the
    section headed u'↓', *relcats* for the section headed u'←'.  The
    section headed u'↑' is not read.

    Raises IndexError when the page has no ``.category-binds`` element and
    KeyError when the u'↓' or u'←' section is missing.
    """
    def extract_categories(ul):
        """Yield one (id, title, description) tuple per ``li > a`` link."""
        for a in ul.cssselect('li > a'):
            href = a.get('href')
            # Named link_id so the enclosing *cid* parameter is not shadowed.
            link_id = href.rsplit(':', 1)[-1]
            # .text is None when the anchor starts with a child element.
            title = (a.text or '').strip()
            desc = (a.get('title') or '').strip()
            yield (link_id, title, desc)

    localfn = local_cache + cid + '.html'
    if not os.path.exists(localfn):
        load_category(cid)
    with open(localfn, 'r') as store:
        content = store.read().decode('utf-8')

    doc = fromstring(content)
    container = doc.cssselect('.category-binds')[0]
    # Map each section heading text to the element that follows the heading.
    sections = {}
    for heading in container.cssselect('article > h3'):
        sections[heading.text_content()] = heading.getnext()

    subcats = extract_categories(sections[u'↓'])
    relcats = extract_categories(sections[u'←'])
    return (subcats, relcats)
category_refs = {} | |
def process(elems, level, rootxml): | |
for el in elems: | |
print '-'*level, el[1].encode('utf-8'), '['+el[0]+']' | |
if el[0] not in category_refs: | |
subs, refs = parse_content(el[0]) | |
category_refs[el[0]] = [r[0] for r in refs] | |
subs = list(subs) | |
isparent = len(subs) > 0 | |
xmlel = append_category_xml(rootxml, isparent, *el) | |
process(subs, level+1, xmlel) | |
else: | |
append_category_xml_ref(rootxml, el[0]) | |
def append_category_xml(root, isparent, cid, name, description):
    """Append a ``cat`` element describing one category to *root*.

    The element gets an ``id`` attribute and a ``name`` child.  A ``desc``
    child is added only for a non-empty description that differs from the
    name.

    Returns a new ``subs`` child element when *isparent* is true, otherwise
    None.
    """
    cat = etree.SubElement(root, 'cat', {'id': cid})

    name_el = etree.SubElement(cat, 'name')
    name_el.text = name

    if len(description) > 0:
        if description != name:
            desc_el = etree.SubElement(cat, 'desc')
            desc_el.text = description

    if not isparent:
        return None
    return etree.SubElement(cat, 'subs')
def append_category_xml_ref(root, ref_id):
    """Append a ``ref`` element with ``id`` *ref_id* to *root*; return it."""
    attributes = {'id': ref_id}
    ref = etree.SubElement(root, 'ref', attributes)
    return ref
def append_category_refs(parent, refs):
    """Append a ``refs`` element to *parent* holding one ``ref`` per id.

    parent -- XML element that receives the ``refs`` child.
    refs   -- iterable of category ids.
    """
    container = etree.SubElement(parent, 'refs')
    for ref_id in refs:
        append_category_xml_ref(container, ref_id)
def parse(): | |
root = parse_main() | |
now = datetime.isoformat(datetime.utcnow())+'Z' | |
xmlroot = etree.Element('cats', { 'created': now }) | |
for l0, l1s in root.iteritems(): | |
print '-'*1, l0[1].encode('utf-8'), '['+l0[0]+']' | |
l0xml = append_category_xml(xmlroot, True, *l0) | |
for l1, l2s in l1s.iteritems(): | |
print '-'*2, l1[1].encode('utf-8'), '['+l1[0]+']' | |
l1xml = append_category_xml(l0xml, True, *l1) | |
process(l2s, 3, l1xml) | |
for k, v in category_refs.iteritems(): | |
if (len(v) > 0): | |
e = xmlroot.xpath('//cat[@id='+k+']')[0] | |
append_category_refs(e, v) | |
with open('lomonosov.xml', 'w') as f: | |
f.write(etree.tostring(xmlroot, encoding='utf-8', pretty_print=True)) | |
# Run the full extraction only when the file is executed as a script.
if __name__ == "__main__":
    parse()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment