Created
February 11, 2010 05:55
-
-
Save banyan/301270 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #-*- coding: utf-8 -*- | |
| # vim:fileencoding=utf_8 | |
| import sys | |
| import re | |
| import time | |
| from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag | |
class LDC(object):
    """Normalise the tags of an exported bookmark feed and re-export it.

    The feed is read from ``./export_utf.htm`` and parsed into ``self.soup``.
    Tags live in ``dc:subject`` elements; bookmarks live in ``item`` elements.
    """

    def __init__(self):
        """Read ``./export_utf.htm`` as UTF-8 and parse it into ``self.soup``."""
        source = open('./export_utf.htm', 'r')
        try:
            # Undecodable bytes are replaced rather than aborting the import.
            xml = source.read().decode("utf-8", "replace")
        finally:
            source.close()
        self.soup = BeautifulStoneSoup(xml)

    def _sort_tags(original_func):
        """Decorator that feeds ``original_func`` the case-insensitive tag groups.

        The decorated attribute is called as a method (``ldc.method()``), but
        ``original_func`` receives one dict instead of ``self``: each key is
        an upper-cased tag name and each value is a list of
        ``[spelling, count]`` pairs ordered from most to least frequent.
        """
        import functools

        @functools.wraps(original_func)
        def func(*args):
            counts = {}
            for tag in args[0].soup.findAll('dc:subject'):
                name = tag.string
                if name is None:
                    # No text to count (and ``None.upper()`` would fail below).
                    continue
                counts[name] = counts.get(name, 0) + 1

            similar = {}
            ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
            for name, count in ranked:
                similar.setdefault(name.upper(), []).append([name, count])
            return original_func(similar)
        return func

    @_sort_tags
    def check_similar_tags(similar):
        """Print ``spelling count`` for every tag that has case variants."""
        for variants in similar.values():
            if len(variants) > 1:
                for name, count in variants:
                    print("%s %s" % (name, count))

    @_sort_tags
    def get_larger_tags(similar):
        """Return the most frequent spelling of each tag that has case variants."""
        # Each group is already ordered by descending count, so the first
        # entry is the dominant spelling.
        return [variants[0][0]
                for variants in similar.values()
                if len(variants) > 1]

    def convert_tags(self, tags):
        """Rewrite ``dc:subject`` elements to the canonical spellings in ``tags``.

        A tag whose text is already listed in ``tags`` is left alone. A tag
        that matches an entry only case-insensitively is replaced by a new
        ``dc:subject`` element holding that entry; when several entries share
        an upper-cased form the first one wins. Tags with no match are left
        untouched. Afterwards every ``dc:subject`` element is printed with
        its index.
        """
        # Build the case-insensitive lookup once instead of once per tag.
        canonical = {}
        for name in tags:
            canonical.setdefault(name.upper(), name)

        for tag in self.soup.findAll('dc:subject'):
            current = tag.renderContents()
            if current in tags:
                continue
            replacement = canonical.get(current.upper())
            if replacement is None:
                continue
            new_tag = Tag(self.soup, "dc:subject")
            new_tag.insert(0, replacement)
            tag.replaceWith(new_tag)

        for i, tag in enumerate(self.soup.findAll('dc:subject')):
            print(i)
            print(tag)

    def format2Delicious(self):
        """Write the bookmarks to ``./format2Delicious.htm`` as an HTML list.

        The output document holds META, TITLE, H1 and DL elements; each
        ``item`` of the feed becomes a DT containing an A element with HREF,
        ADD_DATE, PRIVATE and TAGS attributes. Each item is also printed
        while it is processed.
        """
        new_soup = BeautifulSoup()
        meta = Tag(new_soup, "META", [("HTTP-EQUIV", "Content-Type"),
                                      ("CONTENT", "text/html; charset=UTF-8")])
        new_soup.insert(0, meta)
        title = Tag(new_soup, "TITLE")
        new_soup.insert(1, title)
        h1 = Tag(new_soup, "H1")
        new_soup.insert(2, h1)
        dl = Tag(new_soup, "DL")
        new_soup.insert(3, dl)
        p = Tag(new_soup, "P")
        dl.insert(0, p)

        for i, item in enumerate(self.soup.findAll('item')):
            tag_names = [tag.renderContents()
                         for tag in item.findAll("dc:subject")]
            print(i)
            print(item)
            print(tag_names)

            tags = ','.join(tag_names)
            descriptions = "\n".join(
                [description.string
                 for description in item.findAll("description")
                 if description.string is not None])
            # The feed's dates carry a literal "+0900" suffix; the parsed
            # time is handed to mktime() as local time.
            add_date = int(time.mktime(time.strptime(
                item.pubdate.string, '%a, %d %b %Y %H:%M:%S +0900')))

            dt = Tag(new_soup, "DT")
            a = Tag(new_soup, "A", [
                ("HREF", str(item.link.string)),
                ("ADD_DATE", str(add_date)),
                ("PRIVATE", str(0)),
                ("TAGS", unicode(tags, 'utf_8'))
            ])
            if item.title.string is not None:
                a.insert(0, item.title.string)
            dt.insert(0, a)
            dl.insert(i, dt)
            if descriptions:
                dd = Tag(new_soup, "DD", [
                    ("DESCRIPTION", descriptions)
                ])
                dt.insert(1, dd)

        output = open('./format2Delicious.htm', 'w')
        try:
            output.write(new_soup.prettify())
        finally:
            output.close()

    def diff(self, a, b):
        """Return the items of ``b`` that match no entry of ``a``.

        An item of ``b`` counts as matched when some entry of ``a`` starts
        with it, compared case-insensitively. Entries of ``a`` that are not
        strings are skipped. The order of ``b`` is preserved.
        """
        # NOTE(review): this is a prefix match, so "Win" is removed when ``a``
        # holds "Windows" -- confirm that an exact comparison was not intended.
        matched = set()
        for item in b:
            pattern = re.compile(re.escape(item), re.IGNORECASE)
            for candidate in a:
                try:
                    hit = pattern.match(candidate)
                except TypeError:
                    # Not a string: ignore this entry, keep checking the rest.
                    continue
                if hit:
                    matched.add(item)
                    break
        return [item for item in b if item not in matched]

    def dump(self):
        """Print every ``item`` with its index and its list of tag texts."""
        for i, item in enumerate(self.soup.findAll('item')):
            print(i)
            print(item)
            print([tag.renderContents() for tag in item.findAll("dc:subject")])
| ldc = LDC() | |
| ldc.check_similar_tags() | |
| forced_tags = ["SNS", "DreamHost", "Eclipse", "CrowdSourcing", "SecondLife", "Subversion", "CNET", "MeCab", "YahooPipes", "Illustrator", "WordPress", "Windows", "ITpro", "RMagick", "Shibuya.pm", "Paypal", "Blog | |
| forced_tags = [x.encode('utf-8') for x in forced_tags] | |
| larger_tags = ldc.diff(forced_tags , ldc.get_larger_tags()) | |
| ldc.convert_tags(larger_tags + forced_tags) | |
| ldc.convert_tags(larger_tags + forced_tags) | |
| ldc.dump() | |
| #ldc.format2Delicious() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment