Skip to content

Instantly share code, notes, and snippets.

@banyan
Created February 11, 2010 05:55
Show Gist options
  • Select an option

  • Save banyan/301270 to your computer and use it in GitHub Desktop.

Select an option

Save banyan/301270 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
# vim:fileencoding=utf_8
import sys
import re
import time
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
class LDC(object):
    """Normalise the tags of a bookmark export and convert it for Delicious.

    The export file is parsed with ``BeautifulStoneSoup`` into ``self.soup``.
    Bookmarks are the ``item`` elements of that document and their tags are
    the ``dc:subject`` elements inside them.

    All output uses ``print(...)`` with a single argument, which produces the
    same text under the Python 2 print statement this script was written for.
    """

    def __init__(self, path='./export_utf.htm'):
        """Parse the export file at *path* into ``self.soup``.

        The content is decoded as UTF-8; undecodable bytes are replaced
        instead of raising.
        """
        # ``with`` closes the handle even if reading or decoding fails.
        with open(path, 'r') as export:
            xml = export.read().decode("utf-8", "replace")
        self.soup = BeautifulStoneSoup(xml)

    def _sort_tags(original_func):
        """Decorate a function so that it receives the grouped tag spellings.

        The decorated name becomes a method that is called on an instance
        without arguments. It counts every ``dc:subject`` string found in
        ``self.soup``, groups the spellings by their upper-cased form and
        returns ``original_func(similar)``. ``similar`` maps the upper-cased
        tag to a list of ``[spelling, count]`` pairs ordered by descending
        count. The instance itself is not forwarded to ``original_func``.
        """
        import functools  # local import: only this decorator needs it

        @functools.wraps(original_func)
        def func(self, *_ignored):
            # Extra positional arguments were always accepted and ignored.
            counts = {}
            for tag in self.soup.findAll('dc:subject'):
                name = tag.string
                if name is None:
                    # ``.string`` can be None (it is checked elsewhere in
                    # this class); there is no spelling to upper-case then.
                    continue
                counts[name] = counts.get(name, 0) + 1

            similar = {}
            ranked = sorted(counts.items(), key=lambda pair: pair[1],
                            reverse=True)
            for name, count in ranked:
                similar.setdefault(name.upper(), []).append([name, count])
            return original_func(similar)
        return func

    @_sort_tags
    def check_similar_tags(similar):
        """Print each spelling and its count for tags differing only by case."""
        for variants in similar.values():
            if len(variants) > 1:
                for name, count in variants:
                    print("%s %s" % (name, count))

    @_sort_tags
    def get_larger_tags(similar):
        """Return the top-ranked spelling of every tag with several spellings.

        Each group is ordered by descending count, so the first entry is the
        most frequently used spelling.
        """
        return [variants[0][0]
                for variants in similar.values()
                if len(variants) > 1]

    def convert_tags(self, tags):
        """Rewrite ``dc:subject`` elements to the spellings listed in *tags*.

        An element whose content already appears in *tags* is left alone.
        Otherwise, if its content equals an entry of *tags* when both are
        upper-cased, the element is replaced by a new ``dc:subject`` holding
        the first such entry. Elements without a match are not modified.
        Afterwards every ``dc:subject`` element is printed with its index.
        """
        # Built once; it does not change while the elements are visited.
        upper_tags = [candidate.upper() for candidate in tags]
        for tag in self.soup.findAll('dc:subject'):
            current = tag.renderContents()
            if current in tags:
                continue
            try:
                # Only the lookup is guarded, so errors raised while editing
                # the document are not mistaken for "no match".
                position = upper_tags.index(current.upper())
            except ValueError:
                continue
            new_tag = Tag(self.soup, "dc:subject")
            new_tag.insert(0, tags[position])
            tag.replaceWith(new_tag)

        for i, tag in enumerate(self.soup.findAll('dc:subject')):
            print(i)
            print(tag)

    def format2Delicious(self, out_path='./format2Delicious.htm'):
        """Write the items as a bookmark HTML document to *out_path*.

        Every ``item`` becomes a ``DT`` holding an ``A`` element with
        ``HREF``, ``ADD_DATE``, ``PRIVATE`` and ``TAGS`` attributes; non-empty
        descriptions are added as a ``DD`` inside that ``DT``. Each item is
        also printed while it is processed.

        The date format ends in a literal ``+0900``, so ``time.strptime``
        raises ``ValueError`` for a ``pubdate`` with any other offset.
        """
        new_soup = BeautifulSoup()
        meta = Tag(new_soup, "META", [
            ("HTTP-EQUIV", "Content-Type"),
            ("CONTENT", "text/html; charset=UTF-8"),
        ])
        new_soup.insert(0, meta)
        title = Tag(new_soup, "TITLE")
        new_soup.insert(1, title)
        h1 = Tag(new_soup, "H1")
        new_soup.insert(2, h1)
        dl = Tag(new_soup, "DL")
        new_soup.insert(3, dl)
        p = Tag(new_soup, "P")
        dl.insert(0, p)

        for i, item in enumerate(self.soup.findAll('item')):
            subjects = [tag.renderContents()
                        for tag in item.findAll("dc:subject")]
            print(i)
            print(item)
            print(subjects)

            tags = ','.join(subjects)
            descriptions = "\n".join([
                description.string
                for description in item.findAll("description")
                if description.string is not None
            ])
            # time.mktime accepts the struct_time from strptime directly.
            parsed = time.strptime(item.pubdate.string,
                                   '%a, %d %b %Y %H:%M:%S +0900')
            add_date = int(time.mktime(parsed))

            dt = Tag(new_soup, "DT")
            a = Tag(new_soup, "A", [
                ("HREF", str(item.link.string)),
                ("ADD_DATE", str(add_date)),
                ("PRIVATE", str(0)),
                ("TAGS", unicode(tags, 'utf_8'))
            ])
            if item.title.string is not None:
                a.insert(0, item.title.string)
            dt.insert(0, a)
            # ``dl`` already holds ``p`` at index 0, so inserting at ``i``
            # keeps the entries in order and leaves ``p`` after them.
            dl.insert(i, dt)
            if descriptions != '':
                dd = Tag(new_soup, "DD", [
                    ("DESCRIPTION", descriptions)
                ])
                dt.insert(1, dd)

        # ``with`` guarantees the output file is closed if writing fails.
        with open(out_path, 'w') as out:
            out.write(new_soup.prettify())

    def diff(self, a, b):
        """Return the items of *b* that have no case-insensitive equal in *a*.

        Items are compared by their ``upper()`` form, the same rule
        ``convert_tags`` uses, so an item is only dropped for a full match and
        not because it is a prefix of a longer entry of *a*. The order of *b*
        is preserved.
        """
        taken = set(entry.upper() for entry in a)
        return [item for item in b if item.upper() not in taken]

    def dump(self):
        """Print every ``item`` with its index and its list of tag contents."""
        for i, item in enumerate(self.soup.findAll('item')):
            print(i)
            print(item)
            print([tag.renderContents() for tag in item.findAll("dc:subject")])
# Driver: load the export, report tags that differ only by case, rewrite the
# tags to one spelling each, then print the result.
ldc = LDC()
ldc.check_similar_tags()
# Hand-picked spellings; they are passed to diff() and convert_tags() below.
# NOTE(review): this literal appears to be cut off (no closing quote or
# bracket) — restore the full list before running.
forced_tags = ["SNS", "DreamHost", "Eclipse", "CrowdSourcing", "SecondLife", "Subversion", "CNET", "MeCab", "YahooPipes", "Illustrator", "WordPress", "Windows", "ITpro", "RMagick", "Shibuya.pm", "Paypal", "Blog
# Encode every forced tag to UTF-8.
forced_tags = [x.encode('utf-8') for x in forced_tags]
# Top-ranked spellings from the export, minus those diff() matches against a
# forced tag; presumably so the forced spelling takes precedence — confirm.
larger_tags = ldc.diff(forced_tags , ldc.get_larger_tags())
ldc.convert_tags(larger_tags + forced_tags)
# NOTE(review): convert_tags is called a second time with the same argument;
# confirm whether the repetition is intentional.
ldc.convert_tags(larger_tags + forced_tags)
ldc.dump()
# Writing the converted bookmark file is currently disabled.
#ldc.format2Delicious()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment