Skip to content

Instantly share code, notes, and snippets.

@tsumare
Created June 29, 2016 20:17
Show Gist options
  • Save tsumare/ef564cbaf6f9c34aaa419e4ae8034c4b to your computer and use it in GitHub Desktop.
Save tsumare/ef564cbaf6f9c34aaa419e4ae8034c4b to your computer and use it in GitHub Desktop.
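# Download the Xmarks "View Bookmarks Revision" page for every revision, saving
# each one as <revision number>.html in the current directory, then run this
# script to audit the changes between consecutive revisions.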
import glob, re
class BookmarkSeparator(object):
    """Placeholder entry standing in for an ``<HR>`` divider inside a folder.

    It carries the same attribute and method names as the other entry
    classes so it can sit in a folder's contents list, but it never
    contributes a path when paths are enumerated.
    """

    # Fixed placeholder values: a separator has no metadata of its own.
    name = description = '<<SEPARATOR>>'
    add_date = last_modified = 0

    def iter_path(self):
        """Return an empty iterator; a separator has no bookmark path."""
        return iter(())

    def printout(self, pathprefix):
        """Print the separator marker below *pathprefix*."""
        marker = '{}/<<SEPARATOR>>'
        print(marker.format(pathprefix))
class Bookmark(object):
    """A single bookmarked link.

    Args:
        href: Target URL of the bookmark.
        add_date: Value of the entry's ADD_DATE attribute, as a decimal
            string or an int. Empty or ``None`` is stored as 0.
        last_modified: Value of the entry's LAST_MODIFIED attribute, handled
            the same way as *add_date*.
        name: Display name of the bookmark.
        shortcut_url: Value of the optional SHORTCUTURL attribute, or
            ``None`` when the entry has none.

    The ``description`` attribute starts out empty and may be filled in later
    by whoever owns the bookmark.
    """

    def __init__(self, href, add_date, last_modified, name, shortcut_url):
        self.href = href
        # Stored as ints so all entry types (bookmarks, folders, separators)
        # expose dates of the same type.
        self.add_date = self._to_int(add_date)
        self.last_modified = self._to_int(last_modified)
        self.name = name
        self.description = ''
        self.shortcut_url = shortcut_url

    @staticmethod
    def _to_int(value):
        """Convert a decimal string (or int) to int; empty/None becomes 0."""
        return int(str(value), 10) if value else 0

    def printout(self, pathprefix):
        """Print this bookmark's name below *pathprefix*."""
        print('{}/{}'.format(pathprefix, self.name))

    def iter_path(self):
        """Yield the single ``(name, bookmark)`` pair for this entry."""
        yield (self.name, self)
class BookmarkFolder(object):
    """A folder of bookmark entries, optionally parsed from an export file.

    When *parseiter* is given, the constructor consumes lines from it until
    this folder's closing ``</DL><p>`` tag (or the end of input) and fills
    ``contents`` with the entries it finds:

    * ``<DT><H3 ...>`` starts a nested folder, which is parsed recursively
      from the same iterator;
    * ``<HR>`` adds a ``BookmarkSeparator``;
    * ``<DT><A HREF=...>`` adds a ``Bookmark``;
    * ``<DD>`` (at the start of a line) sets the description of the most
      recent entry, or of this folder if no entry has been seen yet; lines
      that follow and do not start with ``<`` are appended to it.

    Args:
        depth: Nesting level of this folder's entries; entry lines must start
            with ``depth`` spaces followed by ``<``.
        parseiter: Iterator over the lines of the export, shared with nested
            folders, or ``None`` to create an empty folder.
        add_date: ADD_DATE as a decimal string; empty or ``None`` means 0.
        last_modified: LAST_MODIFIED as a decimal string; empty or ``None``
            means 0.
        name: Folder name.
        personal_toolbar_folder: When true, the name is replaced with
            ``'<<TOOLBAR>>'``.

    Raises:
        AssertionError: A folder or bookmark line does not match the expected
            attribute layout, or a line that ends the folder is not the
            expected closing tag.
        RuntimeError: An entry line is of an unrecognised kind.
    """

    # Compiled once here rather than being looked up again for every line.
    _FOLDER_RE = re.compile(
        r'^<DT><H3 ADD_DATE="([0-9]+)" LAST_MODIFIED="([0-9]*)"'
        r'( PERSONAL_TOOLBAR_FOLDER="true")?>(.+)</H3>$')
    _BOOKMARK_RE = re.compile(
        r'^<DT><A HREF="([^"]+)" ADD_DATE="([0-9]*)" LAST_MODIFIED="([0-9]*)"'
        r'(?: SHORTCUTURL="([^"]+)")?>(.+)</A>$')

    def __init__(self, depth, parseiter, add_date, last_modified, name, personal_toolbar_folder):
        self.add_date = int((add_date if add_date else '0'), 10)
        self.last_modified = int((last_modified if last_modified else '0'), 10)
        self.name = name
        self.description = ''
        self.personal_toolbar_folder = personal_toolbar_folder
        if self.personal_toolbar_folder:
            self.name = '<<TOOLBAR>>'
        self.contents = []
        if parseiter is None:
            return

        previous_line = None   # line before the current one, for diagnostics
        current_line = None
        prev_entity = self     # entity a following <DD> description belongs to
        prev_is_description = False
        for line in parseiter:
            line = line.rstrip('\n')
            previous_line, current_line = current_line, line
            try:
                if line.startswith('<DD>'):
                    prev_entity.description = line[4:]
                    prev_is_description = True
                    continue
                if prev_is_description and not line.lstrip(' ').startswith('<'):
                    # Continuation of a multi-line description.
                    prev_entity.description += '\n' + line
                    continue
                prev_is_description = False

                if not line.strip(' '):
                    continue
                if line == ' ' * (depth - 1) + '<DL><p>':
                    continue  # our start tag
                if not line.startswith(' ' * depth + '<'):
                    # Anything shallower than our entries must be our end tag.
                    # Raised explicitly (not via `assert`) so the check is not
                    # stripped under `python -O`; the exception type is kept.
                    if line != ' ' * (depth - 1) + '</DL><p>':
                        raise AssertionError(
                            'Expected closing </DL><p> tag, got: {!r}'.format(line))
                    return

                line = line.lstrip(' ')
                if line.startswith('<DT><H3 '):
                    m = self._FOLDER_RE.match(line)
                    if m is None:
                        raise AssertionError(
                            'Malformed folder line: {!r}'.format(line))
                    # The nested folder consumes its own lines from the shared
                    # iterator before control returns here.
                    prev_entity = BookmarkFolder(
                        depth + 1, parseiter,
                        add_date=m.group(1),
                        last_modified=m.group(2),
                        name=m.group(4),
                        personal_toolbar_folder=(m.group(3) is not None))
                    self.contents.append(prev_entity)
                elif line == '<HR>':
                    prev_entity = BookmarkSeparator()
                    self.contents.append(prev_entity)
                elif line.startswith('<DT><A HREF='):
                    m = self._BOOKMARK_RE.match(line)
                    if m is None:
                        raise AssertionError(
                            'Malformed bookmark line: {!r}'.format(line))
                    prev_entity = Bookmark(
                        href=m.group(1),
                        add_date=m.group(2),
                        last_modified=m.group(3),
                        name=m.group(5),
                        shortcut_url=m.group(4))
                    self.contents.append(prev_entity)
                else:
                    raise RuntimeError('Unable to parse line: {}'.format(line))
            except Exception:
                # Show where parsing failed, then let the error propagate.
                print('Prev: {}'.format(previous_line))
                print('Line: {}'.format(line))
                raise

    def printout(self, pathprefix=None):
        """Print this folder's path followed by all of its contents.

        Args:
            pathprefix: Path of the parent folder, or ``None`` for the root.
        """
        if pathprefix is None:
            name = self.name
        else:
            name = '{}/{}'.format(pathprefix, self.name)
        print('{}/'.format(name))
        for f in self.contents:
            f.printout(name)

    def iter_path(self):
        """Yield ``(path, entry)`` pairs for everything below this folder.

        Each path is prefixed with this folder's name and a ``/``.
        """
        for child in self.contents:
            for path, obj in child.iter_path():
                yield ('{}/{}'.format(self.name, path), obj)
def parse_data(htmlfile):
    """Parse one saved bookmarks-revision page into a root BookmarkFolder.

    Everything up to and including the first blank line of the file is
    skipped; the remaining lines are handed to ``BookmarkFolder`` at depth 1.
    If the file contains no blank line at all, an empty root folder is
    returned.

    Args:
        htmlfile: Path of the HTML file to read.

    Returns:
        The root ``BookmarkFolder`` (with an empty name).
    """
    with open(htmlfile, 'r') as f:
        parseiter = iter(f)
        try:
            # The next() built-in works on Python 2.6+ and 3, unlike the
            # Python 2-only iterator.next() method.
            while next(parseiter).rstrip('\n'):
                pass
        except StopIteration:
            # Ran off the end without finding a blank line: nothing to parse.
            parseiter = None
        # Parsing happens inside the constructor, so the folder must be built
        # while the file is still open.
        return BookmarkFolder(1, parseiter, add_date='', last_modified='', name='', personal_toolbar_folder=False)
def describe_changes(old_paths, new_paths):
    """Describe what changed between two ``{path: bookmark}`` mappings.

    A path is ignored when its bookmark's href also occurs somewhere in the
    other revision, so moved or renamed bookmarks are not reported.

    Args:
        old_paths: Mapping of path to entry (with an ``href`` attribute) for
            the earlier revision.
        new_paths: The same mapping for the later revision.

    Returns:
        A list of ``'Removed: ...'``, ``'Added: ...'`` and ``'Edited: ...'``
        strings, ordered by path.
    """
    old_hrefs = {entry.href for entry in old_paths.values()}
    new_hrefs = {entry.href for entry in new_paths.values()}
    changes = []
    for path in sorted(set(old_paths) | set(new_paths)):
        in_old = path in old_paths
        in_new = path in new_paths
        if in_old and old_paths[path].href in new_hrefs:
            continue  # Not a deletion or insertion even if inconsistent
        if in_new and new_paths[path].href in old_hrefs:
            continue  # Not a deletion or insertion even if inconsistent
        if not in_new:
            changes.append('Removed: {}'.format(path))
        if not in_old:
            changes.append('Added: {}'.format(path))
        if in_old and in_new and old_paths[path].href != new_paths[path].href:
            changes.append('Edited: {}'.format(path))
    return changes


# Parse every saved revision; each file is named <revision id>.html.
revisions = []
for htmlfile in sorted(glob.glob('*.html')):
    try:
        revisions.append((int(htmlfile.replace('.html', ''), 10), parse_data(htmlfile)))
    except Exception:
        # Name the offending file before the error propagates.
        print(htmlfile)
        raise

# Filenames sort lexicographically ('10.html' before '2.html'), so order the
# revisions by their numeric id to make sure consecutive ones are compared.
revisions.sort(key=lambda item: item[0])

previous_paths = None
for revid, rev in revisions:
    current_paths = dict(rev.iter_path())
    if previous_paths is not None:
        rev_changes = describe_changes(previous_paths, current_paths)
        if rev_changes:
            print('*** REVISION #{} ***'.format(revid))
            print('')
            print('\n'.join(rev_changes))
            print('')
    previous_paths = current_paths
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment