Created
June 29, 2016 20:17
-
-
Save tsumare/ef564cbaf6f9c34aaa419e4ae8034c4b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the xmarks "View Bookmarks Revision" from their webpage for all revisions, then run this to audit changes.
import glob
import re
class BookmarkSeparator(object):
    """A separator entry inside a bookmark folder.

    The attributes below are class-level constants so a separator exposes
    the same ``name`` / ``add_date`` / ``last_modified`` / ``description``
    fields as the other bookmark entities.
    """

    name = '<<SEPARATOR>>'
    add_date = 0
    last_modified = 0
    description = '<<SEPARATOR>>'

    def printout(self, pathprefix):
        """Print the separator marker underneath ``pathprefix``."""
        print('{}/<<SEPARATOR>>'.format(pathprefix))

    def iter_path(self):
        """Return an empty iterator.

        A separator contributes no ``(path, object)`` pairs, so it never
        shows up when a folder's paths are collected.
        """
        return iter(())
class Bookmark(object):
    """A single bookmarked link.

    Args:
        href: Target URL of the bookmark.
        add_date: Creation timestamp, as a decimal string or int; an empty
            value or ``None`` is stored as ``0``.
        last_modified: Modification timestamp, same conventions as
            ``add_date``.
        name: Display title of the bookmark.
        shortcut_url: Value of the optional shortcut attribute, or ``None``.
    """

    def __init__(self, href, add_date, last_modified, name, shortcut_url):
        self.href = href
        # Stored as ints so they are comparable with the int timestamps
        # kept by BookmarkFolder and BookmarkSeparator.
        self.add_date = self._to_timestamp(add_date)
        self.last_modified = self._to_timestamp(last_modified)
        self.name = name
        self.description = ''
        self.shortcut_url = shortcut_url

    @staticmethod
    def _to_timestamp(value):
        """Convert a possibly empty timestamp value to an int (empty -> 0)."""
        return int(value) if value else 0

    def printout(self, pathprefix):
        """Print this bookmark's name underneath ``pathprefix``."""
        print('{}/{}'.format(pathprefix, self.name))

    def iter_path(self):
        """Yield the single ``(name, bookmark)`` pair for this entry."""
        yield (self.name, self)
class BookmarkFolder(object):
    """A bookmark folder together with its (recursively parsed) contents.

    The constructor consumes lines from ``parseiter`` until it reaches this
    folder's closing ``</DL><p>`` tag. Nested folders are built recursively
    on the *same* iterator, so each child consumes its own lines before
    control returns to the parent.

    Args:
        depth: Nesting level of this folder; the outermost folder uses 1.
            Child entries are expected to be indented by ``depth`` spaces and
            the folder's own ``<DL><p>`` / ``</DL><p>`` tags by ``depth - 1``.
        parseiter: Iterator over the remaining lines of the bookmark file,
            or ``None`` to create an empty folder without parsing.
        add_date: Creation timestamp as a decimal string ('' means 0).
        last_modified: Modification timestamp as a decimal string
            ('' means 0).
        name: Folder title.
        personal_toolbar_folder: When true, the name is replaced by
            ``'<<TOOLBAR>>'``.

    Raises:
        AssertionError: A folder/bookmark header or a closing tag does not
            have the expected form.
        RuntimeError: An entry line is of an unrecognised kind.
    """

    # NOTE(review): the indent unit is a single space per level. Bookmark
    # exports are often indented more deeply per level -- confirm this
    # against real input before relying on it.
    _INDENT = ' '

    # Compiled once for the class instead of once per parsed line.
    _FOLDER_RE = re.compile(
        '^<DT><H3 ADD_DATE="([0-9]+)" LAST_MODIFIED="([0-9]*)"'
        '( PERSONAL_TOOLBAR_FOLDER="true")?>(.+)</H3>$')
    _LINK_RE = re.compile(
        '^<DT><A HREF="([^"]+)" ADD_DATE="([0-9]*)" LAST_MODIFIED="([0-9]*)"'
        '(?: SHORTCUTURL="([^"]+)")?>(.+)</A>$')

    def __init__(self, depth, parseiter, add_date, last_modified, name, personal_toolbar_folder):
        self.add_date = int((add_date if add_date else '0'), 10)
        self.last_modified = int((last_modified if last_modified else '0'), 10)
        self.name = name
        self.description = ''
        self.personal_toolbar_folder = personal_toolbar_folder
        if self.personal_toolbar_folder:
            self.name = '<<TOOLBAR>>'
        self.contents = []
        if parseiter is not None:
            self._parse(depth, parseiter)

    def _parse(self, depth, parseiter):
        """Fill ``self.contents`` from ``parseiter`` up to this folder's end tag."""
        open_tag = self._INDENT * (depth - 1) + '<DL><p>'
        close_tag = self._INDENT * (depth - 1) + '</DL><p>'
        entry_prefix = self._INDENT * depth + '<'

        previous_line = None  # line seen before the current one (debug output)
        current_line = None
        # A <DD> line seen before any child entry describes this folder itself.
        prev_entity = self
        prev_is_description = False

        for raw_line in parseiter:
            line = raw_line.rstrip('\n')
            previous_line, current_line = current_line, line
            try:
                if line.startswith('<DD>'):
                    prev_entity.description = line[4:]
                    prev_is_description = True
                    continue
                if prev_is_description and not line.lstrip(' ').startswith('<'):
                    # Continuation of a multi-line description.
                    prev_entity.description += '\n' + line
                    continue
                prev_is_description = False

                if not line.strip(' '):
                    continue
                if line == open_tag:
                    continue  # our start tag
                if not line.startswith(entry_prefix):
                    # Anything shallower than a child entry must be our end
                    # tag. Raised explicitly (not via ``assert``) so the check
                    # still runs under ``python -O``.
                    if line != close_tag:
                        raise AssertionError(
                            'Expected closing tag {!r}'.format(close_tag))
                    return

                line = line.lstrip(' ')
                if line.startswith('<DT><H3 '):
                    m = self._FOLDER_RE.match(line)
                    if m is None:
                        raise AssertionError('Malformed folder header')
                    # The child consumes its own lines from the shared iterator.
                    prev_entity = BookmarkFolder(
                        depth + 1, parseiter,
                        add_date=m.group(1),
                        last_modified=m.group(2),
                        name=m.group(4),
                        personal_toolbar_folder=(m.group(3) is not None))
                elif line == '<HR>':
                    prev_entity = BookmarkSeparator()
                elif line.startswith('<DT><A HREF='):
                    m = self._LINK_RE.match(line)
                    if m is None:
                        raise AssertionError('Malformed bookmark entry')
                    prev_entity = Bookmark(
                        href=m.group(1),
                        add_date=m.group(2),
                        last_modified=m.group(3),
                        name=m.group(5),
                        shortcut_url=m.group(4))
                else:
                    raise RuntimeError('Unable to parse line: {}'.format(line))
                self.contents.append(prev_entity)
            except Exception:
                # Show where parsing failed, then let the error propagate.
                print('Prev: {}'.format(previous_line))
                print('Line: {}'.format(line))
                raise

    def printout(self, pathprefix=None):
        """Print this folder's path followed by everything it contains.

        Args:
            pathprefix: Path of the parent folder, or ``None`` when this
                folder is the starting point of the printout.
        """
        if pathprefix is None:
            name = self.name
        else:
            name = '{}/{}'.format(pathprefix, self.name)
        print('{}/'.format(name))
        for entry in self.contents:
            entry.printout(name)

    def iter_path(self):
        """Yield ``(path, object)`` for every path produced by the contents.

        Each child's path is prefixed with this folder's name and a ``/``.
        """
        for child in self.contents:
            for path, obj in child.iter_path():
                yield ('{}/{}'.format(self.name, path), obj)
def parse_data(htmlfile):
    """Parse one bookmark export file into a root ``BookmarkFolder``.

    Everything up to and including the first blank line is skipped as a
    header; the rest is handed to ``BookmarkFolder`` for parsing at depth 1.

    Args:
        htmlfile: Path of the bookmark HTML file to read.

    Returns:
        The root ``BookmarkFolder``. If the file contains no blank line at
        all, an empty root folder is returned.
    """
    with open(htmlfile, 'r') as f:
        parseiter = iter(f)
        # Plain iteration instead of the Python 2-only ``parseiter.next()``
        # so this runs on both Python 2 and Python 3.
        for header_line in parseiter:
            if not header_line.rstrip('\n'):
                break  # first blank line: the header is over
        else:
            # File exhausted without a blank line: nothing left to parse.
            parseiter = None
        # Built inside the ``with`` block because the folder reads from the
        # still-open file while it is being constructed.
        return BookmarkFolder(1, parseiter, add_date='', last_modified='', name='', personal_toolbar_folder=False)
def diff_revision_paths(old_paths, new_paths):
    """Describe bookmark changes between two revisions.

    Args:
        old_paths: Mapping of path -> bookmark for the earlier revision.
        new_paths: Mapping of path -> bookmark for the later revision.
            Values in both mappings must expose an ``href`` attribute.

    Returns:
        A list of ``'Removed: ...'`` / ``'Added: ...'`` / ``'Edited: ...'``
        strings, ordered by path. A path whose href still exists somewhere
        in the other revision is not reported at all.
    """
    old_hrefs = set(entry.href for entry in old_paths.values())
    new_hrefs = set(entry.href for entry in new_paths.values())
    changes = []
    for path in sorted(set(old_paths) | set(new_paths)):
        if path in old_paths and old_paths[path].href in new_hrefs:
            continue  # Not a deletion or insertion even if inconsistent
        if path in new_paths and new_paths[path].href in old_hrefs:
            continue  # Not a deletion or insertion even if inconsistent
        if path not in new_paths:
            changes.append('Removed: {}'.format(path))
        if path not in old_paths:
            changes.append('Added: {}'.format(path))
        if (path in old_paths and path in new_paths
                and old_paths[path].href != new_paths[path].href):
            changes.append('Edited: {}'.format(path))
    return changes


# Load every "<revision number>.html" file in the current directory.
revisions = []
for htmlfile in sorted(glob.glob('*.html')):
    try:
        # Drop only the trailing extension; the rest must be the revision id.
        revisions.append((int(htmlfile[:-len('.html')], 10), parse_data(htmlfile)))
    except Exception:
        # Name the offending file before the error propagates.
        print(htmlfile)
        raise

# Filenames sort lexicographically ('10.html' before '2.html'); order by the
# numeric revision id so each revision is diffed against its predecessor.
revisions.sort(key=lambda revision: revision[0])

previous_paths = None
for revid, rev in revisions:
    current_paths = dict(rev.iter_path())
    if previous_paths is not None:
        rev_changes = diff_revision_paths(previous_paths, current_paths)
        if rev_changes:
            print('*** REVISION #{} ***'.format(revid))
            print('')
            print('\n'.join(rev_changes))
            print('')
    previous_paths = current_paths
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment