tsumare · June 29, 2016 20:17
diff --git a/xmarks_recover.py b/xmarks_recover.py
 # Download the xmarks "View Bookmarks Revision" from their webpage for all revisions, then run this to audit changes.

 import glob, re

 class BookmarkSeparator(object):
 	"""A separator entry inside a bookmark folder.

 	Carries fixed placeholder values for ``name``, ``description``,
 	``add_date`` and ``last_modified`` and offers ``printout`` and
 	``iter_path`` methods.
 	"""

 	# Placeholder metadata shared by every separator instance.
 	description = name = '<<SEPARATOR>>'
 	last_modified = add_date = 0

 	def iter_path(self):
 		"""Return an empty iterator: a separator contributes no paths."""
 		return iter(())

 	def printout(self, pathprefix):
 		"""Print the separator marker underneath ``pathprefix``."""
 		rendered = '{}/<<SEPARATOR>>'.format(pathprefix)
 		print(rendered)

 class Bookmark(object):
 	"""A single bookmarked link.

 	Attributes:
 		href: target URL of the bookmark.
 		add_date, last_modified: integer timestamps; an empty or missing
 			value is stored as 0.
 		name: display name of the bookmark.
 		description: free text attached to the bookmark; starts out empty.
 		shortcut_url: shortcut value passed by the caller (may be None).
 	"""

 	def __init__(self, href, add_date, last_modified, name, shortcut_url):
 		"""Store the bookmark fields, normalising both timestamps to int."""
 		self.href = href
 		# The timestamps arrive as (possibly empty) digit strings; keep them
 		# as ints so every entity type exposes the same attribute type.
 		self.add_date = int(add_date or 0)
 		self.last_modified = int(last_modified or 0)
 		self.name = name
 		self.description = ''
 		self.shortcut_url = shortcut_url

 	def printout(self, pathprefix):
 		"""Print ``<pathprefix>/<name>``."""
 		print('{}/{}'.format(pathprefix, self.name))

 	def iter_path(self):
 		"""Yield the single ``(name, bookmark)`` pair for this leaf."""
 		yield (self.name, self)

 class BookmarkFolder(object):
 	"""A bookmark folder parsed recursively from a Netscape-style bookmark export.

 	Building a folder from a line iterator consumes lines up to and including
 	the folder's own closing ``</DL><p>`` tag. Sub-folders are built from the
 	same iterator, so each one picks up exactly where its parent stopped.

 	Attributes:
 		add_date, last_modified: integer timestamps (0 when the value is empty).
 		name: folder title, or ``'<<TOOLBAR>>'`` for the personal toolbar folder.
 		description: text of the ``<DD>`` line(s) that follow the folder header.
 		personal_toolbar_folder: flag passed in by the caller.
 		contents: child folders, bookmarks and separators in file order.
 	"""

 	# One nesting level in the export is four spaces of indentation.
 	_INDENT = '    '

 	# Compiled once instead of on every parsed line.
 	_FOLDER_RE = re.compile(r'^<DT><H3 ADD_DATE="([0-9]+)" LAST_MODIFIED="([0-9]*)"( PERSONAL_TOOLBAR_FOLDER="true")?>(.+)</H3>$')
 	_LINK_RE = re.compile(r'^<DT><A HREF="([^"]+)" ADD_DATE="([0-9]*)" LAST_MODIFIED="([0-9]*)"(?: SHORTCUTURL="([^"]+)")?>(.+)</A>$')

 	def __init__(self, depth, parseiter, add_date, last_modified, name, personal_toolbar_folder):
 		"""Create the folder and, if ``parseiter`` is given, parse its contents.

 		Args:
 			depth: nesting level of this folder's children (the root uses 1).
 			parseiter: iterator over the remaining lines of the export, or
 				None to create an empty folder without parsing.
 			add_date, last_modified: digit strings; empty means 0.
 			name: folder title.
 			personal_toolbar_folder: when true the name is replaced by
 				``'<<TOOLBAR>>'``.

 		Raises:
 			RuntimeError: a line does not match the expected export layout.
 		"""
 		self.add_date = int((add_date if add_date else '0'), 10)
 		self.last_modified = int((last_modified if last_modified else '0'), 10)
 		self.name = name
 		self.description = ''
 		self.personal_toolbar_folder = personal_toolbar_folder
 		if self.personal_toolbar_folder:
 			self.name = '<<TOOLBAR>>'

 		self.contents = []

 		if parseiter is not None:
 			self._parse(depth, parseiter)

 	def _parse(self, depth, parseiter):
 		"""Consume this folder's lines from ``parseiter`` and fill ``contents``."""
 		open_tag = self._INDENT * (depth - 1) + '<DL><p>'
 		close_tag = self._INDENT * (depth - 1) + '</DL><p>'
 		child_prefix = self._INDENT * depth + '<'

 		previous_line = current_line = None
 		# A <DD> seen before any child describes the folder itself.
 		prev_entity = self
 		prev_is_description = False
 		for line in parseiter:
 			line = line.rstrip('\n')
 			previous_line, current_line = current_line, line

 			try:
 				if line.startswith('<DD>'):
 					prev_entity.description = line[4:]
 					prev_is_description = True
 					continue
 				if prev_is_description and not line.lstrip(' ').startswith('<'):
 					# Continuation of a multi-line description.
 					prev_entity.description += '\n' + line
 					continue
 				prev_is_description = False

 				if not line.strip(' '):
 					continue

 				if line == open_tag:
 					continue # our start tags

 				if not line.startswith(child_prefix):
 					# Anything shallower than a child must be our end tag.
 					# Raised explicitly so the check survives ``python -O``.
 					if line != close_tag:
 						raise RuntimeError('Unable to parse line: {}'.format(line))
 					return

 				line = line.lstrip(' ')
 				if line.startswith('<DT><H3 '):
 					m = self._FOLDER_RE.match(line)
 					if m is None:
 						raise RuntimeError('Unable to parse line: {}'.format(line))
 					# The child consumes its own lines from the shared iterator.
 					prev_entity = BookmarkFolder(depth+1, parseiter, add_date=m.group(1), last_modified=m.group(2), name=m.group(4), personal_toolbar_folder=(m.group(3) is not None))
 				elif line == '<HR>':
 					prev_entity = BookmarkSeparator()
 				elif line.startswith('<DT><A HREF='):
 					m = self._LINK_RE.match(line)
 					if m is None:
 						raise RuntimeError('Unable to parse line: {}'.format(line))
 					prev_entity = Bookmark(href=m.group(1), add_date=m.group(2), last_modified=m.group(3), name=m.group(5), shortcut_url=m.group(4))
 				else:
 					raise RuntimeError('Unable to parse line: {}'.format(line))
 				self.contents.append(prev_entity)
 			except Exception:
 				# Show where parsing failed, then let the error propagate.
 				print('Prev: {}'.format(previous_line))
 				print('Line: {}'.format(line))
 				raise

 	def printout(self, pathprefix=None):
 		"""Print this folder's path followed by all of its contents."""
 		if pathprefix is None:
 			name = self.name
 		else:
 			name = '{}/{}'.format(pathprefix, self.name)
 		print('{}/'.format(name))
 		for f in self.contents:
 			f.printout(name)

 	def iter_path(self):
 		"""Yield ``(path, entity)`` for every leaf below this folder.

 		Each path is prefixed with this folder's name and a slash.
 		"""
 		for child in self.contents:
 			for path, obj in child.iter_path():
 				yield ('{}/{}'.format(self.name, path), obj)


 def parse_data(htmlfile):
 	"""Parse one exported bookmarks HTML file into its root BookmarkFolder.

 	Every line up to and including the first line that is empty apart from
 	its newline is skipped; the remaining lines are handed to BookmarkFolder
 	at depth 1. If the file ends before such a line is found, an empty root
 	folder is returned.
 	"""
 	with open(htmlfile,'r') as f:
 		parseiter = iter(f)
 		try:
 			# The next() builtin works on Python 2.6+ and 3; the iterator's
 			# own .next() method only exists on Python 2.
 			while next(parseiter).rstrip('\n'):
 				pass
 		except StopIteration:
 			# Nothing left after the header: build an empty root folder.
 			parseiter = None
 		return BookmarkFolder(1, parseiter, add_date='', last_modified='', name='', personal_toolbar_folder=False)

 def revision_id(htmlfile):
 	"""Return the integer revision number encoded in a ``<number>.html`` file name.

 	Raises ValueError when the name without ``.html`` is not a base-10 integer.
 	"""
 	return int(htmlfile.replace('.html',''),10)


 def diff_revisions(old_paths, new_paths):
 	"""Describe how the bookmark paths changed between two revisions.

 	Args:
 		old_paths: dict mapping path -> entity (with an ``href``) of the
 			earlier revision.
 		new_paths: the same mapping for the later revision.

 	Returns:
 		A list of 'Removed:', 'Added:' and 'Edited:' lines ordered by path.
 		A path whose URL also occurs somewhere in the other revision is
 		skipped instead of being reported.
 	"""
 	old_hrefs = {entry.href for entry in old_paths.values()}
 	new_hrefs = {entry.href for entry in new_paths.values()}

 	changes = []
 	for path in sorted(set(old_paths) | set(new_paths)):
 		if path in old_paths and old_paths[path].href in new_hrefs:
 			continue # Not a deletion or insertion even if inconsistent
 		if path in new_paths and new_paths[path].href in old_hrefs:
 			continue # Not a deletion or insertion even if inconsistent

 		if path not in new_paths:
 			changes.append('Removed:  {}'.format(path))
 		if path not in old_paths:
 			changes.append('Added:    {}'.format(path))
 		if path in old_paths and path in new_paths and old_paths[path].href != new_paths[path].href:
 			changes.append('Edited:   {}'.format(path))
 	return changes


 # Parse every revision file found in the current directory.
 revisions = []
 for htmlfile in sorted(glob.glob('*.html')):
 	try:
 		revisions.append((revision_id(htmlfile), parse_data(htmlfile)))
 	except Exception:
 		# Name the offending file before the error propagates.
 		print(htmlfile)
 		raise

 # File names sort lexicographically ('10.html' before '2.html'); compare
 # revisions in numeric order instead.
 revisions.sort(key=lambda entry: entry[0])

 # recent = (paths of the current revision, paths of the previous revision)
 recent = (None, None)
 for revid, rev in revisions:
 	recent = (dict(rev.iter_path()), recent[0])
 	if recent[1] is None:
 		continue # first revision: nothing to compare against

 	rev_changes = diff_revisions(recent[1], recent[0])
 	if rev_changes:
 		print('*** REVISION #{} ***'.format(revid))
 		print('')
 		print('\n'.join(rev_changes))
 		print('')
	# Download the xmarks "View Bookmarks Revision" from their webpage for all revisions, then run this to audit changes.

	import glob, re

	class BookmarkSeparator(object):
		"""A separator entry inside a bookmark folder.

		Carries fixed placeholder values for ``name``, ``description``,
		``add_date`` and ``last_modified`` and offers ``printout`` and
		``iter_path`` methods.
		"""

		# Placeholder metadata shared by every separator instance.
		name = '<<SEPARATOR>>'
		add_date = 0
		last_modified = 0
		description = '<<SEPARATOR>>'

		def printout(self, pathprefix):
			"""Print the separator marker underneath ``pathprefix``."""
			print('{}/<<SEPARATOR>>'.format(pathprefix))

		def iter_path(self):
			"""Return an empty iterator: a separator contributes no paths."""
			return iter('')

	class Bookmark(object):
		"""A single bookmarked link.

		Attributes:
			href: target URL of the bookmark.
			add_date, last_modified: integer timestamps; an empty or missing
				value is stored as 0.
			name: display name of the bookmark.
			description: free text attached to the bookmark; starts out empty.
			shortcut_url: shortcut value passed by the caller (may be None).
		"""

		def __init__(self, href, add_date, last_modified, name, shortcut_url):
			"""Store the bookmark fields, normalising both timestamps to int."""
			self.href = href
			# The timestamps arrive as (possibly empty) digit strings; keep them
			# as ints so every entity type exposes the same attribute type.
			self.add_date = int(add_date or 0)
			self.last_modified = int(last_modified or 0)
			self.name = name
			self.description = ''
			self.shortcut_url = shortcut_url

		def printout(self, pathprefix):
			"""Print ``<pathprefix>/<name>``."""
			print('{}/{}'.format(pathprefix, self.name))

		def iter_path(self):
			"""Yield the single ``(name, bookmark)`` pair for this leaf."""
			yield (self.name, self)

	class BookmarkFolder(object):
		"""A bookmark folder parsed recursively from a Netscape-style bookmark export.

		Building a folder from a line iterator consumes lines up to and including
		the folder's own closing ``</DL><p>`` tag. Sub-folders are built from the
		same iterator, so each one picks up exactly where its parent stopped.

		Attributes:
			add_date, last_modified: integer timestamps (0 when the value is empty).
			name: folder title, or ``'<<TOOLBAR>>'`` for the personal toolbar folder.
			description: text of the ``<DD>`` line(s) that follow the folder header.
			personal_toolbar_folder: flag passed in by the caller.
			contents: child folders, bookmarks and separators in file order.
		"""

		# One nesting level in the export is four spaces of indentation.
		_INDENT = '    '

		# Compiled once instead of on every parsed line. Both timestamps of a
		# link may hold any number of digits, including none.
		_FOLDER_RE = re.compile(r'^<DT><H3 ADD_DATE="([0-9]+)" LAST_MODIFIED="([0-9]*)"( PERSONAL_TOOLBAR_FOLDER="true")?>(.+)</H3>$')
		_LINK_RE = re.compile(r'^<DT><A HREF="([^"]+)" ADD_DATE="([0-9]*)" LAST_MODIFIED="([0-9]*)"(?: SHORTCUTURL="([^"]+)")?>(.+)</A>$')

		def __init__(self, depth, parseiter, add_date, last_modified, name, personal_toolbar_folder):
			"""Create the folder and, if ``parseiter`` is given, parse its contents.

			Args:
				depth: nesting level of this folder's children (the root uses 1).
				parseiter: iterator over the remaining lines of the export, or
					None to create an empty folder without parsing.
				add_date, last_modified: digit strings; empty means 0.
				name: folder title.
				personal_toolbar_folder: when true the name is replaced by
					``'<<TOOLBAR>>'``.

			Raises:
				RuntimeError: a line does not match the expected export layout.
			"""
			self.add_date = int((add_date if add_date else '0'), 10)
			self.last_modified = int((last_modified if last_modified else '0'), 10)
			self.name = name
			self.description = ''
			self.personal_toolbar_folder = personal_toolbar_folder
			if self.personal_toolbar_folder:
				self.name = '<<TOOLBAR>>'

			self.contents = []

			if parseiter is not None:
				self._parse(depth, parseiter)

		def _parse(self, depth, parseiter):
			"""Consume this folder's lines from ``parseiter`` and fill ``contents``."""
			open_tag = self._INDENT * (depth - 1) + '<DL><p>'
			close_tag = self._INDENT * (depth - 1) + '</DL><p>'
			child_prefix = self._INDENT * depth + '<'

			previous_line = current_line = None
			# A <DD> seen before any child describes the folder itself.
			prev_entity = self
			prev_is_description = False
			for line in parseiter:
				line = line.rstrip('\n')
				previous_line, current_line = current_line, line

				try:
					if line.startswith('<DD>'):
						prev_entity.description = line[4:]
						prev_is_description = True
						continue
					if prev_is_description and not line.lstrip(' ').startswith('<'):
						# Continuation of a multi-line description.
						prev_entity.description += '\n' + line
						continue
					prev_is_description = False

					if not line.strip(' '):
						continue

					if line == open_tag:
						continue # our start tags

					if not line.startswith(child_prefix):
						# Anything shallower than a child must be our end tag.
						# Raised explicitly so the check survives ``python -O``.
						if line != close_tag:
							raise RuntimeError('Unable to parse line: {}'.format(line))
						return

					line = line.lstrip(' ')
					if line.startswith('<DT><H3 '):
						m = self._FOLDER_RE.match(line)
						if m is None:
							raise RuntimeError('Unable to parse line: {}'.format(line))
						# The child consumes its own lines from the shared iterator.
						prev_entity = BookmarkFolder(depth+1, parseiter, add_date=m.group(1), last_modified=m.group(2), name=m.group(4), personal_toolbar_folder=(m.group(3) is not None))
					elif line == '<HR>':
						prev_entity = BookmarkSeparator()
					elif line.startswith('<DT><A HREF='):
						m = self._LINK_RE.match(line)
						if m is None:
							raise RuntimeError('Unable to parse line: {}'.format(line))
						prev_entity = Bookmark(href=m.group(1), add_date=m.group(2), last_modified=m.group(3), name=m.group(5), shortcut_url=m.group(4))
					else:
						raise RuntimeError('Unable to parse line: {}'.format(line))
					self.contents.append(prev_entity)
				except Exception:
					# Show where parsing failed, then let the error propagate.
					print('Prev: {}'.format(previous_line))
					print('Line: {}'.format(line))
					raise

		def printout(self, pathprefix=None):
			"""Print this folder's path followed by all of its contents."""
			if pathprefix is None:
				name = self.name
			else:
				name = '{}/{}'.format(pathprefix, self.name)
			print('{}/'.format(name))
			for f in self.contents:
				f.printout(name)

		def iter_path(self):
			"""Yield ``(path, entity)`` for every leaf below this folder.

			Each path is prefixed with this folder's name and a slash.
			"""
			for child in self.contents:
				for path, obj in child.iter_path():
					yield ('{}/{}'.format(self.name, path), obj)


	def parse_data(htmlfile):
		"""Parse one exported bookmarks HTML file into its root BookmarkFolder.

		Every line up to and including the first line that is empty apart from
		its newline is skipped; the remaining lines are handed to BookmarkFolder
		at depth 1. If the file ends before such a line is found, an empty root
		folder is returned.
		"""
		with open(htmlfile,'r') as f:
			parseiter = iter(f)
			try:
				# The next() builtin works on Python 2.6+ and 3; the iterator's
				# own .next() method only exists on Python 2.
				while next(parseiter).rstrip('\n'):
					pass
			except StopIteration:
				# Nothing left after the header: build an empty root folder.
				parseiter = None
			return BookmarkFolder(1, parseiter, add_date='', last_modified='', name='', personal_toolbar_folder=False)

	def revision_id(htmlfile):
		"""Return the integer revision number encoded in a ``<number>.html`` file name.

		Raises ValueError when the name without ``.html`` is not a base-10 integer.
		"""
		return int(htmlfile.replace('.html',''),10)


	def diff_revisions(old_paths, new_paths):
		"""Describe how the bookmark paths changed between two revisions.

		Args:
			old_paths: dict mapping path -> entity (with an ``href``) of the
				earlier revision.
			new_paths: the same mapping for the later revision.

		Returns:
			A list of 'Removed:', 'Added:' and 'Edited:' lines ordered by path.
			A path whose URL also occurs somewhere in the other revision is
			skipped instead of being reported.
		"""
		old_hrefs = {entry.href for entry in old_paths.values()}
		new_hrefs = {entry.href for entry in new_paths.values()}

		changes = []
		for path in sorted(set(old_paths) | set(new_paths)):
			if path in old_paths and old_paths[path].href in new_hrefs:
				continue # Not a deletion or insertion even if inconsistent
			if path in new_paths and new_paths[path].href in old_hrefs:
				continue # Not a deletion or insertion even if inconsistent

			if path not in new_paths:
				changes.append('Removed: {}'.format(path))
			if path not in old_paths:
				changes.append('Added: {}'.format(path))
			if path in old_paths and path in new_paths and old_paths[path].href != new_paths[path].href:
				changes.append('Edited: {}'.format(path))
		return changes


	# Parse every revision file found in the current directory.
	revisions = []
	for htmlfile in sorted(glob.glob('*.html')):
		try:
			revisions.append((revision_id(htmlfile), parse_data(htmlfile)))
		except Exception:
			# Name the offending file before the error propagates.
			print(htmlfile)
			raise

	# File names sort lexicographically ('10.html' before '2.html'); compare
	# revisions in numeric order instead.
	revisions.sort(key=lambda entry: entry[0])

	# recent = (paths of the current revision, paths of the previous revision)
	recent = (None, None)
	for revid, rev in revisions:
		recent = (dict(rev.iter_path()), recent[0])
		if recent[1] is None:
			continue # first revision: nothing to compare against

		rev_changes = diff_revisions(recent[1], recent[0])
		if rev_changes:
			print('* REVISION #{} *'.format(revid))
			print('')
			print('\n'.join(rev_changes))
			print('')