Skip to content

Instantly share code, notes, and snippets.

@mikaelhg
Created October 15, 2016 22:39
Show Gist options
  • Save mikaelhg/b12fd79277231e1736a148de5a0c05f7 to your computer and use it in GitHub Desktop.
Save mikaelhg/b12fd79277231e1736a148de5a0c05f7 to your computer and use it in GitHub Desktop.
Diff between two subsequent editions of an ebook alpha file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
import epub
from bs4 import BeautifulSoup
def text_paras(book, href):
    """Return the text of every ``<p>`` element found in one item of *book*.

    The item is fetched with ``book.read_item(href)``, parsed by
    BeautifulSoup using the ``lxml`` parser, and each paragraph is reduced
    to its text content via ``get_text()``. The result is a list of strings
    in the order ``find_all`` yields them.
    """
    markup = book.read_item(href)
    document = BeautifulSoup(markup, 'lxml')
    paragraphs = []
    for para in document.find_all('p'):
        paragraphs.append(para.get_text())
    return paragraphs
# Open both alpha editions; the manifest of the first one drives the comparison.
book1 = epub.open_epub('/home/mikael/astlan/Apostles of Doom Alpha 1 - J. L. Langland.epub')
book2 = epub.open_epub('/home/mikael/astlan/Apostles of Doom Alpha 2 - J. L. Langland.epub')

for item in book1.opf.manifest.values():
    # Every manifest entry is listed, whether or not it gets diffed.
    print(item.href)
    # Only XHTML items are compared paragraph by paragraph.
    if item.media_type != 'application/xhtml+xml':
        continue
    # The same href is looked up in both books.
    paras1 = text_paras(book1, item.href)
    paras2 = text_paras(book2, item.href)
    matcher = difflib.SequenceMatcher(None, paras1, paras2)
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        # The header line is printed for every opcode, including 'equal'.
        print("%6s a[%d:%d] b[%d:%d]" % (tag, a_lo, a_hi, b_lo, b_hi))
        # 'A' is the old edition's side, 'B' the new edition's side;
        # a 'replace' shows both, A first.
        if tag in ('replace', 'delete'):
            print('A: ', paras1[a_lo:a_hi])
        if tag in ('replace', 'insert'):
            print('B: ', paras2[b_lo:b_hi])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment