Created
June 16, 2012 23:05
-
-
Save rschroll/2942769 to your computer and use it in GitHub Desktop.
Decoding #pdfloc for highlighted text in the Sony PRS-T1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
%s mount-point | |
Print to a file information on the highlighted text of a selected file on | |
the reader at mount-point. The output file is tab-separated data of the | |
form: | |
page highlight-range mark-start mark-end mark-type marked-text | |
mark-start and mark-end are the 'pdfloc' data defining where the marked | |
text starts and stops. Some of this information is extracted into the | |
highlight-range field, which has format: | |
c,d,e,g -> C,D,E,G | |
We think 'c' (and 'C') tell us the text line, 'd' and 'e' tell us the | |
character, and 'g' is some kind of flag, but we don't understand how this | |
information is encoded. If you figure something out, please tell us: | |
https://github.com/rschroll/prsannots/issues/4 | |
""" | |
# Copyright 2012 Robert Schroll | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU Lesser General Public License as | |
# published by the Free Software Foundation, either version 3 of the | |
# License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser General Public | |
# License along with this program. If not, see | |
# <http://www.gnu.org/licenses/>. | |
import os | |
import sys | |
from prsannots.prst1 import Reader | |
# Names written to the output file for the numeric markup_type codes read
# from the annotation table.
MT = {
    10: 'highlight',
    11: 'text',
    12: 'drawing',
}
def u_raw_input(prompt):
    """raw_input with unicode encoding/decoding.

    The prompt is encoded for the output stream, one line is read with
    raw_input, and the reply is returned decoded with the input stream's
    encoding.

    sys.stdout.encoding and sys.stdin.encoding may be None (for example
    when the stream is redirected), and passing None to encode()/decode()
    raises TypeError, so the interpreter's default encoding is used as a
    fallback in that case.
    """
    out_encoding = sys.stdout.encoding or sys.getdefaultencoding()
    reply = raw_input(prompt.encode(out_encoding))
    in_encoding = sys.stdin.encoding or sys.getdefaultencoding()
    return reply.decode(in_encoding)
def select_book(books):
    """Prompt the user to pick one of books and return the chosen entry.

    A 1-based menu is printed, labelling each book with its title or, when
    the title is empty, with the last '/'-separated component of its file
    path.  If the reply is not an integer between 1 and len(books), a
    message is printed and the program exits with status 1.
    """
    print("Please select which book to get:")
    for i, book in enumerate(books):
        title = book.title or book.file.split('/')[-1]
        print("  %i. %s" % (i + 1, title))
    which = u_raw_input("> ")
    try:
        choice = int(which)
        # Reject 0 and negative numbers explicitly: books[choice - 1] would
        # otherwise wrap around and silently return an entry counted from
        # the end of the list.
        if choice < 1:
            raise IndexError(choice)
        return books[choice - 1]
    except (ValueError, IndexError):
        print("Could not understand your response.  Aborting.")
        sys.exit(1)
def _pdfloc_fields(mark):
    """Return comma-separated fields 2, 3, 4 and 6 of one pdfloc mark."""
    # Drop the first 8 characters (presumably the '#pdfloc(' prefix -- TODO
    # confirm) and the last two characters, then split what remains.
    fields = mark[8:-2].split(',')
    return fields[2:5] + fields[6:7]


def main(path):
    """Write the annotations of a user-selected book to a text file.

    path is handed to Reader.  The user picks one of the reader's books,
    its rows in the annotation table are fetched ordered by page, and the
    user may override the default output name (the book's file name with
    its extension replaced by '.txt').  One tab-separated line is written
    per annotation:

        page  highlight-range  mark-start  mark-end  mark-type  marked-text

    Raises KeyError if a row has a markup_type that is not in MT.
    """
    reader = Reader(path)
    book = select_book(reader.books)

    c = reader.db.cursor()
    c.execute('''select page, mark, mark_end, markup_type, marked_text
                    from annotation
                    where content_id = ?
                    order by page''', (book.id,))

    outfn = os.path.splitext(os.path.basename(book.file))[0] + '.txt'
    userfn = u_raw_input("Enter output file name [%s]: " % outfn)
    if userfn:
        outfn = userfn

    # The with-block closes the file even if a row fails to format.
    with open(outfn, 'w') as f:
        for page, mark, mark_end, markup_type, marked_text in c:
            # The first four values come from mark-start and the last four
            # from mark-end, matching the documented 'c,d,e,g -> C,D,E,G'.
            hls = '%s,%s,%s,%s -> %s,%s,%s,%s' % tuple(
                _pdfloc_fields(mark) + _pdfloc_fields(mark_end))
            # A NULL marked_text column arrives as None, which has no
            # encode(); write an empty field for it instead of crashing.
            text = marked_text if marked_text is not None else u''
            # The stored page number is incremented by one for output, and
            # the final character of each mark is dropped.
            f.write('%i\t%s\t%s\t%s\t%s\t%s\n' % (
                int(page + 1), hls, mark[:-1], mark_end[:-1],
                MT[markup_type], text.encode('utf-8')))
if __name__ == '__main__':
    # Exactly one command-line argument is expected; with any other count
    # the module docstring is printed as usage text.
    args = sys.argv[1:]
    if len(args) != 1:
        print(__doc__ % sys.argv[0])
        sys.exit(0)

    mount_point = args[0]
    if not os.path.ismount(mount_point):
        print("First argument must be mount point of Sony Reader.")
        print("(%s does not appear to be a mount point.)" % mount_point)
        sys.exit(1)

    main(mount_point)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment