Skip to content

Instantly share code, notes, and snippets.

@rschroll
Created June 16, 2012 23:05
Show Gist options
  • Save rschroll/2942769 to your computer and use it in GitHub Desktop.
Save rschroll/2942769 to your computer and use it in GitHub Desktop.
Decoding #pdfloc for highlighted text in the Sony PRS-T1
#!/usr/bin/env python
"""
%s mount-point
Print to a file information on the highlighted text of a selected file on
the reader at mount-point. The output file is tab-separated data of the
form:
page highlight-range mark-start mark-end mark-type marked-text
mark-start and mark-end are the 'pdfloc' data defining where the marked
text starts and stops. Some of this information is extracted into the
highlight-range field, which has format:
c,d,e,g -> C,D,E,G
We think 'c' (and 'C') tell us the text line, 'd' and 'e' tell us the
character, and 'g' is some kind of flag, but we don't understand how this
information is encoded. If you figure something out, please tell us:
https://github.com/rschroll/prsannots/issues/4
"""
# Copyright 2012 Robert Schroll
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
import os
import sys
from prsannots.prst1 import Reader
# Labels for the numeric markup_type codes read from the annotation table;
# main() looks each row's code up here when writing the mark-type column.
MT = {
    10: 'highlight',
    11: 'text',
    12: 'drawing',
}
def u_raw_input(prompt):
    """Prompt the user with raw_input and return the reply as unicode.

    The prompt is encoded with the encoding of sys.stdout before it is
    handed to raw_input, and the reply is decoded with the encoding of
    sys.stdin.

    In Python 2 the ``encoding`` attribute of a stream may be None (for
    example when the stream is redirected to a pipe or file); passing None
    to encode()/decode() raises TypeError, so UTF-8 is used as a fallback.
    """
    out_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    in_encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'
    return raw_input(prompt.encode(out_encoding)).decode(in_encoding)
def select_book(books):
    """Ask the user to choose one entry of *books* and return it.

    Prints a 1-based numbered menu. Each book is labelled with its
    ``title`` or, when the title is empty, with the last '/'-separated
    component of its ``file`` path. The reply is read with u_raw_input.

    Exits the process with status 1 when the reply is not an integer in
    the range 1..len(books).
    """
    print("Please select which book to get:")
    for i, book in enumerate(books):
        title = book.title or book.file.split('/')[-1]
        print(" %i. %s" % (i + 1, title))
    which = u_raw_input("> ")
    try:
        choice = int(which)
        # Reject 0 and negative numbers explicitly: books[choice - 1] would
        # otherwise wrap around and silently return a book from the end of
        # the list instead of reporting the bad reply.
        if not 1 <= choice <= len(books):
            raise IndexError(choice)
        return books[choice - 1]
    except (ValueError, IndexError):
        print("Could not understand your response. Aborting.")
        sys.exit(1)
def main(path):
    """Dump the annotations of a user-selected book to a tab-separated file.

    Opens the reader at *path*, lets the user pick a book with select_book,
    queries the ``annotation`` table for that book's rows ordered by page,
    and writes one line per row:

        page  highlight-range  mark-start  mark-end  mark-type  marked-text

    The output file name defaults to the book's base file name with a
    '.txt' extension; the user may type a different name at the prompt.
    """
    reader = Reader(path)
    book = select_book(reader.books)
    c = reader.db.cursor()
    c.execute('''select page, mark, mark_end, markup_type, marked_text
                 from annotation
                 where content_id = ?
                 order by page''', (book.id,))
    outfn = os.path.splitext(os.path.basename(book.file))[0] + '.txt'
    userfn = u_raw_input("Enter output file name [%s]: " % outfn)
    if userfn:
        outfn = userfn
    # 'with' guarantees the file is closed even if a row fails to format.
    with open(outfn, 'w') as f:
        for page, mark, mark_end, markup_type, marked_text in c:
            # Drop the first 8 and last 2 characters of each mark and split
            # the rest on commas. NOTE(review): presumably this strips a
            # '#pdfloc(' prefix and a ')' plus one trailing character --
            # confirm against real data.
            start, end = [s[8:-2].split(',') for s in (mark, mark_end)]
            # Fields 2-4 and 6 of each mark form the "c,d,e,g" group. The
            # end group takes its flag from the end mark, matching the
            # documented "c,d,e,g -> C,D,E,G" layout.
            hls = '%s,%s,%s,%s -> %s,%s,%s,%s' % tuple(
                start[2:5] + start[6:7] + end[2:5] + end[6:7])
            # Unknown markup codes are written as the raw code rather than
            # aborting the whole export with a KeyError.
            mark_type = MT.get(markup_type, markup_type)
            f.write('%i\t%s\t%s\t%s\t%s\t%s\n' % (
                int(page + 1), hls, mark[:-1], mark_end[:-1], mark_type,
                marked_text.encode('utf-8')))
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the mount point.
    if len(sys.argv) != 2:
        # Wrong argument count: show the module docstring as usage text.
        print(__doc__ % sys.argv[0])
        sys.exit(0)
    elif not os.path.ismount(sys.argv[1]):
        # The argument must name a mounted file system.
        print("First argument must be mount point of Sony Reader.")
        print("(%s does not appear to be a mount point.)" % sys.argv[1])
        sys.exit(1)
    else:
        main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment