PrintQuota
#############################
# This should work on your PC.
# You still have to manually
# input your quota.
# N.B. ECF runs Python 2.6;
# I'm not going to support that.
# Just run this on your PC:
# run 'pq' on ECF, then copy your
# quota into the script as num_pg.
############################
from glob import glob as __g
from PyPDF2 import PdfFileReader
import re
import pandas as pd
import os
# Given the number of pages I want to print (duplex), how much ECF quota is used?
def dupQuota(num_pages):
    # duplex: (1 + 0.5)/2 = 0.75 quota units per page, per the formula below
    quota = (num_pages + 0.5 * num_pages) / 2
    return quota

# Given my current quota, how many PDF pages can I print (duplex)?
def quotaToPage(num_quota):
    pages = num_quota * 2 / 1.5
    return pages
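# Illustrative sanity check of the two helpers above (numbers are examples,
# not official ECF rates):
#   dupQuota(40)     -> (40 + 20) / 2   = 30 quota units for a 40-page paper
#   quotaToPage(100) -> 100 * 2 / 1.5  ~= 133 duplex-printable pages
assert dupQuota(40) == 30.0
assert abs(quotaToPage(100) - 133.333) < 0.001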
def count_os(path):
    """
    Walk `path` and return a DataFrame with the name, location,
    and page count (via PyPDF2) of every PDF found.
    """
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(path):  #(r'/Documents/extra_learning/ug_theses')
        for f in files:
            if f.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, f), 'rb'))
                df2 = pd.DataFrame([[f, os.path.join(root, f), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
def knapsack(items, maxweight):
    """
    Solve the knapsack problem: find the most valuable subsequence
    of `items` whose total weight is no more than `maxweight`.

    Here `items` is a sequence of pairs `(value, (file_name, page_count))`,
    where `value` is a number (how much I care about the paper) and
    `page_count` is a non-negative integer acting as the weight.
    `maxweight` is a non-negative number (pages I can afford to print).

    Return a pair: the sum of values in the best subsequence, and the
    subsequence itself.

    In the classic plain (value, weight) form this behaves like:
        items = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
        knapsack(items, 15)  ->  (11, [(2, 1), (6, 4), (1, 1), (2, 2)])

    For example, my ECE568 papers are 100 pages but only 50% importance,
    vs. my CSC384 paper at 20 pages and 40% importance.
    """
    # Return the value of the most valuable subsequence of the first i
    # elements in items whose weights sum to no more than j.
    #@memoized
    def bestvalue(i, j):
        if i == 0:
            return 0
        value, file_page_tup = items[i - 1]
        file_name, weight = file_page_tup
        if weight > j:
            return bestvalue(i - 1, j)
        else:
            return max(bestvalue(i - 1, j),
                       bestvalue(i - 1, j - weight) + value)

    j = maxweight
    result = []
    for i in xrange(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            f, w = items[i - 1][1]
            j -= w
    result.reverse()
    return bestvalue(len(items), maxweight), result
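# The recursive bestvalue above is exponential without caching; the gist hints
# at a @memoized decorator but never defines one. A minimal dict-based sketch
# (illustrative, not the author's original): to use it, move this definition
# above knapsack() and restore the commented @memoized line on bestvalue.
def memoized(fn):
    cache = {}
    def wrapper(*args):
        if args not in cache:
            cache[args] = fn(*args)
        return cache[args]
    return wrapper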
#ms = count_pg(".")
path = os.getcwd()
pd_page = count_os(path)

# Build knapsack items as (value, (fileName, pageNumber));
# for now every paper's value is just its page count.
list_page = []
for i, j in zip(pd_page['fileName'], pd_page['pageNumber']):
    tup_pg = (i, j)
    list_page.append((j, tup_pg))
#print(list_page)

item = list_page
num_pg = 100  # quota reported by 'pq' on ECF; set manually
quota = quotaToPage(num_pg)
print quota
print knapsack(item, quota)
#print b1, b2
#print b2[0], b2[1]
################################
# FULLY FUNCTIONAL
# Returns the true page count for every PDF;
# no problems so far.
# TODO: pandas DataFrame to list of tuples?
# (see the sketch after this header)
################################
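# Hedged sketch for the TODO above (not run here; column names as used in
# count_os below): one way to turn the DataFrame into
# (value, (fileName, pageNumber)) tuples for the knapsack code in the
# first script.
#
# df = count_os(os.getcwd())
# items = [(pg, (name, pg))
#          for name, pg in zip(df['fileName'], df['pageNumber'])]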
from glob import glob as __g
from PyPDF2 import PdfFileReader
import re
import pandas as pd
import os
def count_os(path):
    """
    alternative method
    """
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(path):  #(r'/Documents/extra_learning/ug_theses')
        for f in files:
            if f.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, f), 'rb'))
                df2 = pd.DataFrame([[f, os.path.join(root, f), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
#ms = count_pg(".")
path = os.getcwd()
list_page = count_os(path)
print list_page['pageNumber']
#for name, pages in ms.items():
#    print name, pages
#    if pages == 0:
#        print "notice ", name
#print ms
"""
This module contains a function to count
the total pages for all PDF files in one directory.
"""
#from time import clock as __c  # Used for benchmarking.
from glob import glob as __g
from re import search as __s

def count(vPath):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.
    """
    #cdef double ti = __c()  # Used for benchmarking.
    vPDFfiles = __g(vPath + "\\" + '*.pdf')
    vMsg = {}
    for vPDFfile in vPDFfiles:
        vPages = 0  # reset per file so a stale count is never carried over
        vFileOpen = open(vPDFfile, 'rb', 1)
        for vLine in vFileOpen.readlines():
            if "/Count " in vLine:
                vPages = int(__s(r"/Count \d*", vLine).group()[7:])
        vMsg[vPDFfile] = vPages
        vFileOpen.close()
    #cdef double tf = __c()  # Used for benchmarking.
    #print tf - ti
    return vMsg
#
##################################
# Verified that this works on macOS.
#
# ISSUE: every PDF generated by LaTeX gets counted as 0 pages!
# (see the fallback sketch after count_pg below)
##################################
from glob import glob as __g
import re
pattern = re.compile(r"/Count\s+(\d+)")
def count_pg(vPath):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.
    """
    vPDFfiles = __g(vPath + "/" + '*.pdf')
    print vPDFfiles
    vMsg = {}
    for vPDFfile in vPDFfiles:
        vPages = 0
        content = open(vPDFfile, 'rb', 1).read()
        for match in pattern.finditer(content):
            vPages = int(match.group(1))
        vMsg[vPDFfile] = vPages
    return vMsg
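# Hedged fallback for the LaTeX issue noted above: pdflatex usually emits
# PDF >= 1.5 with the page tree stored inside compressed object streams, so
# "/Count N" never appears as plain text and the regex scan returns 0.
# Sketch of a fallback that re-counts those files with PyPDF2 (as in the
# earlier scripts); count_pg_fallback is a name made up for illustration.
from PyPDF2 import PdfFileReader

def count_pg_fallback(vPath):
    vMsg = count_pg(vPath)
    for vPDFfile, vPages in vMsg.items():
        if vPages == 0:
            vMsg[vPDFfile] = PdfFileReader(open(vPDFfile, 'rb')).getNumPages()
    return vMsg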
ms = count_pg(".")
print ms
import os
from os.path import join as pjoin, expanduser
import subprocess
from pprint import pprint
directory = expanduser("~/Documents/extra_learning/ug_theses")
for name in os.listdir(directory):
    if name.endswith(".pdf"):
        p = pjoin(directory, name)
        child = subprocess.Popen("pdftk %s dump_data output" % p, shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        sout, serr = child.communicate()
        if serr:
            print p
            print serr
        else:
            # Parse pdftk's "Key: value" lines into a dict, skipping
            # marker lines such as "InfoBegin" that have no colon.
            D = dict((t[0].strip(), t[1].strip()) for t in
                     (line.split(":", 1) for line in sout.split("\n") if ":" in line))
            pprint(D)
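            # The only field needed for the quota math is the page count;
            # pdftk reports it as "NumberOfPages" (hedged sketch, not in the
            # original gist):
            pages = int(D.get("NumberOfPages", 0))
            print name, pages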
########################################################
# As a cheap and poor-af student at UofT, I have a
# limited print quota and can't possibly print all the
# material. Given a directory of PDFs I want to print,
# and provided that I print duplex, what is the maximum
# number of papers I can print? And if the papers are
# not all given equal weight, how many can I print?
# Modeled as a knapsack problem.
########################################################
import pandas as pd
import os
from PyPDF2 import PdfFileReader
import commands
#cmd = "pq"
#output = commands.getoutput(cmd) #utorid -> PAGES PRINTED UNIX: 280 WINDOWS: 0 TOTAL: 280 LIMIT: 780
#try to extract the last number
#print type(output)
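# Hedged sketch for that TODO (not the author's original): pull TOTAL and
# LIMIT out of the 'pq' output with a regex and compute the remaining quota.
# The output format is taken from the comment above and may differ on ECF.
#
# import re
# def parse_pq(output):
#     total = int(re.search(r"TOTAL:\s*(\d+)", output).group(1))
#     limit = int(re.search(r"LIMIT:\s*(\d+)", output).group(1))
#     return limit - total  # quota still available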
# Given the number of pages I want to print (duplex), how much ECF quota is used?
def dupQuota(num_pages):
    quota = (num_pages + 0.5 * num_pages) / 2
    return quota

# Given my current quota, how many PDF pages can I print (duplex)?
def quotaToPage(num_quota):
    pages = num_quota * 2 / 1.5
    return pages
def knapsack(items, maxweight):
    """
    Solve the knapsack problem: find the most valuable subsequence
    of `items` whose total weight is no more than `maxweight`.

    `items` is a sequence of pairs `(value, weight)`, where `value` is
    a number and `weight` is a non-negative integer.
    `maxweight` is a non-negative integer.

    Return a pair whose first element is the sum of values in the most
    valuable subsequence, and whose second element is the subsequence.

    >>> items = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
    >>> knapsack(items, 15)
    (11, [(2, 1), (6, 4), (1, 1), (2, 2)])

    Here value means importance and weight means page count;
    for example, my ECE568 papers are 100 pages at 50% importance,
    vs. my CSC384 paper at 20 pages and 40%.
    """
    # Return the value of the most valuable subsequence of the first i
    # elements in items whose weights sum to no more than j.
    #@memoized
    def bestvalue(i, j):
        if i == 0:
            return 0
        value, weight = items[i - 1]
        if weight > j:
            return bestvalue(i - 1, j)
        else:
            return max(bestvalue(i - 1, j),
                       bestvalue(i - 1, j - weight) + value)

    j = maxweight
    result = []
    for i in xrange(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            j -= items[i - 1][1]
    result.reverse()
    return bestvalue(len(items), maxweight), result
item = [(4, 120), (2, 10), (6, 40), (1, 10), (2, 20)]
maxw = 10
#item = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
#knapsack(item, 100)
#TODO: knapsack problem?
def cheap_af(dict_paper_page, quota_left):
    '''
    dict_paper_page = {'file name': number of pages}
    i.e. `(value, weight)` pairs, where `value` is a number and
    `weight` is a non-negative integer, e.g.
    dict_paper_page = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
    '''
    man_arr = [65, 15, 26]
    #TODO: need to check
    # Extract the number of pages from a directory of papers I want to print.
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(r'Directory path'):  # placeholder path
        for file in files:
            if file.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, file), 'rb'))
                df2 = pd.DataFrame([[file, os.path.join(root, file), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
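# Hedged sketch of wiring this together (names and values illustrative, not
# the author's final code): build knapsack items from the DataFrame that
# cheap_af returns, using page count as both value and weight like the
# first script does.
#
# quota_left = quotaToPage(100)
# df = cheap_af({}, quota_left)
# items = [(pg, pg) for pg in df['pageNumber']]
# print knapsack(items, quota_left)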
# Linux equivalent? (see the Python/pdfinfo sketch after the batch script below)
REM PDF-Pages.cmd
@echo off
del output.txt
for /r %1 %%f in (*.pdf) do pdfinfo.exe -meta "%%f" >out.txt & echo "%%f", | tr.exe -d "\r\n" >>output.txt & find "Pages:" out.txt | tr.exe -d "\r\n\055\056\072[:alpha:][:space:]" >>output.txt & echo , | tr.exe -d "\r\n" >>output.txt & find "File size:" out.txt | tr.exe -d "\055\056\072[:space:][:alpha:]" >>output.txt & echo. >>output.txt
del out.txt
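# Hedged answer to the "linux?" question above: pdfinfo (from poppler-utils)
# also runs on Linux/macOS, so the batch script's idea can be redone in Python
# with subprocess. A minimal sketch, not the author's original; the function
# name is made up for illustration.
import os
import re
import subprocess

def count_with_pdfinfo(path):
    counts = {}
    for name in os.listdir(path):
        if name.endswith(".pdf"):
            out = subprocess.check_output(["pdfinfo", os.path.join(path, name)])
            m = re.search(r"Pages:\s+(\d+)", out)
            if m:
                counts[name] = int(m.group(1))
    return counts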
Dec. 12 page count: 648
