Last active
December 12, 2017 19:45
-
-
Save violetguos/f94c486f9faa944bf49daf66fc58d1d5 to your computer and use it in GitHub Desktop.
PrintQuota
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#############################
# This should work on your PC.
# You still have to input your quota manually.
# N.B. ECF runs Python 2.6, which I'm not going to support;
# just run this on your PC.
# Try running 'pq' on ECF, copy your quota into this script,
# and set it as num_pg.
############################
from glob import glob as __g | |
from PyPDF2 import PdfFileReader | |
import re | |
import pandas as pd | |
import os | |
#give number of pages i want to print, what is the ECF quota | |
def dupQuota(num_pages):
    """Return the ECF quota needed to print `num_pages` pages.

    The quota is ``(num_pages + 0.5 * num_pages) / 2``, i.e. 0.75 quota
    units per page, which is the inverse of ``quotaToPage``.
    NOTE(review): the 1.5 / 2 factors presumably model duplex printing --
    confirm against the ECF pricing rules.

    Args:
        num_pages: Number of PDF pages to print.

    Returns:
        The quota consumed, as a float.
    """
    quota = (num_pages + 0.5 * num_pages) / 2
    # The original computed this value but never returned it, so every
    # caller received None.
    return quota
#given my current quota, how many PDF pages can i print | |
def quotaToPage(num_quota):
    """Convert an available quota into the number of PDF pages it covers.

    Evaluates ``num_quota * 2 / 1.5``; because of the division by 1.5 the
    result is a float and may be fractional.
    """
    return num_quota * 2 / 1.5
def count_os(path):
    """Collect the page count of every PDF found under `path`.

    Walks `path` recursively with ``os.walk`` and reads the page count of
    each file whose name ends in ".pdf" through ``PdfFileReader``.

    Args:
        path: Directory to search.

    Returns:
        A pandas DataFrame with one row per PDF and the columns
        ``fileName``, ``fileLocation`` (the joined root + file name) and
        ``pageNumber``.  When no PDF is found the frame is empty but still
        has those three columns.
    """
    columns = ['fileName', 'fileLocation', 'pageNumber']
    rows = []
    for root, dirs, files in os.walk(path):
        for f in files:
            if not f.endswith(".pdf"):
                continue
            location = os.path.join(root, f)
            # Close each PDF as soon as it has been counted; the original
            # left one open handle behind per file.
            with open(location, 'rb') as handle:
                rows.append([f, location, PdfFileReader(handle).getNumPages()])
    # Build the frame once at the end: appending a one-row frame per file
    # copies the whole frame every time, and DataFrame.append no longer
    # exists in pandas 2.x.
    return pd.DataFrame(rows, columns=columns)
def knapsack(items, maxweight):
    """Solve the 0/1 knapsack problem for a list of named, weighted items.

    Finds the most valuable subsequence of `items` whose weights sum to no
    more than `maxweight`.

    `items` is a sequence of pairs ``(value, (file_name, weight))``: `value`
    is a number (the importance of the paper), `file_name` identifies the
    item and `weight` is a non-negative number (its page count).
    `maxweight` is a non-negative number (the pages that can be printed).

    Returns a pair whose first element is the sum of values in the most
    valuable subsequence, and whose second element is that subsequence (in
    the original order of `items`).

    >>> items = [(4, ("a", 12)), (2, ("b", 1)), (6, ("c", 4)),
    ...          (1, ("d", 1)), (2, ("e", 2))]
    >>> knapsack(items, 15)
    (11, [(2, ('b', 1)), (6, ('c', 4)), (1, ('d', 1)), (2, ('e', 2))])

    For example, my ECE568 papers got 100 pages, but assigned weight
    = 50% vs my CSC384 paper, 20 pages, 40%.
    """
    # Cache of bestvalue results keyed by (i, j).  Without it the recursion
    # re-solves the same sub-problems and is exponential in len(items).
    cache = {}

    def bestvalue(i, j):
        # Value of the most valuable subsequence of the first i elements of
        # items whose weights sum to no more than j.
        if i == 0:
            return 0
        key = (i, j)
        if key not in cache:
            value, (_name, weight) = items[i - 1]
            without_item = bestvalue(i - 1, j)
            if weight > j:
                cache[key] = without_item
            else:
                cache[key] = max(without_item,
                                 bestvalue(i - 1, j - weight) + value)
        return cache[key]

    # Walk back from the last item: an item belongs to the optimum exactly
    # when dropping it lowers the best achievable value.
    j = maxweight
    result = []
    for i in range(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            j -= items[i - 1][1][1]
    result.reverse()
    return bestvalue(len(items), maxweight), result
#ms = count_pg(".") | |
# Scan the current working directory for PDFs.
path = os.getcwd()
pd_page = count_os(path)

# One knapsack item per PDF, shaped (value, (file name, page count)); the
# page count is used as the value as well as the weight.
list_page = [
    (pages, (name, pages))
    for name, pages in zip(pd_page['fileName'], pd_page['pageNumber'])
]
item = list_page

# num_pg holds the quota copied by hand from 'pq' on ECF.
num_pg = 100
quota = quotaToPage(num_pg)
print(quota)
print(knapsack(item, quota))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################
# FULLY FUNCTIONAL
# Returns the true page count for all PDFs; no
# problems so far.
# TODO: convert the pandas DataFrame to a list of tuples?
################################
from glob import glob as __g | |
from PyPDF2 import PdfFileReader | |
import re | |
import pandas as pd | |
import os | |
def count_os(path):
    """Collect the page count of every PDF found under `path`.

    Walks `path` recursively with ``os.walk`` and reads the page count of
    each file whose name ends in ".pdf" through ``PdfFileReader``.

    Args:
        path: Directory to search.

    Returns:
        A pandas DataFrame with one row per PDF and the columns
        ``fileName``, ``fileLocation`` (the joined root + file name) and
        ``pageNumber``.  When no PDF is found the frame is empty but still
        has those three columns.
    """
    columns = ['fileName', 'fileLocation', 'pageNumber']
    rows = []
    for root, dirs, files in os.walk(path):
        for f in files:
            if not f.endswith(".pdf"):
                continue
            location = os.path.join(root, f)
            # Close each PDF as soon as it has been counted; the original
            # left one open handle behind per file.
            with open(location, 'rb') as handle:
                rows.append([f, location, PdfFileReader(handle).getNumPages()])
    # Build the frame once at the end: appending a one-row frame per file
    # copies the whole frame every time, and DataFrame.append no longer
    # exists in pandas 2.x.
    return pd.DataFrame(rows, columns=columns)
#ms = count_pg(".") | |
# Count the pages of every PDF under the current working directory and
# show the per-file page numbers.
path = os.getcwd()
list_page = count_os(path)
print(list_page['pageNumber'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This module contains a function to count | |
the total pages for all PDF files in one directory. | |
""" | |
#from time import clock as __c #Used for benchmark. | |
from glob import glob as __g | |
from re import search as __s | |
def count( vPath ):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.

    The page count is taken from the last line of the raw file that holds a
    "/Count <digits>" entry; a file with no such entry is reported as 0.
    Only the files directly inside vPath are inspected (no recursion).
    """
    # Local import: this module's header only imports glob and re.search.
    import os

    vMsg = {}
    # os.path.join instead of a hard-coded "\\" so the search also works
    # outside Windows.
    for vPDFfile in __g( os.path.join( vPath, '*.pdf' ) ):
        # Reset per file; otherwise a file without any /Count entry would
        # inherit the count of the previously scanned file.
        vPages = 0
        with open( vPDFfile, 'rb' ) as vFileOpen:
            for vLine in vFileOpen:
                # Bytes pattern because the file is read in binary mode;
                # requiring at least one digit avoids int('') on a bare
                # "/Count " token.
                vMatch = __s( br"/Count (\d+)", vLine )
                if vMatch:
                    vPages = int( vMatch.group(1) )
        vMsg[vPDFfile] = vPages
    return vMsg
# |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################
# Verified that this works on macOS
#
# ISSUE: every PDF generated by LaTeX is counted as 0 pages!
##################################
from glob import glob as __g | |
import re | |
# Bytes pattern: the PDFs are read in binary mode, and a str pattern cannot
# be applied to bytes on Python 3 (on Python 2 the two are the same type).
pattern = re.compile(br"/Count\s+(\d+)")


def count_pg(vPath):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.

    The page count is the number in the last "/Count <digits>" entry found
    in the raw bytes of the file; a file without such an entry is reported
    as 0.  Only files directly inside vPath are inspected, and the list of
    matched files is printed before scanning.
    """
    vPDFfiles = __g( vPath + "/" + '*.pdf' )
    print(vPDFfiles)
    vMsg = {}
    for vPDFfile in vPDFfiles:
        vPages = 0
        # Read inside a with-block so the handle is closed per file rather
        # than left for the garbage collector.
        with open( vPDFfile, 'rb' ) as vFile:
            content = vFile.read()
        # Later matches overwrite earlier ones: the last /Count wins.
        for match in pattern.finditer(content):
            vPages = int(match.group(1))
        vMsg[vPDFfile] = vPages
    return vMsg
# Report the page count of every PDF in the current directory.
ms = count_pg(".")
print(ms)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from os.path import join as pjoin, expanduser | |
import subprocess | |
from pprint import pprint | |
# Dump the pdftk metadata of every PDF directly inside `directory`.
directory = expanduser("~/Documents/extra_learning/ug_theses")
for name in os.listdir(directory):
    if not name.endswith(".pdf"):
        continue
    p = pjoin(directory, name)
    # Pass an argument list and skip the shell: with the old
    # "pdftk %s ..." string a path containing spaces or shell
    # metacharacters was split or interpreted by the shell.
    # NOTE(review): "output" is passed without a file name, exactly as
    # before -- confirm pdftk accepts that form.
    child = subprocess.Popen(["pdftk", p, "dump_data", "output"],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             universal_newlines=True)  # text on Python 2 and 3
    sout, serr = child.communicate()
    if serr:
        print(p)
        print(serr)
    else:
        # Each report line is "Key: value".  Split on the first colon only
        # so values that contain colons are kept whole, and skip lines
        # without a colon instead of raising IndexError.  Repeated keys
        # keep the last value, as before.
        D = {}
        for line in sout.splitlines():
            key, sep, value = line.partition(":")
            if sep:
                D[key.strip()] = value.strip()
        pprint(D)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################
# As a cheap and poor af student at UofT,
# I have a limited amount of print
# quota and I can't possibly print
# all the material. Given a directory of
# PDFs I want to print, and provided that I
# go duplex, what is the maximum number of
# papers I can print? If not all the papers are given
# equal weight, how many can I print?
# Modeled as a knapsack problem.
########################################################
import pandas as pd | |
import os | |
from PyPDF2 import PdfFileReader | |
import commands | |
#cmd = "pq" | |
#output = commands.getoutput(cmd) #utorid -> PAGES PRINTED UNIX: 280 WINDOWS: 0 TOTAL: 280 LIMIT: 780 | |
#try to extract the last number | |
#print type(output) | |
#give number of pages i want to print, what is the ECF quota | |
def dupQuota(num_pages):
    """Return the ECF quota needed to print `num_pages` pages duplex.

    The quota is ``(num_pages + 0.5 * num_pages) / 2``, i.e. 0.75 quota
    units per page.
    NOTE(review): the 1.5 / 2 factors are taken as-is from the formula --
    confirm against the ECF pricing rules.

    Args:
        num_pages: Number of PDF pages to print.

    Returns:
        The quota consumed, as a float.
    """
    quota = (num_pages + 0.5 * num_pages) / 2
    # The original computed this value but never returned it, so every
    # caller received None.
    return quota
#given my current quota, how many PDF pages can i print | |
def quotaToPage(num_quota):
    """Convert an available quota into the number of PDF pages it covers.

    Evaluates ``num_quota * 2 / 1.5``, the inverse of the formula used by
    ``dupQuota``.

    Args:
        num_quota: Remaining print quota.

    Returns:
        The number of pages as a float (may be fractional).
    """
    pages = num_quota * 2 / 1.5
    # The original computed this value but never returned it, so every
    # caller received None.
    return pages
def knapsack(items, maxweight):
    """
    Solve the knapsack problem by finding the most valuable
    subsequence of `items` that weighs no more than `maxweight`.

    `items` is a sequence of pairs `(value, weight)`, where `value` is
    a number and `weight` is a non-negative integer.
    `maxweight` is a non-negative integer.

    Return a pair whose first element is the sum of values in the most
    valuable subsequence, and whose second element is the subsequence.

    >>> items = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
    >>> knapsack(items, 15)
    (11, [(2, 1), (6, 4), (1, 1), (2, 2)])

    Here the pairs are read as (importance, page count): for example, my
    ECE568 papers got 100 pages, but assigned weight = 50% vs my CSC384
    paper, 20 pages, 40%.
    """
    # Cache of bestvalue results keyed by (i, j).  Without it the recursion
    # re-solves the same sub-problems and is exponential in len(items).
    cache = {}

    def bestvalue(i, j):
        # Value of the most valuable subsequence of the first i elements of
        # items whose weights sum to no more than j.
        if i == 0:
            return 0
        key = (i, j)
        if key not in cache:
            value, weight = items[i - 1]
            without_item = bestvalue(i - 1, j)
            if weight > j:
                cache[key] = without_item
            else:
                cache[key] = max(without_item,
                                 bestvalue(i - 1, j - weight) + value)
        return cache[key]

    # Walk back from the last item: an item belongs to the optimum exactly
    # when dropping it lowers the best achievable value.
    j = maxweight
    result = []
    for i in range(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            j -= items[i - 1][1]
    result.reverse()
    return bestvalue(len(items), maxweight), result
# Sample (value, weight) pairs and a capacity, presumably for trying out
# knapsack() by hand.
item = [
    (4, 120),
    (2, 10),
    (6, 40),
    (1, 10),
    (2, 20),
]
maxw = 10
#item = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)] | |
#knapsack(item, 100) | |
#TODO: knapsack problem?
def cheap_af(dict_paper_page, quota_left):
    """Placeholder for picking the papers that fit in the remaining quota.

    Not implemented yet: neither argument is read and nothing is returned.

    dict_paper_page: described both as {'file name': number of pages} and
        as a list of (value, weight) pairs -- `value` a number, `weight` a
        non-negative integer -- e.g.
        [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)].
    quota_left: presumably the print quota still available -- TODO confirm.
    """
    # Hand-entered numbers, currently unused.
    man_arr = [65, 15, 26]
#TODO: need to check
# Extract the number of pages of every PDF in a directory of papers I want
# to print.  'Directory path' is a placeholder to be replaced by hand.
rows = []
for root, dirs, files in os.walk(r'Directory path'):
    for pdf_name in files:
        if pdf_name.endswith(".pdf"):
            location = os.path.join(root, pdf_name)
            # Open the PDF that was actually found.  The original read one
            # fixed placeholder file once, before the loop, and reported
            # its page count for every row (and never closed it).
            with open(location, 'rb') as handle:
                rows.append([pdf_name, location,
                             PdfFileReader(handle).getNumPages()])
# Build the frame once; per-row DataFrame.append recopies the frame each
# time and no longer exists in pandas 2.x.
df = pd.DataFrame(rows, columns=['fileName', 'fileLocation', 'pageNumber'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REM linux?
REM PDF-Pages.cmd
@echo off
REM Usage: PDF-Pages.cmd [root-folder]
REM Appends one line per PDF found under the folder given as %1 to
REM output.txt: the quoted path, the page count and the file size,
REM separated by commas.  Needs pdfinfo.exe and tr.exe on the PATH.

REM Start from a clean report.  The guard avoids the "Could not find"
REM message that a bare del prints when the file does not exist yet.
if exist output.txt del output.txt

REM Per PDF: write the pdfinfo report to out.txt, then append the quoted
REM path, the digits of the "Pages:" line, a comma, the digits of the
REM "File size:" line and a line break.  tr deletes CR/LF, '-' (\055),
REM '.' (\056), ':' (\072), letters and whitespace so that only the digits
REM of the lines printed by find remain.
for /r %1 %%f in (*.pdf) do pdfinfo.exe -meta "%%f" >out.txt & echo "%%f", | tr.exe -d "\r\n" >>output.txt & find "Pages:" out.txt | tr.exe -d "\r\n\055\056\072[:alpha:][:space:]" >>output.txt & echo , | tr.exe -d "\r\n" >>output.txt & find "File size:" out.txt | tr.exe -d "\055\056\072[:space:][:alpha:]" >>output.txt & echo. >>output.txt

REM out.txt only exists when at least one PDF was processed.
if exist out.txt del out.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dec. 12 page count: 648