PrintQuota
#############################
# This should work on your PC.
# You still have to manually
# input your quota.
# N.B. ECF runs Python 2.6;
# I'm not going to support that.
# Just run this on your PC:
# run 'pq' on ECF, then copy your
# quota into the script as num_pg.
############################
from glob import glob as __g
from PyPDF2 import PdfFileReader
import re
import pandas as pd
import os
# Given the number of pages I want to print (duplex), how much ECF quota is used?
def dupQuota(num_pages):
    # duplex: (1 + 0.5)/2 = 0.75 quota units per page, per the formula below
    quota = (num_pages + 0.5 * num_pages) / 2
    return quota

# Given my current quota, how many PDF pages can I print (duplex)?
def quotaToPage(num_quota):
    pages = num_quota * 2 / 1.5
    return pages
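# Illustrative sanity check of the two helpers above (numbers are examples,
# not official ECF rates):
#   dupQuota(40)     -> (40 + 20) / 2   = 30 quota units for a 40-page paper
#   quotaToPage(100) -> 100 * 2 / 1.5  ~= 133 duplex-printable pages
assert dupQuota(40) == 30.0
assert abs(quotaToPage(100) - 133.333) < 0.001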
def count_os(path):
    """
    Walk `path` and return a DataFrame with the name, location,
    and page count (via PyPDF2) of every PDF found.
    """
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(path):  #(r'/Documents/extra_learning/ug_theses')
        for f in files:
            if f.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, f), 'rb'))
                df2 = pd.DataFrame([[f, os.path.join(root, f), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
def knapsack(items, maxweight):
    """
    Solve the knapsack problem: find the most valuable subsequence
    of `items` whose total weight is no more than `maxweight`.

    Here `items` is a sequence of pairs `(value, (file_name, page_count))`,
    where `value` is a number (how much I care about the paper) and
    `page_count` is a non-negative integer acting as the weight.
    `maxweight` is a non-negative number (pages I can afford to print).

    Return a pair: the sum of values in the best subsequence, and the
    subsequence itself.

    In the classic plain (value, weight) form this behaves like:
        items = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
        knapsack(items, 15)  ->  (11, [(2, 1), (6, 4), (1, 1), (2, 2)])

    For example, my ECE568 papers are 100 pages but only 50% importance,
    vs. my CSC384 paper at 20 pages and 40% importance.
    """
    # Return the value of the most valuable subsequence of the first i
    # elements in items whose weights sum to no more than j.
    #@memoized
    def bestvalue(i, j):
        if i == 0:
            return 0
        value, file_page_tup = items[i - 1]
        file_name, weight = file_page_tup
        if weight > j:
            return bestvalue(i - 1, j)
        else:
            return max(bestvalue(i - 1, j),
                       bestvalue(i - 1, j - weight) + value)

    j = maxweight
    result = []
    for i in xrange(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            f, w = items[i - 1][1]
            j -= w
    result.reverse()
    return bestvalue(len(items), maxweight), result
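# The recursive bestvalue above is exponential without caching; the gist hints
# at a @memoized decorator but never defines one. A minimal dict-based sketch
# (illustrative, not the author's original): to use it, move this definition
# above knapsack() and restore the commented @memoized line on bestvalue.
def memoized(fn):
    cache = {}
    def wrapper(*args):
        if args not in cache:
            cache[args] = fn(*args)
        return cache[args]
    return wrapper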
#ms = count_pg(".")
path = os.getcwd()
pd_page = count_os(path)

# Build knapsack items as (value, (fileName, pageNumber));
# for now every paper's value is just its page count.
list_page = []
for i, j in zip(pd_page['fileName'], pd_page['pageNumber']):
    tup_pg = (i, j)
    list_page.append((j, tup_pg))
#print(list_page)

item = list_page
num_pg = 100  # quota reported by 'pq' on ECF; set manually
quota = quotaToPage(num_pg)
print quota
print knapsack(item, quota)
#print b1, b2
#print b2[0], b2[1]
################################
# FULLY FUNCTIONAL
# Returns the true page count for every PDF;
# no problems so far.
# TODO: pandas DataFrame to list of tuples?
# (see the sketch after this header)
################################
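# Hedged sketch for the TODO above (not run here; column names as used in
# count_os below): one way to turn the DataFrame into
# (value, (fileName, pageNumber)) tuples for the knapsack code in the
# first script.
#
# df = count_os(os.getcwd())
# items = [(pg, (name, pg))
#          for name, pg in zip(df['fileName'], df['pageNumber'])]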
from glob import glob as __g
from PyPDF2 import PdfFileReader
import re
import pandas as pd
import os
def count_os(path):
    """
    alternative method
    """
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(path):  #(r'/Documents/extra_learning/ug_theses')
        for f in files:
            if f.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, f), 'rb'))
                df2 = pd.DataFrame([[f, os.path.join(root, f), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
#ms = count_pg(".")
path = os.getcwd()
list_page = count_os(path)
print list_page['pageNumber']
#for name, pages in ms.items():
#    print name, pages
#    if pages == 0:
#        print "notice ", name
#print ms
"""
This module contains a function to count
the total pages for all PDF files in one directory.
"""
#from time import clock as __c  # Used for benchmarking.
from glob import glob as __g
from re import search as __s

def count(vPath):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.
    """
    #cdef double ti = __c()  # Used for benchmarking.
    vPDFfiles = __g(vPath + "\\" + '*.pdf')
    vMsg = {}
    for vPDFfile in vPDFfiles:
        vPages = 0  # reset per file so a stale count is never carried over
        vFileOpen = open(vPDFfile, 'rb', 1)
        for vLine in vFileOpen.readlines():
            if "/Count " in vLine:
                vPages = int(__s(r"/Count \d*", vLine).group()[7:])
        vMsg[vPDFfile] = vPages
        vFileOpen.close()
    #cdef double tf = __c()  # Used for benchmarking.
    #print tf - ti
    return vMsg
#
##################################
# Verified that this works on macOS.
#
# ISSUE: every PDF generated by LaTeX gets counted as 0 pages!
# (see the fallback sketch after count_pg below)
##################################
from glob import glob as __g
import re
pattern = re.compile(r"/Count\s+(\d+)")
def count_pg(vPath):
    """
    Takes one argument: the path where you want to search the files.
    Returns a dictionary with the file name and number of pages for each file.
    """
    vPDFfiles = __g(vPath + "/" + '*.pdf')
    print vPDFfiles
    vMsg = {}
    for vPDFfile in vPDFfiles:
        vPages = 0
        content = open(vPDFfile, 'rb', 1).read()
        for match in pattern.finditer(content):
            vPages = int(match.group(1))
        vMsg[vPDFfile] = vPages
    return vMsg
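# Hedged fallback for the LaTeX issue noted above: pdflatex usually emits
# PDF >= 1.5 with the page tree stored inside compressed object streams, so
# "/Count N" never appears as plain text and the regex scan returns 0.
# Sketch of a fallback that re-counts those files with PyPDF2 (as in the
# earlier scripts); count_pg_fallback is a name made up for illustration.
from PyPDF2 import PdfFileReader

def count_pg_fallback(vPath):
    vMsg = count_pg(vPath)
    for vPDFfile, vPages in vMsg.items():
        if vPages == 0:
            vMsg[vPDFfile] = PdfFileReader(open(vPDFfile, 'rb')).getNumPages()
    return vMsg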
ms = count_pg(".")
print ms
import os
from os.path import join as pjoin, expanduser
import subprocess
from pprint import pprint
directory = expanduser("~/Documents/extra_learning/ug_theses")
for name in os.listdir(directory):
    if name.endswith(".pdf"):
        p = pjoin(directory, name)
        child = subprocess.Popen("pdftk %s dump_data output" % p, shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        sout, serr = child.communicate()
        if serr:
            print p
            print serr
        else:
            # Parse pdftk's "Key: value" lines into a dict, skipping
            # marker lines such as "InfoBegin" that have no colon.
            D = dict((t[0].strip(), t[1].strip()) for t in
                     (line.split(":", 1) for line in sout.split("\n") if ":" in line))
            pprint(D)
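            # The only field needed for the quota math is the page count;
            # pdftk reports it as "NumberOfPages" (hedged sketch, not in the
            # original gist):
            pages = int(D.get("NumberOfPages", 0))
            print name, pages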
########################################################
# As a cheap and poor-af student at UofT, I have a
# limited print quota and can't possibly print all the
# material. Given a directory of PDFs I want to print,
# and provided that I print duplex, what is the maximum
# number of papers I can print? And if the papers are
# not all given equal weight, how many can I print?
# Modeled as a knapsack problem.
########################################################
import pandas as pd
import os
from PyPDF2 import PdfFileReader
import commands
#cmd = "pq"
#output = commands.getoutput(cmd) #utorid -> PAGES PRINTED UNIX: 280 WINDOWS: 0 TOTAL: 280 LIMIT: 780
#try to extract the last number
#print type(output)
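# Hedged sketch for that TODO (not the author's original): pull TOTAL and
# LIMIT out of the 'pq' output with a regex and compute the remaining quota.
# The output format is taken from the comment above and may differ on ECF.
#
# import re
# def parse_pq(output):
#     total = int(re.search(r"TOTAL:\s*(\d+)", output).group(1))
#     limit = int(re.search(r"LIMIT:\s*(\d+)", output).group(1))
#     return limit - total  # quota still available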
# Given the number of pages I want to print (duplex), how much ECF quota is used?
def dupQuota(num_pages):
    quota = (num_pages + 0.5 * num_pages) / 2
    return quota

# Given my current quota, how many PDF pages can I print (duplex)?
def quotaToPage(num_quota):
    pages = num_quota * 2 / 1.5
    return pages
def knapsack(items, maxweight):
    """
    Solve the knapsack problem: find the most valuable subsequence
    of `items` whose total weight is no more than `maxweight`.

    `items` is a sequence of pairs `(value, weight)`, where `value` is
    a number and `weight` is a non-negative integer.
    `maxweight` is a non-negative integer.

    Return a pair whose first element is the sum of values in the most
    valuable subsequence, and whose second element is the subsequence.

    >>> items = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
    >>> knapsack(items, 15)
    (11, [(2, 1), (6, 4), (1, 1), (2, 2)])

    Here value means importance and weight means page count;
    for example, my ECE568 papers are 100 pages at 50% importance,
    vs. my CSC384 paper at 20 pages and 40%.
    """
    # Return the value of the most valuable subsequence of the first i
    # elements in items whose weights sum to no more than j.
    #@memoized
    def bestvalue(i, j):
        if i == 0:
            return 0
        value, weight = items[i - 1]
        if weight > j:
            return bestvalue(i - 1, j)
        else:
            return max(bestvalue(i - 1, j),
                       bestvalue(i - 1, j - weight) + value)

    j = maxweight
    result = []
    for i in xrange(len(items), 0, -1):
        if bestvalue(i, j) != bestvalue(i - 1, j):
            result.append(items[i - 1])
            j -= items[i - 1][1]
    result.reverse()
    return bestvalue(len(items), maxweight), result
item = [(4, 120), (2, 10), (6, 40), (1, 10), (2, 20)]
maxw = 10
#item = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
#knapsack(item, 100)
#TODO: knapsack problem?
def cheap_af(dict_paper_page, quota_left):
    '''
    dict_paper_page = {'file name': number of pages}
    i.e. `(value, weight)` pairs, where `value` is a number and
    `weight` is a non-negative integer, e.g.
    dict_paper_page = [(4, 12), (2, 1), (6, 4), (1, 1), (2, 2)]
    '''
    man_arr = [65, 15, 26]
    #TODO: need to check
    # Extract the number of pages from a directory of papers I want to print.
    df = pd.DataFrame(columns=['fileName', 'fileLocation', 'pageNumber'])
    for root, dirs, files in os.walk(r'Directory path'):  # placeholder path
        for file in files:
            if file.endswith(".pdf"):
                pdf = PdfFileReader(open(os.path.join(root, file), 'rb'))
                df2 = pd.DataFrame([[file, os.path.join(root, file), pdf.getNumPages()]],
                                   columns=['fileName', 'fileLocation', 'pageNumber'])
                df = df.append(df2, ignore_index=True)
    return df
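# Hedged sketch of wiring this together (names and values illustrative, not
# the author's final code): build knapsack items from the DataFrame that
# cheap_af returns, using page count as both value and weight like the
# first script does.
#
# quota_left = quotaToPage(100)
# df = cheap_af({}, quota_left)
# items = [(pg, pg) for pg in df['pageNumber']]
# print knapsack(items, quota_left)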
# Linux equivalent? (see the Python/pdfinfo sketch after the batch script below)
REM PDF-Pages.cmd
@echo off
del output.txt
for /r %1 %%f in (*.pdf) do pdfinfo.exe -meta "%%f" >out.txt & echo "%%f", | tr.exe -d "\r\n" >>output.txt & find "Pages:" out.txt | tr.exe -d "\r\n\055\056\072[:alpha:][:space:]" >>output.txt & echo , | tr.exe -d "\r\n" >>output.txt & find "File size:" out.txt | tr.exe -d "\055\056\072[:space:][:alpha:]" >>output.txt & echo. >>output.txt
del out.txt
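# Hedged answer to the "linux?" question above: pdfinfo (from poppler-utils)
# also runs on Linux/macOS, so the batch script's idea can be redone in Python
# with subprocess. A minimal sketch, not the author's original; the function
# name is made up for illustration.
import os
import re
import subprocess

def count_with_pdfinfo(path):
    counts = {}
    for name in os.listdir(path):
        if name.endswith(".pdf"):
            out = subprocess.check_output(["pdfinfo", os.path.join(path, name)])
            m = re.search(r"Pages:\s+(\d+)", out)
            if m:
                counts[name] = int(m.group(1))
    return counts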
Dec. 12 page count: 648
