Skip to content

Instantly share code, notes, and snippets.

@silverHugh
Created August 11, 2017 07:11
Show Gist options
  • Save silverHugh/3e5dad25a49ab7cd966caae65a9f9ef4 to your computer and use it in GitHub Desktop.
Save silverHugh/3e5dad25a49ab7cd966caae65a9f9ef4 to your computer and use it in GitHub Desktop.
Work with files and directories, write an xlsx catalog, and extract text from PDF files
import os
import re
from PyPDF2 import PdfFileReader
import textract
import csv
from openpyxl import Workbook
from openpyxl.writer.write_only import WriteOnlyCell
class Catloger(object):
    """Collect a catalog of PDF papers and write it out as CSV or XLSX.

    One record is stored per paper (see ``record``).  ``regular_name``
    scans a single directory, records every paper PDF in it and returns
    the number-based name computed for that directory.

    Attributes:
        paper_count: Number of papers recorded so far.
        sub_name: Base number; a directory name is ``sub_name`` plus the
            number of pages counted before that directory.
        page_count: Total number of pages counted so far.
        topic_name: Topic label stored with each record; set by the caller.
        cata_list: The records, one dict per paper, keyed by ``FIELDNAMES``.
    """

    # Column headers of the catalog (serial number, start page, topic,
    # content excerpt).  They are also the keys of every record dict.
    FIELDNAMES = ('序号', '起始页', 'Topic', '内容摘录')

    # Control characters that openpyxl refuses to store in a cell.
    _XLSX_ILLEGAL = re.compile(r'[\000-\010\013\014\016-\037]')

    # Class-level defaults; the first ``+=`` on an instance shadows them
    # with a per-instance value.
    paper_count = 0
    sub_name = 5100001
    page_count = 0
    topic_name = ''

    def __init__(self):
        # A list must be created per instance: a class-level list would be
        # shared (and appended to) by every Catloger object.
        self.cata_list = []

    def del_prefix(self, filename):
        """Rename ``filename`` so its base name loses a 7-digit ``NNNNNNN_`` tag.

        Only the base name is inspected and changed, so a directory whose
        name happens to contain the same pattern is left alone.  Nothing is
        renamed when the base name has no such tag.
        """
        print(filename)
        folder, base = os.path.split(filename)
        match = re.search(r'[0-9]{7}_', base)
        if match:
            new_name = os.path.join(folder, base.replace(match.group(0), ''))
            print(new_name)
            os.rename(filename, new_name)

    def clear_cache_file(self, dirname):
        """Delete the files in ``dirname`` whose names start with ``~`` or ``.``.

        Entries are removed by their full path, so the result does not
        depend on the current working directory.  Real directories are
        skipped because ``os.remove`` cannot delete them.
        """
        for item in os.listdir(dirname):
            if not item.startswith(('~', '.')):
                continue
            target = os.path.join(dirname, item)
            if os.path.isdir(target) and not os.path.islink(target):
                continue
            print(item)
            os.remove(target)

    def regular_name(self, dirname):
        """Record every paper PDF in ``dirname`` and return its new name.

        A paper is any entry ending in ``.pdf`` whose name does not contain
        ``copyright``.  Each paper is recorded with the page it starts on,
        and ``page_count`` then advances by that paper's page count.
        Entries are visited in sorted order so the numbering is repeatable.

        Returns:
            ``str(sub_name + page_count)`` using the page count from before
            this directory was scanned.
        """
        filename = os.path.basename(dirname)
        new_name = str(self.sub_name + self.page_count)
        for item in sorted(os.listdir(dirname)):
            if not item.endswith('.pdf') or 'copyright' in item:
                continue
            pdf_path = os.path.join(dirname, item)
            # Count pages while the file is open; the handle is closed as
            # soon as the count is known.
            with open(pdf_path, 'rb') as pdf_file:
                count = PdfFileReader(pdf_file).getNumPages()
            self.paper_count += 1
            print('[', str(self.paper_count), filename, ']', item, str(count))
            self.record(self.paper_count, self.page_count + 1,
                        self.topic_name, textract.process(pdf_path))
            self.page_count += count
        return new_name

    def record(self, num, page_start, topic, content):
        """Append one catalog row holding the first 200 characters of ``content``.

        ``content`` may be ``str`` or ``bytes``.  Bytes are decoded before
        slicing so that a multi-byte character is never cut in half.
        """
        if isinstance(content, bytes):
            # assumes the extractor emits UTF-8 -- TODO confirm; undecodable
            # bytes are replaced rather than raising.
            content = content.decode('utf-8', errors='replace')
        self.cata_list.append({
            '序号': num,
            '起始页': page_start,
            'Topic': topic,
            '内容摘录': content[0:200]
        })

    def save2csv(self, path):
        """Write all records to ``catalog.csv`` inside directory ``path``."""
        path = os.path.join(path, 'catalog.csv')
        # newline='' lets the csv module control line endings; an explicit
        # encoding keeps the non-ASCII headers independent of the locale.
        with open(path, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, self.FIELDNAMES)
            writer.writeheader()
            writer.writerows(self.cata_list)
        print('Save csv file to:', path)

    @staticmethod
    def _xlsx_safe(value):
        """Return ``value`` with the characters openpyxl rejects removed."""
        if isinstance(value, str):
            return Catloger._XLSX_ILLEGAL.sub('', value)
        return value

    def save2xlsx(self, path):
        """Write all records to ``catalog.xlsx`` inside directory ``path``.

        The excerpt column is written with text wrapping enabled.
        """
        # Imported here so the block does not depend on a new file-level import.
        from openpyxl.styles import Alignment

        path = os.path.join(path, 'catalog.xlsx')
        wb = Workbook(write_only=True)
        ws = wb.create_sheet()
        ws.append(self.FIELDNAMES)
        # Style objects are immutable; a new Alignment is assigned to the
        # cell instead of mutating the one it already has.
        wrap = Alignment(wrap_text=True)
        for cata in self.cata_list:
            print(cata['内容摘录'])
            content_cell = WriteOnlyCell(
                ws, value=self._xlsx_safe(cata['内容摘录']))
            content_cell.alignment = wrap
            ws.append([cata['序号'], cata['起始页'], cata['Topic'], content_cell])
        wb.save(path)
if __name__ == "__main__":
    # Walk <cwd>/<topic>/<sub-directory>, catalog the PDFs found in every
    # sub-directory, rename each sub-directory to the name returned by
    # regular_name(), then write catalog.xlsx into the starting directory.
    cat = Catloger()
    # Root, e.g. "ICIS2017 Proceedings"
    root_dir = os.getcwd()
    cat.clear_cache_file(root_dir)
    # os.listdir order is unspecified; sorting keeps the page numbering
    # repeatable between runs.
    for f_root in sorted(os.listdir(root_dir)):
        topic_dir = os.path.join(root_dir, f_root)
        if not os.path.isdir(topic_dir):
            continue
        # => Topic, e.g. "0 Keynote and Invited Presentations"
        # The working directory follows the directory being processed so
        # that any helper resolving bare file names still finds them.
        os.chdir(topic_dir)
        cat.clear_cache_file(topic_dir)
        cat.topic_name = f_root
        print(f_root)
        for f_topic in sorted(os.listdir(topic_dir)):
            sub_dir = os.path.join(topic_dir, f_topic)
            if not os.path.isdir(sub_dir):
                continue
            # => Sub, e.g. "5100001"
            os.chdir(sub_dir)
            # Clean the directory we are in, not its parent.
            cat.clear_cache_file(sub_dir)
            new_name = cat.regular_name(sub_dir)
            # Leave the directory before renaming it, and rename by full
            # path so the result does not depend on the working directory.
            os.chdir(topic_dir)
            os.rename(sub_dir, os.path.join(topic_dir, new_name))
    # Return to where the script started instead of staying in the last topic.
    os.chdir(root_dir)
    print('Total page:', cat.page_count)
    cat.save2xlsx(root_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment