Created
January 16, 2019 12:19
-
-
Save olivx/e1993d714962fac64b724b859de651d1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding=utf8 | |
from django.conf import settings | |
from django.utils import timezone | |
from django.core.files import File | |
from django.core.files.base import ContentFile | |
from django.core.management.base import BaseCommand, CommandError | |
from combo.models import CandidateCV | |
from django.contrib.auth.models import User | |
import os | |
import sys | |
import pytz | |
import PyPDF2 | |
import textract | |
from optparse import make_option | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
# Python 2 only: make UTF-8 the implicit str/unicode conversion codec so the
# accented messages and usernames handled below can be mixed without
# UnicodeDecodeError. On Python 3 neither the `reload` builtin nor
# `sys.setdefaultencoding` exists, and the default encoding is already
# UTF-8, so the hack is skipped there instead of failing at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - restores the setdefaultencoding attribute removed by site.py
    sys.setdefaultencoding('utf8')
class Command(BaseCommand):
    """Import PDF resumes from ``~/resume`` as ``CandidateCV`` records.

    Every file in the folder is expected to be named ``<username>.pdf``.
    For each matching ``User`` that has no ``CandidateCV`` yet, the text of
    the PDF is extracted, reduced to a set of keywords (Portuguese stop
    words and a few punctuation marks removed, non-ASCII characters
    dropped) and stored together with the file itself.

    Records are written with ``bulk_create`` in batches; the PDFs of a
    batch stay open until that batch has been written.
    NOTE(review): presumably the file contents are read while the rows are
    inserted, which is why the handles are kept open - confirm against the
    ``CandidateCV.resume`` field definition.
    """

    help = 'importar cv de uma pasta local para candidato cv'

    # Flush a batch once this many PDFs are open, to stay below the
    # 1024 open-file limit the original author was working around.
    MAX_OPEN_FILES = 1000
    # Tokens that are never kept as keywords.
    PUNCTUATIONS = frozenset(['(', ')', ';', ':', '[', ']', ','])
    # Width, in characters, of the textual progress bar.
    PROGRESS_WIDTH = 100

    def add_arguments(self, parser):
        """Register ``--init-default``; ``handle`` accepts but does not read it."""
        parser.add_argument('--init-default', nargs='+', type=str)

    def handle(self, *args, **options):
        """Create a ``CandidateCV`` for every user that has a PDF but no CV yet.

        Users whose PDF cannot be opened are reported on stderr and skipped
        instead of aborting the whole import. Any other error propagates,
        but the PDFs that are still open are closed first.
        """
        started_at = timezone.now()
        self.stdout.write(
            'começando gerar os arquivos as %s ' % started_at.strftime('%d/%m/%Y %H:%M:%S'))
        self.stdout.write('coletando os arquivos e removendo extenção')

        # expanduser() also works when $HOME is not set in the environment.
        resume_dir = os.path.join(os.path.expanduser('~'), 'resume')
        # splitext() strips the real extension, whatever its length.
        usernames = [os.path.splitext(name)[0] for name in os.listdir(resume_dir)]

        users = User.objects.filter(username__in=usernames)
        users_with_cv = CandidateCV.objects.filter(
            user__username__in=usernames).values_list('user__username', flat=True)
        self.stdout.write('usuarios encontrado no db %s' % users.count())
        self.stdout.write('Numero de candidatos que já contem CV:  %s' % users_with_cv.count())

        users_without_cv = users.exclude(username__in=users_with_cv)
        total_items = users_without_cv.count()
        self.stdout.write('candidato CV que serão criados %s' % total_items)

        self.stdout.write('nltk obtendo recursos adcionais')
        nltk.download('stopwords')
        nltk.download('punkt')
        # A set gives O(1) membership tests for every token of every resume.
        stop_words = frozenset(stopwords.words('portuguese'))

        pending_cvs = []
        open_files = []
        try:
            for current, user in enumerate(users_without_cv, 1):
                self._show_progress(current, total_items)

                if len(open_files) >= self.MAX_OPEN_FILES:
                    self._flush(pending_cvs, open_files)

                pdf_path = os.path.join(resume_dir, user.username.lower() + '.pdf')
                try:
                    pdf_file = open(pdf_path, 'rb')
                except IOError as exc:
                    # e.g. the folder entry was not a .pdf or differs in case.
                    self.stderr.write('\nskipping %s: cannot open %s (%s)' % (
                        user.username, pdf_path, exc))
                    continue
                open_files.append(pdf_file)

                now = timezone.now()
                pending_cvs.append(CandidateCV(
                    user=user,
                    resume=File(pdf_file),
                    resume_text=self._extract_keywords(pdf_file, stop_words),
                    date=now,
                    last_update_date=now,
                    imported_by_recruiter=False,
                ))
                self.stdout.flush()

            if total_items:
                # Terminate the '\r'-driven progress line before logging.
                self.stdout.write('\n')
                self.stdout.write('bulk create CandidateCV')
            # Always write the remainder, whatever its size.
            self._flush(pending_cvs, open_files)
        finally:
            # Reached with open handles only when an error interrupted the loop.
            for leftover in open_files:
                leftover.close()

        finished_at = timezone.now()
        self.stdout.write(
            'CandidateCV gerado as %s ' % finished_at.strftime('%d/%m/%Y %H:%M:%S'))
        self.stdout.write('terminado as %s ' % str(finished_at - started_at))

    def _show_progress(self, current, total):
        """Redraw the single-line progress bar for item ``current`` of ``total``."""
        filled = self.PROGRESS_WIDTH * current // total
        bar = '*' * filled + '-' * (self.PROGRESS_WIDTH - filled)
        percent = '{0:.1f}'.format(100 * (current / float(total)))
        self.stdout.write(
            '\r%s |%s| %s%% %s' % ('process', bar, percent, 'complete'), ending='')

    def _extract_keywords(self, pdf_file, stop_words):
        """Return the space-joined unique keywords found in the open PDF.

        The text of all pages is lower-cased and tokenized as Portuguese;
        stop words and ``PUNCTUATIONS`` are discarded and non-ASCII
        characters are stripped from the remaining tokens. Keyword order
        follows set iteration order and is therefore not stable.
        """
        reader = PyPDF2.PdfFileReader(pdf_file)
        text = ''.join(
            reader.getPage(page_number).extractText()
            for page_number in range(reader.numPages))
        tokens = set(word_tokenize(text.lower(), language='portuguese'))
        return ' '.join(
            token.encode('ascii', 'ignore').decode('ascii')
            for token in tokens
            if token not in stop_words and token not in self.PUNCTUATIONS)

    def _flush(self, pending_cvs, open_files):
        """Write the pending records, then close their PDFs and empty both lists."""
        if pending_cvs:
            CandidateCV.objects.bulk_create(pending_cvs)
        for handle in open_files:
            handle.close()
        del pending_cvs[:]
        del open_files[:]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment