# Gist by @olivx (olivx/e1993d714962fac64b724b859de651d1), created January 16, 2019 12:19.
# encoding=utf8
from django.conf import settings
from django.utils import timezone
from django.core.files import File
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand, CommandError
from combo.models import CandidateCV
from django.contrib.auth.models import User
import os
import sys
import pytz
import PyPDF2
import textract
from optparse import make_option
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Python 2 only: make UTF-8 the implicit str<->unicode codec for this process.
# ``reload(sys)`` is needed because ``sys.setdefaultencoding`` is removed from
# the module at interpreter start-up. On Python 3 ``reload`` is not a builtin
# and the default encoding is already UTF-8, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')
class Command(BaseCommand):
    """Import PDF resumes from ``$HOME/resume`` into ``CandidateCV`` rows.

    Every ``<username>.pdf`` in the folder is matched against ``User.username``.
    For each matched user that has no ``CandidateCV`` yet, the PDF text is
    extracted, reduced to a space-separated bag of keywords (Portuguese stop
    words and basic punctuation removed, non-ASCII characters dropped) and
    stored together with the file itself. Rows are inserted with
    ``bulk_create`` in batches.
    """

    help = 'importar cv de uma pasta local para candidato cv'

    # Maximum number of PDFs held open (and rows buffered) before a flush.
    # The original author notes an open-file limit of 1024, so the batch is
    # kept safely below it.
    BATCH_SIZE = 1000

    # Tokens discarded in addition to the stop words.
    PUNCTUATIONS = frozenset(['(', ')', ';', ':', '[', ']', ','])

    def add_arguments(self, parser):
        """Register the ``--init-default`` option.

        The option is accepted for backward compatibility; ``handle`` does
        not read it.
        """
        parser.add_argument('--init-default', nargs='+', type=str)

    def handle(self, *args, **options):
        """Run the import and print progress/summary information to stdout.

        Raises:
            KeyError: if the ``HOME`` environment variable is not set.
            OSError/IOError: if the resume folder or a PDF cannot be read.
        """
        print('')
        time_start = timezone.now()
        print('começando gerar os arquivos as %s ' % time_start.strftime('%d/%m/%Y %H:%M:%S'))
        print('coletando os arquivos e removendo extenção')
        print('')

        resume_dir = os.path.join(os.environ['HOME'], 'resume')
        pdf_by_stem = self._pdf_files_by_stem(resume_dir)
        usernames = list(pdf_by_stem)

        users = User.objects.filter(username__in=usernames)
        usernames_with_cv = CandidateCV.objects.filter(
            user__username__in=usernames
        ).values_list('user__username', flat=True)
        print('usuarios encontrado no db %s' % users.count())
        print('Numero de candidatos que já contem CV:  %s' % len(usernames_with_cv))
        print('')

        candidate_cv_to_create = users.exclude(username__in=usernames_with_cv)
        total_items = candidate_cv_to_create.count()
        print('candidato CV que serão criados %s' % total_items)
        print('')

        print('nltk obtendo recursos adcionais')
        nltk.download('stopwords')
        nltk.download('punkt')
        # A set gives O(1) membership tests inside the per-token filter.
        stop_words = frozenset(stopwords.words('portuguese'))
        print('')

        pending_cvs = []
        open_files = []
        try:
            for index, user in enumerate(candidate_cv_to_create):
                current = index + 1
                self._write_progress(current, total_items)

                # Prefer the real on-disk name; fall back to the historical
                # "<lower-cased username>.pdf" convention when the database
                # matched the username with a different letter case.
                file_name = pdf_by_stem.get(
                    user.username, user.username.lower() + '.pdf')
                pdf_file = open(os.path.join(resume_dir, file_name), 'rb')
                # The handle stays open until the batch is flushed, as in the
                # original. NOTE(review): presumably the FileField reads the
                # file content during bulk_create - confirm.
                open_files.append(pdf_file)

                pending_cvs.append(CandidateCV(
                    user=user,
                    resume=File(pdf_file),
                    resume_text=self._extract_keywords(pdf_file, stop_words),
                    date=timezone.now(),
                    last_update_date=timezone.now(),
                    imported_by_recruiter=False,
                ))

                if len(pending_cvs) >= self.BATCH_SIZE:
                    self._flush_batch(pending_cvs, open_files)

            if total_items:
                print('')
                print('')
                print('bulk create CandidateCV')
                print('')
            # Always flush what is left, whatever its size: the previous
            # "< 1000" guard dropped a final batch of exactly 1000/1001 rows.
            self._flush_batch(pending_cvs, open_files)
        finally:
            # Only reached with open handles when an error interrupted the
            # loop; a successful flush has already emptied the list.
            for leftover in open_files:
                leftover.close()

        time_stop = timezone.now()
        print('CandidateCV gerado as %s ' % time_stop.strftime('%d/%m/%Y %H:%M:%S'))
        print('terminado as %s ' % str(time_stop - time_start))

    @staticmethod
    def _pdf_files_by_stem(directory):
        """Return ``{filename without extension: filename}`` for ``*.pdf`` entries."""
        pdf_by_stem = {}
        for entry in os.listdir(directory):
            stem, extension = os.path.splitext(entry)
            if extension.lower() == '.pdf':
                pdf_by_stem[stem] = entry
        return pdf_by_stem

    @staticmethod
    def _write_progress(current, total_items):
        """Redraw the one-line, 100-character progress bar on stdout."""
        # Guard against a zero/stale total so the bar never divides by zero
        # or exceeds 100%.
        total = max(total_items, current)
        filled_length = int(100 * current // total)
        bar = '*' * filled_length + '-' * (100 - filled_length)
        percent = '{0:.1f}'.format(100 * (current / float(total)))
        sys.stdout.write("\r%s |%s| %s%% %s" % ('process', bar, percent, 'complete'))
        sys.stdout.flush()

    def _extract_keywords(self, pdf_file, stop_words):
        """Return the space-joined unique keywords found in an open PDF file.

        Text of all pages is lower-cased and tokenized as Portuguese; stop
        words and ``PUNCTUATIONS`` are dropped and the remaining tokens are
        stripped of non-ASCII characters.
        """
        reader = PyPDF2.PdfFileReader(pdf_file)
        text = ''.join(
            reader.getPage(page_number).extractText()
            for page_number in range(reader.numPages)
        )
        tokens = set(word_tokenize(text.lower(), language='portuguese'))
        keywords = (
            word.encode('ascii', 'ignore').decode('ascii')
            for word in tokens
            if word not in stop_words and word not in self.PUNCTUATIONS
        )
        return ' '.join(keywords)

    @staticmethod
    def _flush_batch(pending_cvs, open_files):
        """Insert the buffered rows, then close their files and empty both lists."""
        try:
            if pending_cvs:
                CandidateCV.objects.bulk_create(pending_cvs)
        finally:
            for pdf_file in open_files:
                pdf_file.close()
            # Cleared in place so the caller's references see the empty lists.
            del pending_cvs[:]
            del open_files[:]
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment