olivx · January 16, 2019 12:19
diff --git a/create_candidate_cv_essence_it.py b/create_candidate_cv_essence_it.py
 # encoding=utf8
 from django.conf import settings
 from django.utils import timezone
 from django.core.files import File
 from django.core.files.base import ContentFile
 from django.core.management.base import BaseCommand, CommandError

 from combo.models import CandidateCV
 from django.contrib.auth.models import User

 import os
 import sys
 import pytz
 import PyPDF2
 import textract
 from optparse import make_option
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize

 # Re-import ``sys`` and then switch the interpreter-wide default string
 # encoding to UTF-8. The reload must come first: the call below is made on the
 # freshly reloaded module.
 # NOTE(review): the ``reload`` builtin and ``sys.setdefaultencoding`` look
 # Python 2 specific -- confirm the target interpreter before porting.
 reload(sys)
 sys.setdefaultencoding('utf8')


 class Command(BaseCommand):
    help = 'importar cv de uma pasta local para candidato cv'

    def add_arguments(self, parser):
        parser.add_argument('--init-default', nargs='+', type=str)

    def handle(self, *args, **options):
        print
        time_start = timezone.now()
        print 'começando gerar os arquivos as %s ' % time_start.strftime('%d/%m/%Y %H:%M:%S')
        print 'coletando os arquivos e removendo extenção'
        print

        path_home = os.environ['HOME']
        # _path = os.path.join(path_home, 'convert_resumes')
        _path = os.path.join(path_home, 'resume')

        list_files = map(lambda x: x[:-4], os.listdir(_path))
        users = User.objects.filter(username__in=list_files)
        candidate_cv = CandidateCV.objects.filter(user__username__in=list_files).values_list('user__username')

        print 'usuarios encontrado no db', users.count()
        print 'Numero de candidatos que já contem CV: ', len(candidate_cv)
        print

        candidate_cv_to_create = users.exclude(username__in=candidate_cv)
        print 'candidato CV que serão criados', candidate_cv_to_create.count()
        print

        print 'nltk obtendo recursos adcionais'
        nltk.download('stopwords')
        nltk.download('punkt')


        list_candidate_cv = list()
        punctuations = ['(',')',';',':','[',']',',']
        stop_words = stopwords.words('portuguese')
        total_items = candidate_cv_to_create.count()

        print
        list_file_open = []
        for index, user in enumerate(candidate_cv_to_create):

            # efeito loading
            current = index + 1
            filled_length = int(100 * current // total_items)
            bar = '*' * filled_length + '-' * (100 - filled_length)
            percent = 100 * (current / total_items)
            percent = ("{0:." + '1' + "f}").format(100 * (current / float(total_items)))

            sys.stdout.write("\r%s |%s| %s%% %s" % ('process', bar, percent, 'complete'))

            file_name = user.username.lower() + '.pdf'
            absolute_file_name = os.path.join(_path, file_name)

            count = 0
            text_from_pdf = ""

            # o limite de para abrir arquivos é de 1024
            if len(list_file_open) > 1000:
                CandidateCV.objects.bulk_create(list_candidate_cv)

                for _file in list_file_open:
                    _file.close()

                list_candidate_cv = []
                list_file_open = []



            pdf_file =  open(absolute_file_name, 'rb')
            list_file_open.append(pdf_file)
            django_file = File(pdf_file)

            pdfReader = PyPDF2.PdfFileReader(pdf_file )
            num_pages = pdfReader.numPages

            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count +=1
                text_from_pdf += pageObj.extractText()

            # text_from_pdf = textract.process(absolute_file_name)
            tokens = set(word_tokenize(text_from_pdf.lower(), language='portuguese'))
            token_keywords = [word.encode('ascii', 'ignore').decode('ascii') for word in tokens if not word in stop_words and not word in punctuations]

            # print keywords
            keywords = ' '.join(token_keywords)

            candidate = CandidateCV(
                            user = user,
                            resume= django_file,
                            resume_text = keywords,
                            date = timezone.now(),
                            last_update_date = timezone.now(),
                            imported_by_recruiter = False
                        )
            list_candidate_cv.append(candidate)

            # reopen.close()
            sys.stdout.flush()
            if current == total_items:
                print


        print
        print 'bulk create CandidateCV'
        print

        if len(list_file_open) < 1000:
            CandidateCV.objects.bulk_create(list_candidate_cv)

        time_stop = timezone.now()
        total_time = time_stop - time_start
        print 'CandidateCV gerado as %s ' % time_stop.strftime('%d/%m/%Y %H:%M:%S')
        print 'terminado as %s ' % str(total_time)
	# encoding=utf8
	from django.conf import settings
	from django.utils import timezone
	from django.core.files import File
	from django.core.files.base import ContentFile
	from django.core.management.base import BaseCommand, CommandError

	from combo.models import CandidateCV
	from django.contrib.auth.models import User

	import os
	import sys
	import pytz
	import PyPDF2
	import textract
	from optparse import make_option
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	# Re-import ``sys`` and then switch the interpreter-wide default string
	# encoding to UTF-8. The reload must come first: the call below is made on the
	# freshly reloaded module.
	# NOTE(review): the ``reload`` builtin and ``sys.setdefaultencoding`` look
	# Python 2 specific -- confirm the target interpreter before porting.
	reload(sys)
	sys.setdefaultencoding('utf8')


	class Command(BaseCommand):
	help = 'importar cv de uma pasta local para candidato cv'

	def add_arguments(self, parser):
	parser.add_argument('--init-default', nargs='+', type=str)

	def handle(self, args, *options):
	print
	time_start = timezone.now()
	print 'começando gerar os arquivos as %s ' % time_start.strftime('%d/%m/%Y %H:%M:%S')
	print 'coletando os arquivos e removendo extenção'
	print

	path_home = os.environ['HOME']
	# _path = os.path.join(path_home, 'convert_resumes')
	_path = os.path.join(path_home, 'resume')

	list_files = map(lambda x: x[:-4], os.listdir(_path))
	users = User.objects.filter(username__in=list_files)
	candidate_cv = CandidateCV.objects.filter(user__username__in=list_files).values_list('user__username')

	print 'usuarios encontrado no db', users.count()
	print 'Numero de candidatos que já contem CV: ', len(candidate_cv)
	print

	candidate_cv_to_create = users.exclude(username__in=candidate_cv)
	print 'candidato CV que serão criados', candidate_cv_to_create.count()
	print

	print 'nltk obtendo recursos adcionais'
	nltk.download('stopwords')
	nltk.download('punkt')


	list_candidate_cv = list()
	punctuations = ['(',')',';',':','[',']',',']
	stop_words = stopwords.words('portuguese')
	total_items = candidate_cv_to_create.count()

	print
	list_file_open = []
	for index, user in enumerate(candidate_cv_to_create):

	# efeito loading
	current = index + 1
	filled_length = int(100 * current // total_items)
	bar = '' filled_length + '-' * (100 - filled_length)
	percent = 100 * (current / total_items)
	percent = ("{0:." + '1' + "f}").format(100 * (current / float(total_items)))

	sys.stdout.write("\r%s \|%s\| %s%% %s" % ('process', bar, percent, 'complete'))

	file_name = user.username.lower() + '.pdf'
	absolute_file_name = os.path.join(_path, file_name)

	count = 0
	text_from_pdf = ""

	# o limite de para abrir arquivos é de 1024
	if len(list_file_open) > 1000:
	CandidateCV.objects.bulk_create(list_candidate_cv)

	for _file in list_file_open:
	_file.close()

	list_candidate_cv = []
	list_file_open = []



	pdf_file = open(absolute_file_name, 'rb')
	list_file_open.append(pdf_file)
	django_file = File(pdf_file)

	pdfReader = PyPDF2.PdfFileReader(pdf_file )
	num_pages = pdfReader.numPages

	while count < num_pages:
	pageObj = pdfReader.getPage(count)
	count +=1
	text_from_pdf += pageObj.extractText()

	# text_from_pdf = textract.process(absolute_file_name)
	tokens = set(word_tokenize(text_from_pdf.lower(), language='portuguese'))
	token_keywords = [word.encode('ascii', 'ignore').decode('ascii') for word in tokens if not word in stop_words and not word in punctuations]

	# print keywords
	keywords = ' '.join(token_keywords)

	candidate = CandidateCV(
	user = user,
	resume= django_file,
	resume_text = keywords,
	date = timezone.now(),
	last_update_date = timezone.now(),
	imported_by_recruiter = False
	)
	list_candidate_cv.append(candidate)

	# reopen.close()
	sys.stdout.flush()
	if current == total_items:
	print


	print
	print 'bulk create CandidateCV'
	print

	if len(list_file_open) < 1000:
	CandidateCV.objects.bulk_create(list_candidate_cv)

	time_stop = timezone.now()
	total_time = time_stop - time_start
	print 'CandidateCV gerado as %s ' % time_stop.strftime('%d/%m/%Y %H:%M:%S')
	print 'terminado as %s ' % str(total_time)