Skip to content

Instantly share code, notes, and snippets.

@olivx
Created December 22, 2018 05:58
Show Gist options
  • Save olivx/9265c6d8a359e0bb76dc30ff1f57141a to your computer and use it in GitHub Desktop.
Save olivx/9265c6d8a359e0bb76dc30ff1f57141a to your computer and use it in GitHub Desktop.
company = Company.objects.get(pk=116091)
def create_users(df):
# normalize os dados
print 'normalize o username '
df.replace('', np.nan, inplace=True)
df = df.replace(np.nan, '', regex=True)
df['username'] = df['username'].apply(limit_30_char)
df['first_name'] = df['first_name'].apply(limit_30_char)
df['last_name'] = df['last_name'].apply(limit_30_char)
df['username'] = df['username'].str.strip()
df['username'] = df['username'].str.lower()
df_user_to_create = df.drop_duplicates('username').sort_values(by=['username'])
print 'recuperando usarios cadastrados'
data_all_user = list(User.objects.filter(
username__in=list(df_user_to_create['username'].values.tolist())
).values_list('username', flat=True)
)
columns = list(df.columns)[1:]
users = []
list_usernames = []
print 'montando data user'
for usernames in data_all_user:
list_usernames.append(usernames.strip().lower())
print 'gerando userprofile apartir do dataframe'
for row in df_user_to_create.itertuples():
user = {}
username = getattr(row, 'username')
for col in columns:
if username not in list_usernames:
user[col] = getattr(row, col)
users.append(User(**user))
for index, user in enumerate(users):
if len(user.username) < 4:
users.pop(index)
return users
def create_objects(df, klass, key, company=None):
if company is not None:
commentor = User.objects.get(pk=company.pk)
# normalize os dados
print 'normalize %s ' % klass.__name__
df = df.replace(np.nan, '', regex=True)
df[key] = df[key].apply(limit_30_char)
df[key] = df[key].str.strip()
df[key] = df[key].str.lower()
df_to_create = df.drop_duplicates(key).sort_values(by=[key])
columns = list(df.columns)[1:]
# recuperando usarios cadastrados
data_all_user = list(User.objects.filter(
username__in=list(df_to_create[key].values.tolist())
)
)
if klass.__name__ == 'Comment':
data_all_elements = list(klass.objects.filter(
tocandidate__username__in=list(df_to_create[key].values.tolist())
).values_list('tocandidate__username', flat=True)
)
elif klass.__name__ == 'CompanyCandidate':
data_all_elements = list(klass.objects.filter(
candidate__username__in=list(df_to_create[key].values.tolist())
).values_list('candidate__username', flat=True)
)
else:
data_all_elements = list(klass.objects.filter(
user__username__in=list(df_to_create[key].values.tolist())
).values_list('user__username', flat=True)
)
print 'total em data_all_elements ', len(data_all_elements)
print 'total de paginas data_users', len(data_all_user)
print 'total em df_to_create', df_to_create.shape[0]
data_users = {}
print 'montando data user'
for _user in data_all_user:
data_users[_user.username] = _user
list_elements = []
print 'montato data user profile '
for element in data_all_elements:
list_elements.append(element.strip().lower())
_list_to_return = []
print 'gerando userprofile apartir do dataframe'
for row in df_to_create.itertuples():
_object = {}
for col in columns:
username = getattr(row, key)
if username not in list_elements:
if col == 'user':
user = data_users.get(username, None)
if user is not None:
_object['user_id'] = user.pk
else:
print 'User %s não foi achado no dict, indo buscar no banco.' % username
_object['user_id'] = User.objects.get(username=username).pk
elif col == 'candidate':
user = data_users.get(username)
if user is not None:
_object['candidate_id'] = user.pk
else:
print 'User %s não foi achado no dict, indo buscar no banco.' % username
_object['candidate_id'] = User.objects.get(username=username).pk
elif col == 'tocandidate':
user = data_users.get(username)
if user is not None:
_object['tocandidate_id'] = user.pk
_object['commentator_id'] = commentor.pk
else:
print 'User %s não foi achado no dict, indo buscar no banco.' % username
_object['tocandidate_id'] = User.objects.get(username=username).pk
_object['commentator_id'] = commentor.pk
elif col == 'company':
_object[col] = company
elif col == 'birthday':
try:
_object[col] = datetime.strptime(item, 'yy-mm-dd')
except:
_object[col] = None
else:
_object[col] = getattr(row, col)
if bool(_object):
_list_to_return.append(klass(**_object))
return _list_to_return
def limit_30_char(x):
if len(str(x)) >= 30:
return x[:29]
return x
reload(sys)
sys.setdefaultencoding('utf8')
print 'Incializando importação...'
time_start = timeit.default_timer()
print 'start em %s' % datetime.now()
base_dir = settings.BASE_DIR
dict_klass = {}
dict_klass['user'] = {'klass': User, 'key': 'username'}
dict_klass['userprofile'] = {'klass': UserProfile, 'key': 'user'}
dict_klass['usercpf'] = {'klass': UserCPF, 'key': 'user'}
dict_klass['comments'] = {'klass': Comment, 'key': 'tocandidate'}
dict_klass['userskill'] = {'klass': UserSkill, 'key': 'user'}
dict_klass['userdisability'] = {'klass': UserDisability, 'key': 'user'}
dict_klass['companycandidate'] = {'klass': CompanyCandidate, 'key': 'candidate'}
xls = pd.ExcelFile('data_essence_it.xlsx')
for sheet in xls.sheet_names:
print ''
print ''
print '*'* 50
print 'iniciando nova tarefa em %s' % datetime.now()
print'começando com %s' % sheet
if sheet == 'user':
tic = timeit.default_timer()
df_import_data = pd.read_excel(xls, sheet)
print 'montando os uaurios e fazendo bulk create'
users = create_users(df_import_data)
print 'bulk create !!!'
User.objects.bulk_create(users, batch_size=10000)
print ''
print 'adicionado os usarios criados em um grupo de candidatos'
list_user = df_import_data.username.values.tolist()
users = User.objects.filter(username__in=list_user)
group = Group.objects.get(name='candidate')
group.user_set.add(*users)
toc = timeit.default_timer()
print 'tempo de tarefa ', toc - tic
print 'terminado em ', datetime.now()
else:
_klass = dict_klass[sheet]['klass']
_key = dict_klass[sheet]['key']
tic = timeit.default_timer()
df_import_data = pd.read_excel(xls, sheet)
print 'montando os %s e fazendo bulk create' % sheet
_objects = create_objects(df_import_data, _klass, _key, company)
print 'bulk create !!!'
_klass.objects.bulk_create(_objects, batch_size=10000)
toc = timeit.default_timer()
print 'tempo de tarefa %s', toc - tic
print 'terminado em %s', datetime.now()
time_end = timeit.default_timer()
print ''
print ''
print 'tempo total de execução ', time_end - time_start
print 'terminado em ', datetime.now()
import re
import os
import pymysql
import pandas as pd
from weasyprint import HTML
import timeit
class Connection:
GET_USER_NAMES = '''
SELECT DISTINCT EMAIL from applicants where email <> '';
'''
GET_USER = '''
SELECT EMAIL AS username, EMAIL as email,
Substring(FIRST_NAME, 1, Locate(' ', FIRST_NAME) - 1) AS first_name,
Substring(FIRST_NAME, Locate(' ', FIRST_NAME) + 1) AS last_name,
'H@shPAssWr0d' as password
FROM applicants
where trim(email) in (%s )
'''
GET_USER_CPF = '''
SELECT email as user, rg, case
when user_name REGEXP '^[0-9]+$' then user_name
else '' end as cpf
FROM applicants
where trim(email) in (%s );
'''
GET_USER_DISABILITY = '''
SELECT email as user , d.jobconvo as disability
FROM applicants
left join disability as d on d.id = applicants.deficient
where trim(email) in (%s );
'''
GET_USER_PROFILES = '''
SELECT first_name as full_name, email as user,
case gender
when 1323 then '1'
when 1324 then '2'
else 3 end as born_sex,
dob as birthday, phone1, phone2 , address1 as address, number as adddressnumber,
address2 as complement, neighborhood as neighbourhood, states.state_name as state, citys.name as city,
country_name as country , zip as cep
FROM essence.applicants
left join states on states.state_id = applicants.state_id
left join citys on citys.id = applicants.city_id
left join country on country.country_id = applicants.country_id
where trim(email) in (%s );
'''
GET_USER_SKILL = '''
select a.email as user, s.skill as skill
from applicants_skill as a_skill
inner join applicants as a on a.applicant_id = a_skill.applicant_id
inner join skill as s on s.id = a_skill.skill_id
where trim(email) in (%s );
'''
GET_USER_COMMENTS = '''
select a.email as tocandidate , comments as comments
from applicant_flagged as a_flagged
inner join applicants as a on a.applicant_id = a_flagged.applicant_id
where trim(email) in (%s );
'''
GET_RESUMES = '''
select email , resume from applicants
where trim(email) in (%s );
'''
GET_COMPANY_CANDIDATE = '''
select a.email as candidate , 116091 as company, a.created_on as date, s.referral_source as source,
case af.flag_id
when 165 then True
else False end as blacklist,
case af.flag_id
when 166 then True
else False end as favorite
from applicants as a
left join applicant_flagged as af on af.applicant_id = a.applicant_id
left join source as s on s.id = a.referral_source
where trim(email) in (%s );
'''
GET_APPLICANTS = '''
SELECT applicant_id, user_name , password ,
referral_source AS SOURCE_ID, first_name ,
dob, email, phone1, phone2 , address1 , address2 , city_id ,
state_id, zip , number , neighborhood ,
country_id AS COUNTRY, english_leval ,
spanish_leval , other_level, other_language ,
education , professional_objective ,
resume ,linkedin_profile_url ,
facebook_profile_url, working, rg ,
validity , nature, gender , marital_status ,
dependentes, deficient, availability_start , availability_travel, main_interest, universities_pg,
completion_year_pg, course_pg, area_of_expertise, profile_type,
skills_modules, knowledge_level, other_certifications
FROM essence.applicants;
'''
def __init__(self, host='localhost', db_name='essence', password='root', user_name='root'):
self.user_name = user_name
self.password = password
self.host = host
self.db_name = db_name
self.valid_user_names = []
def connection(self):
connection = pymysql.connect(host=self.host, user=self.user_name, password=self.password,
db=self.db_name, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
return connection
def query(self, sql, parameters=None):
try:
connection = self.connection()
with connection.cursor() as cursor:
if parameters:
_query = sql % ','.join(['%s'] * len(parameters))
cursor.execute(_query, parameters)
else:
cursor.execute(sql)
return cursor.fetchall()
except Exception as e:
print("Error with query: " + str(e))
finally:
connection.close()
@staticmethod
def export_xls(data_frame, file_name):
file_name = '%s.xlsx' % file_name
BASE_DIR = os.path.dirname(__file__)
absolute_path = os.path.join(BASE_DIR, 'exported', file_name)
writer_orig = pd.ExcelWriter(absolute_path, engine='xlsxwriter')
for frame in data_frame:
_data = frame['dataframe']
_sheet_name = frame['sheet']
df = pd.DataFrame(_data)
df.to_excel(writer_orig, index=False, sheet_name=_sheet_name)
writer_orig.save()
@staticmethod
def validate_username(user_name_list):
_user_name_list = []
for _user_name in user_name_list:
for key, user_name in _user_name.items():
if bool(re.search("^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$",
user_name.strip())):
_user_name_list.append(user_name.strip())
return _user_name_list
def load_user_name_from_db(self):
return self.validate_username(self.query(self.GET_USER_NAMES))
def get_valid_users(self):
if len(self.valid_user_names) == 0:
self.valid_user_names = self.load_user_name_from_db()
return self.valid_user_names
def load_form_db(self, select, parameters=None):
if len(self.valid_user_names) == 0:
self.valid_user_names = self.load_user_name_from_db()
return self.query(select, parameters)
def save_pdf(self):
list_resumes = self.query(self.GET_RESUMES)[:1]
dirname = os.path.dirname(__file__)
tic = timeit.default_timer()
for item in list_resumes:
html_string = item['resume'].decode('utf-8')
html = HTML(string=html_string)
file_name = '%s.pdf' % item['email'].strip()
pdf = html.write_pdf()
with open(os.path.join(dirname, 'resume', file_name), 'wb') as f:
f.write(pdf)
toc = timeit.default_timer()
print('tempo de execução', toc - tic)
if __name__ == '__main__':
list_data_frame = []
# init class
conn = Connection()
# get users
valid_users = conn.load_form_db(conn.GET_USER, conn.get_valid_users())
list_data_frame.append(dict(sheet='user', dataframe=valid_users))
print('user', len(valid_users))
# get userprofile
user_profile_data = conn.load_form_db(conn.GET_USER_PROFILES, conn.get_valid_users())
list_data_frame.append(dict(sheet='userprofile', dataframe=user_profile_data))
print('userprofile', len(user_profile_data))
# get usercpf
user_cpf_data = conn.load_form_db(conn.GET_USER_CPF, conn.get_valid_users())
list_data_frame.append(dict(sheet='usercpf', dataframe=user_cpf_data))
print('usercpf', len(user_cpf_data))
# get user comments
user_comments_data = conn.load_form_db(conn.GET_USER_COMMENTS, conn.get_valid_users())
list_data_frame.append(dict(sheet='comments', dataframe=user_comments_data))
print('comments', len(user_comments_data))
# get user skill
user_skill_data = conn.load_form_db(conn.GET_USER_SKILL, conn.get_valid_users())
list_data_frame.append(dict(sheet='userskill', dataframe=user_skill_data))
print('userskill', len(user_skill_data))
# get user disability
user_disability_data = conn.load_form_db(conn.GET_USER_DISABILITY, conn.get_valid_users())
list_data_frame.append(dict(sheet='userdisability', dataframe=user_disability_data))
print('userdisability', len(user_disability_data))
# get company candidate
company_candidate_data = conn.load_form_db(conn.GET_COMPANY_CANDIDATE, conn.get_valid_users())
list_data_frame.append(dict(sheet='companycandidate', dataframe=company_candidate_data))
print('companycandidate', len(company_candidate_data))
conn.export_xls(list_data_frame, 'data_essence_it')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment