Created
December 22, 2018 05:58
-
-
Save olivx/9265c6d8a359e0bb76dc30ff1f57141a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Target company for this import run (hard-coded primary key of the
# Company row all imported candidates are attached to).
company = Company.objects.get(pk=116091)
def create_users(df): | |
# normalize os dados | |
print 'normalize o username ' | |
df.replace('', np.nan, inplace=True) | |
df = df.replace(np.nan, '', regex=True) | |
df['username'] = df['username'].apply(limit_30_char) | |
df['first_name'] = df['first_name'].apply(limit_30_char) | |
df['last_name'] = df['last_name'].apply(limit_30_char) | |
df['username'] = df['username'].str.strip() | |
df['username'] = df['username'].str.lower() | |
df_user_to_create = df.drop_duplicates('username').sort_values(by=['username']) | |
print 'recuperando usarios cadastrados' | |
data_all_user = list(User.objects.filter( | |
username__in=list(df_user_to_create['username'].values.tolist()) | |
).values_list('username', flat=True) | |
) | |
columns = list(df.columns)[1:] | |
users = [] | |
list_usernames = [] | |
print 'montando data user' | |
for usernames in data_all_user: | |
list_usernames.append(usernames.strip().lower()) | |
print 'gerando userprofile apartir do dataframe' | |
for row in df_user_to_create.itertuples(): | |
user = {} | |
username = getattr(row, 'username') | |
for col in columns: | |
if username not in list_usernames: | |
user[col] = getattr(row, col) | |
users.append(User(**user)) | |
for index, user in enumerate(users): | |
if len(user.username) < 4: | |
users.pop(index) | |
return users | |
def create_objects(df, klass, key, company=None): | |
if company is not None: | |
commentor = User.objects.get(pk=company.pk) | |
# normalize os dados | |
print 'normalize %s ' % klass.__name__ | |
df = df.replace(np.nan, '', regex=True) | |
df[key] = df[key].apply(limit_30_char) | |
df[key] = df[key].str.strip() | |
df[key] = df[key].str.lower() | |
df_to_create = df.drop_duplicates(key).sort_values(by=[key]) | |
columns = list(df.columns)[1:] | |
# recuperando usarios cadastrados | |
data_all_user = list(User.objects.filter( | |
username__in=list(df_to_create[key].values.tolist()) | |
) | |
) | |
if klass.__name__ == 'Comment': | |
data_all_elements = list(klass.objects.filter( | |
tocandidate__username__in=list(df_to_create[key].values.tolist()) | |
).values_list('tocandidate__username', flat=True) | |
) | |
elif klass.__name__ == 'CompanyCandidate': | |
data_all_elements = list(klass.objects.filter( | |
candidate__username__in=list(df_to_create[key].values.tolist()) | |
).values_list('candidate__username', flat=True) | |
) | |
else: | |
data_all_elements = list(klass.objects.filter( | |
user__username__in=list(df_to_create[key].values.tolist()) | |
).values_list('user__username', flat=True) | |
) | |
print 'total em data_all_elements ', len(data_all_elements) | |
print 'total de paginas data_users', len(data_all_user) | |
print 'total em df_to_create', df_to_create.shape[0] | |
data_users = {} | |
print 'montando data user' | |
for _user in data_all_user: | |
data_users[_user.username] = _user | |
list_elements = [] | |
print 'montato data user profile ' | |
for element in data_all_elements: | |
list_elements.append(element.strip().lower()) | |
_list_to_return = [] | |
print 'gerando userprofile apartir do dataframe' | |
for row in df_to_create.itertuples(): | |
_object = {} | |
for col in columns: | |
username = getattr(row, key) | |
if username not in list_elements: | |
if col == 'user': | |
user = data_users.get(username, None) | |
if user is not None: | |
_object['user_id'] = user.pk | |
else: | |
print 'User %s não foi achado no dict, indo buscar no banco.' % username | |
_object['user_id'] = User.objects.get(username=username).pk | |
elif col == 'candidate': | |
user = data_users.get(username) | |
if user is not None: | |
_object['candidate_id'] = user.pk | |
else: | |
print 'User %s não foi achado no dict, indo buscar no banco.' % username | |
_object['candidate_id'] = User.objects.get(username=username).pk | |
elif col == 'tocandidate': | |
user = data_users.get(username) | |
if user is not None: | |
_object['tocandidate_id'] = user.pk | |
_object['commentator_id'] = commentor.pk | |
else: | |
print 'User %s não foi achado no dict, indo buscar no banco.' % username | |
_object['tocandidate_id'] = User.objects.get(username=username).pk | |
_object['commentator_id'] = commentor.pk | |
elif col == 'company': | |
_object[col] = company | |
elif col == 'birthday': | |
try: | |
_object[col] = datetime.strptime(item, 'yy-mm-dd') | |
except: | |
_object[col] = None | |
else: | |
_object[col] = getattr(row, col) | |
if bool(_object): | |
_list_to_return.append(klass(**_object)) | |
return _list_to_return | |
def limit_30_char(x):
    """Truncate a value to at most 30 characters (Django max_length=30).

    Values that already fit are returned unchanged, preserving their type.
    The original sliced to 29 chars on len >= 30 (off by one: a 30-char
    value fits but was cut to 29) and sliced `x` itself, which raises for
    non-string values with a long repr.
    """
    s = '%s' % (x,)
    if len(s) > 30:
        return s[:30]
    return x
reload(sys) | |
sys.setdefaultencoding('utf8') | |
print 'Incializando importação...' | |
time_start = timeit.default_timer() | |
print 'start em %s' % datetime.now() | |
base_dir = settings.BASE_DIR | |
dict_klass = {} | |
dict_klass['user'] = {'klass': User, 'key': 'username'} | |
dict_klass['userprofile'] = {'klass': UserProfile, 'key': 'user'} | |
dict_klass['usercpf'] = {'klass': UserCPF, 'key': 'user'} | |
dict_klass['comments'] = {'klass': Comment, 'key': 'tocandidate'} | |
dict_klass['userskill'] = {'klass': UserSkill, 'key': 'user'} | |
dict_klass['userdisability'] = {'klass': UserDisability, 'key': 'user'} | |
dict_klass['companycandidate'] = {'klass': CompanyCandidate, 'key': 'candidate'} | |
xls = pd.ExcelFile('data_essence_it.xlsx') | |
for sheet in xls.sheet_names: | |
print '' | |
print '' | |
print '*'* 50 | |
print 'iniciando nova tarefa em %s' % datetime.now() | |
print'começando com %s' % sheet | |
if sheet == 'user': | |
tic = timeit.default_timer() | |
df_import_data = pd.read_excel(xls, sheet) | |
print 'montando os uaurios e fazendo bulk create' | |
users = create_users(df_import_data) | |
print 'bulk create !!!' | |
User.objects.bulk_create(users, batch_size=10000) | |
print '' | |
print 'adicionado os usarios criados em um grupo de candidatos' | |
list_user = df_import_data.username.values.tolist() | |
users = User.objects.filter(username__in=list_user) | |
group = Group.objects.get(name='candidate') | |
group.user_set.add(*users) | |
toc = timeit.default_timer() | |
print 'tempo de tarefa ', toc - tic | |
print 'terminado em ', datetime.now() | |
else: | |
_klass = dict_klass[sheet]['klass'] | |
_key = dict_klass[sheet]['key'] | |
tic = timeit.default_timer() | |
df_import_data = pd.read_excel(xls, sheet) | |
print 'montando os %s e fazendo bulk create' % sheet | |
_objects = create_objects(df_import_data, _klass, _key, company) | |
print 'bulk create !!!' | |
_klass.objects.bulk_create(_objects, batch_size=10000) | |
toc = timeit.default_timer() | |
print 'tempo de tarefa %s', toc - tic | |
print 'terminado em %s', datetime.now() | |
time_end = timeit.default_timer() | |
print '' | |
print '' | |
print 'tempo total de execução ', time_end - time_start | |
print 'terminado em ', datetime.now() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
import pymysql | |
import pandas as pd | |
from weasyprint import HTML | |
import timeit | |
class Connection:
    """Extracts applicant data from the legacy MySQL database and exports it
    as an Excel workbook (one sheet per target model of the import script)."""

    GET_USER_NAMES = '''
        SELECT DISTINCT EMAIL from applicants where email <> '';
    '''

    GET_USER = '''
        SELECT EMAIL AS username, EMAIL as email,
               Substring(FIRST_NAME, 1, Locate(' ', FIRST_NAME) - 1) AS first_name,
               Substring(FIRST_NAME, Locate(' ', FIRST_NAME) + 1) AS last_name,
               'H@shPAssWr0d' as password
        FROM applicants
        where trim(email) in (%s )
    '''

    GET_USER_CPF = '''
        SELECT email as user, rg, case
               when user_name REGEXP '^[0-9]+$' then user_name
               else '' end as cpf
        FROM applicants
        where trim(email) in (%s );
    '''

    GET_USER_DISABILITY = '''
        SELECT email as user , d.jobconvo as disability
        FROM applicants
        left join disability as d on d.id = applicants.deficient
        where trim(email) in (%s );
    '''

    GET_USER_PROFILES = '''
        SELECT first_name as full_name, email as user,
               case gender
                    when 1323 then '1'
                    when 1324 then '2'
                    else 3 end as born_sex,
               dob as birthday, phone1, phone2 , address1 as address, number as adddressnumber,
               address2 as complement, neighborhood as neighbourhood, states.state_name as state, citys.name as city,
               country_name as country , zip as cep
        FROM essence.applicants
        left join states on states.state_id = applicants.state_id
        left join citys on citys.id = applicants.city_id
        left join country on country.country_id = applicants.country_id
        where trim(email) in (%s );
    '''

    GET_USER_SKILL = '''
        select a.email as user, s.skill as skill
        from applicants_skill as a_skill
        inner join applicants as a on a.applicant_id = a_skill.applicant_id
        inner join skill as s on s.id = a_skill.skill_id
        where trim(email) in (%s );
    '''

    GET_USER_COMMENTS = '''
        select a.email as tocandidate , comments as comments
        from applicant_flagged as a_flagged
        inner join applicants as a on a.applicant_id = a_flagged.applicant_id
        where trim(email) in (%s );
    '''

    GET_RESUMES = '''
        select email , resume from applicants
        where trim(email) in (%s );
    '''

    GET_COMPANY_CANDIDATE = '''
        select a.email as candidate , 116091 as company, a.created_on as date, s.referral_source as source,
               case af.flag_id
                    when 165 then True
                    else False end as blacklist,
               case af.flag_id
                    when 166 then True
                    else False end as favorite
        from applicants as a
        left join applicant_flagged as af on af.applicant_id = a.applicant_id
        left join source as s on s.id = a.referral_source
        where trim(email) in (%s );
    '''

    GET_APPLICANTS = '''
        SELECT applicant_id, user_name , password ,
               referral_source AS SOURCE_ID, first_name ,
               dob, email, phone1, phone2 , address1 , address2 , city_id ,
               state_id, zip , number , neighborhood ,
               country_id AS COUNTRY, english_leval ,
               spanish_leval , other_level, other_language ,
               education , professional_objective ,
               resume ,linkedin_profile_url ,
               facebook_profile_url, working, rg ,
               validity , nature, gender , marital_status ,
               dependentes, deficient, availability_start , availability_travel, main_interest, universities_pg,
               completion_year_pg, course_pg, area_of_expertise, profile_type,
               skills_modules, knowledge_level, other_certifications
        FROM essence.applicants;
    '''

    # Same pattern as the original, but as a raw string: the un-raw '\.'
    # escapes are invalid-escape warnings on modern Python.  Compiled once
    # instead of per call.  Lowercase-only, like the original (no re.I).
    _EMAIL_RE = re.compile(
        r"^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$")

    def __init__(self, host='localhost', db_name='essence', password='root', user_name='root'):
        self.user_name = user_name
        self.password = password
        self.host = host
        self.db_name = db_name
        # lazily-filled cache of validated e-mail usernames
        self.valid_user_names = []

    def connection(self):
        """Open and return a fresh pymysql connection (caller must close it)."""
        return pymysql.connect(host=self.host, user=self.user_name, password=self.password,
                               db=self.db_name, charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)

    def query(self, sql, parameters=None):
        """Run `sql` and return all rows as a list of dicts.

        When `parameters` is given, the single `%s` in `sql` is expanded to
        one placeholder per parameter and executed parameterized.  Returns
        [] on failure — the original returned None (crashing callers that
        take len()) and, worse, acquired the connection inside the try, so
        a failed connect raised NameError in the finally clause.
        """
        connection = self.connection()
        try:
            with connection.cursor() as cursor:
                if parameters:
                    _query = sql % ','.join(['%s'] * len(parameters))
                    cursor.execute(_query, parameters)
                else:
                    cursor.execute(sql)
                return cursor.fetchall()
        except Exception as e:
            print("Error with query: " + str(e))
            return []
        finally:
            connection.close()

    @staticmethod
    def export_xls(data_frame, file_name):
        """Write each {'sheet': name, 'dataframe': rows} entry to one sheet
        of ./exported/<file_name>.xlsx."""
        file_name = '%s.xlsx' % file_name
        base_dir = os.path.dirname(__file__)
        absolute_path = os.path.join(base_dir, 'exported', file_name)
        writer_orig = pd.ExcelWriter(absolute_path, engine='xlsxwriter')
        for frame in data_frame:
            df = pd.DataFrame(frame['dataframe'])
            df.to_excel(writer_orig, index=False, sheet_name=frame['sheet'])
        writer_orig.save()

    @staticmethod
    def validate_username(user_name_list):
        """Keep only values (from a list of single-column row dicts) that
        look like lowercase e-mail addresses; returns them stripped."""
        valid = []
        for row in user_name_list:
            for _, user_name in row.items():
                if Connection._EMAIL_RE.search(user_name.strip()):
                    valid.append(user_name.strip())
        return valid

    def load_user_name_from_db(self):
        """Fetch all distinct applicant e-mails and validate them."""
        return self.validate_username(self.query(self.GET_USER_NAMES))

    def get_valid_users(self):
        """Return the cached validated username list, loading it on first use."""
        if not self.valid_user_names:
            self.valid_user_names = self.load_user_name_from_db()
        return self.valid_user_names

    def load_form_db(self, select, parameters=None):
        """Warm the username cache, then run `select` with `parameters`."""
        if not self.valid_user_names:
            self.valid_user_names = self.load_user_name_from_db()
        return self.query(select, parameters)

    def save_pdf(self):
        """Render applicant résumés (HTML) to PDF files under ./resume/."""
        # NOTE(review): [:1] processes only the first résumé — looks like a
        # debug leftover; confirm before relying on this for a full export.
        list_resumes = self.query(self.GET_RESUMES)[:1]
        dirname = os.path.dirname(__file__)
        tic = timeit.default_timer()
        for item in list_resumes:
            html = HTML(string=item['resume'].decode('utf-8'))
            file_name = '%s.pdf' % item['email'].strip()
            with open(os.path.join(dirname, 'resume', file_name), 'wb') as f:
                f.write(html.write_pdf())
        toc = timeit.default_timer()
        print('tempo de execução', toc - tic)
if __name__ == '__main__':
    conn = Connection()
    # (sheet name, extraction query) pairs, exported in this order — the
    # original repeated the load/append/print stanza seven times verbatim
    exports = [
        ('user', conn.GET_USER),
        ('userprofile', conn.GET_USER_PROFILES),
        ('usercpf', conn.GET_USER_CPF),
        ('comments', conn.GET_USER_COMMENTS),
        ('userskill', conn.GET_USER_SKILL),
        ('userdisability', conn.GET_USER_DISABILITY),
        ('companycandidate', conn.GET_COMPANY_CANDIDATE),
    ]
    list_data_frame = []
    for sheet, select in exports:
        data = conn.load_form_db(select, conn.get_valid_users())
        list_data_frame.append(dict(sheet=sheet, dataframe=data))
        print(sheet, len(data))
    conn.export_xls(list_data_frame, 'data_essence_it')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment