Last active
October 6, 2020 03:49
-
-
Save shreyas90999/a9c073ce7b53bcfd5f92fa37c3e5f024 to your computer and use it in GitHub Desktop.
This function extracts numerical features from textual data. For example, the sentence "We are selling 10 packs of 512gb SSD of XYZ company" is vectorized as {packs: 10, gb: 512}.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def num_feature(df, vectorizer_nums=None, scale=None, training=True):
    """
    Vectorize "<number> <unit>" patterns found in item text into a sparse matrix.

    Listing text often carries quantities such as "10ml", "2 packs" or
    "512gb". Every distinct unit becomes one column and the quantity becomes
    the cell value, e.g. "250ml 2 packs of xyz company" puts 250 in the "ml"
    column and 2 in the "packs" column. Rows without any such pattern get a 1
    in the "no_data" column (when that column exists in the vocabulary).

    Side effect: a "text" column (name + " " + item_description) is written
    to ``df``.

    Args:
        df: DataFrame with "name" and "item_description" columns.
        vectorizer_nums: unit -> column-index dict returned by a previous
            training call. Required when ``training`` is false.
        scale: fitted scaler returned by a previous training call. Required
            when ``training`` is false.
        training: when true, build the vocabulary and fit a new MaxAbsScaler;
            otherwise reuse ``vectorizer_nums`` and ``scale``.

    Returns:
        training: ``(matrix, vocabulary, scaler)``.
        otherwise: ``matrix`` only.
        ``matrix`` is a scaled sparse matrix with one row per row of ``df``
        and one column per vocabulary entry.

    Raises:
        ValueError: if ``training`` is false and ``vectorizer_nums`` or
            ``scale`` is missing.
    """
    if not training and (vectorizer_nums is None or scale is None):
        raise ValueError(
            "vectorizer_nums and scale are required when training=False"
        )

    # Compiled once instead of on every row.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    # Digits, an optional single space, then the unit letters. The previous
    # pattern used an unescaped "." between the two parts, which forced one
    # extra character to be consumed and therefore dropped a single digit
    # followed by a single-letter unit ("5g", "2x") while still accepting
    # "12g" or "5 g".
    quantity = re.compile(r'([0-9]+) ?([A-Za-z]+)')

    def number(phrase):
        """Return {unit: quantity} for one text, or {"no_data": 1} if none found."""
        phrase = str(phrase).lower()
        # Each run of non-alphanumerics collapses to exactly one space, so no
        # separate whitespace-squeezing pass is needed.
        phrase = non_alnum.sub(' ', phrase)
        found = {}
        for qty, unit in quantity.findall(phrase):
            # A unit repeated within one text keeps its last quantity.
            found[unit] = int(qty)
        if not found:
            found['no_data'] = 1
        return found

    def get_matrix(feature_dicts, vocabulary, n_rows):
        """Build a (n_rows x len(vocabulary)) CSR matrix from per-row dicts."""
        row_ind = []
        col_ind = []
        data = []
        for row, feats in enumerate(feature_dicts):
            for unit, qty in feats.items():
                # Units missing from the vocabulary are ignored.
                if unit in vocabulary:
                    row_ind.append(row)
                    col_ind.append(int(vocabulary[unit]))
                    data.append(qty)
        return csr_matrix(
            (
                np.array(data, dtype=float),
                (np.array(row_ind, dtype=int), np.array(col_ind, dtype=int)),
            ),
            shape=(n_rows, len(vocabulary)),
        )

    df['text'] = df['name'] + " " + df['item_description']

    if training:
        temp = df['text'].apply(number)
        # Build the vocabulary from the returned dicts (first-seen order)
        # rather than from side effects inside an apply callback: the result
        # then does not depend on how the apply backend invokes the function,
        # and the text is parsed only once.
        dic = {}
        for feats in temp:
            for unit in feats:
                dic.setdefault(unit, len(dic))
        matrix = get_matrix(temp, dic, df.shape[0])
        scaler = MaxAbsScaler(copy=False)
        scaler.fit(matrix)
        matrix = scaler.transform(matrix)
        return matrix, dic, scaler

    temp = df['text'].swifter.apply(number)
    matrix = get_matrix(temp, vectorizer_nums, df.shape[0])
    matrix = scale.transform(matrix)
    return matrix
# Fit the unit vocabulary and the scaler on the training split.
X_train_num, vectorizer_nums, scaler_num = num_feature(X_train)
# Reuse the fitted vocabulary and scaler to vectorize the test split.
X_test_num = num_feature(
    X_test,
    vectorizer_nums=vectorizer_nums,
    scale=scaler_num,
    training=False,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment