Skip to content

Instantly share code, notes, and snippets.

@shreyas90999
Last active October 6, 2020 03:49
Show Gist options
  • Save shreyas90999/a9c073ce7b53bcfd5f92fa37c3e5f024 to your computer and use it in GitHub Desktop.
Save shreyas90999/a9c073ce7b53bcfd5f92fa37c3e5f024 to your computer and use it in GitHub Desktop.
This function extracts numerical features from textual data. For example, the sentence "We are selling 10 packs of 512gb SSD of XYZ company" is vectorized as {packs: 10, gb: 512}.
def num_feature(df, vectorizer_nums=None, scale=None, training=True):
    """Vectorize "<number><unit>" quantities found in item text.

    Item text often carries quantities such as "10ml", "2 packs" or "512gb".
    The ``name`` and ``item_description`` columns are joined into
    ``df['text']`` (the column is written onto ``df``).  Each text is
    lower-cased, every run of non-alphanumeric characters is replaced by a
    single space, and every number followed by a word (optionally separated
    by one space) becomes a feature keyed by that word, e.g.
    "250ml 2 packs of xyz company" -> {"ml": 250, "packs": 2}.  If the same
    unit occurs more than once in a text, the last quantity wins.  A text
    without any such pattern gets the indicator feature ``no_data`` = 1.

    Args:
        df: DataFrame with ``name`` and ``item_description`` columns.
            Missing values in either column are treated as empty strings.
        vectorizer_nums: unit -> column-index dict returned by an earlier
            training call.  Required when ``training`` is false; units not
            present in it are ignored.
        scale: fitted scaler (any object with ``transform``) returned by an
            earlier training call.  Required when ``training`` is false.
        training: when true, build the unit vocabulary from ``df`` and fit a
            ``MaxAbsScaler`` on the resulting matrix; otherwise reuse
            ``vectorizer_nums`` and ``scale``.

    Returns:
        ``(matrix, vocabulary, scaler)`` when ``training`` is true, otherwise
        only ``matrix``.  ``matrix`` is the scaler's output for a sparse
        matrix with one row per row of ``df`` and one column per unit.

    Raises:
        ValueError: if ``training`` is false and ``vectorizer_nums`` or
            ``scale`` was not supplied.
    """
    if not training and (vectorizer_nums is None or scale is None):
        raise ValueError(
            "vectorizer_nums and scale are required when training is False"
        )

    # Compiled once per call instead of once per row.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    # Digits, an optional single space, then the unit word.  The previous
    # pattern used an unescaped '.' here, which silently skipped a single
    # digit directly followed by a single letter (e.g. "5g", "2x").
    quantity = re.compile(r'([0-9]+) ?([A-Za-z]+)')

    def number(phrase):
        """Return {unit: quantity} for one text, or {"no_data": 1} if none."""
        # Collapsing every non-alphanumeric run to one space also collapses
        # repeated spaces, so no separate whitespace pass is needed.
        text = non_alnum.sub(' ', str(phrase).lower())
        found = {unit: int(amount) for amount, unit in quantity.findall(text)}
        return found if found else {'no_data': 1}

    def get_matrix(parsed, vocab):
        """Build the sparse (rows x units) matrix from per-row feature dicts."""
        row_ind = []
        col_ind = []
        data = []
        for row, features in enumerate(parsed):
            for unit, amount in features.items():
                # Units unseen at training time have no column; skip them.
                if unit in vocab:
                    row_ind.append(row)
                    col_ind.append(int(vocab[unit]))
                    data.append(amount)
        return csr_matrix(
            (
                np.array(data, dtype=float),
                (np.array(row_ind, dtype=int), np.array(col_ind, dtype=int)),
            ),
            shape=(df.shape[0], len(vocab)),
        )

    # fillna keeps a missing description (or name) from turning the whole
    # concatenation into NaN and discarding the other column's text.
    df['text'] = df['name'].fillna('') + " " + df['item_description'].fillna('')
    # Parse every row exactly once; both the vocabulary and the matrix are
    # derived from this list, so no shared state is mutated inside apply().
    parsed = [number(text) for text in df['text']]

    if training:
        # Column index = order in which each unit is first seen.
        vocab = {}
        for features in parsed:
            for unit in features:
                vocab.setdefault(unit, len(vocab))
        matrix = get_matrix(parsed, vocab)
        scaler = MaxAbsScaler(copy=False)
        scaler.fit(matrix)
        matrix = scaler.transform(matrix)
        return matrix, vocab, scaler

    matrix = get_matrix(parsed, vectorizer_nums)
    return scale.transform(matrix)
# Training split: learn the unit vocabulary and fit the scaler.
X_train_num, vectorizer_nums, scaler_num = num_feature(X_train)
# Test split: reuse the vocabulary and scaler learned on the training split.
X_test_num = num_feature(
    X_test,
    vectorizer_nums=vectorizer_nums,
    scale=scaler_num,
    training=False,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment