shreyas90999 · October 6, 2020 03:49
diff --git a/num_vectorizer.py b/num_vectorizer.py
 def num_feature(df, vectorizer_nums=None, scale=None, training=True):
     """Turn "<number> <unit>" mentions in item text into a scaled sparse matrix.

     Listings carry quantities such as "10ml", "2 packs" or "512gb". Every
     distinct unit becomes one column and the number in front of it becomes
     the cell value, e.g. "250ml 2 packs of xyz" puts 250 in the "ml" column
     and 2 in the "packs" column. Texts without any quantity get a 1 in the
     "no_data" column. When a unit occurs several times in one text the last
     quantity wins.

     Args:
         df: DataFrame with "name" and "item_description" columns. A "text"
             column (name + " " + item_description) is written to it as a
             side effect.
         vectorizer_nums: unit -> column index mapping returned by a previous
             training call. Required when ``training`` is False.
         scale: fitted scaler exposing ``transform``. Required when
             ``training`` is False.
         training: when True, learn the unit vocabulary and fit a
             ``MaxAbsScaler`` on ``df``; otherwise reuse the given ones.

     Returns:
         ``(matrix, vocabulary, scaler)`` when training, else only the result
         of ``scale.transform(matrix)``. The matrix has one row per row of
         ``df`` and one column per vocabulary entry; units that are not in
         the vocabulary are ignored.

     Raises:
         ValueError: if ``training`` is False and ``vectorizer_nums`` or
             ``scale`` is missing.
     """
     # Digits, an optional single space, then the unit letters. The previous
     # pattern used an unescaped "." between the two parts, which demanded an
     # extra character and therefore skipped inputs like "5g" or "2x".
     quantity_re = re.compile(r'([0-9]+) ?([A-Za-z]+)')

     def number(phrase):
         """Return ``{unit: quantity}`` for one text, or ``{'no_data': 1}``."""
         # Any run of non-alphanumerics (spaces included) collapses to a
         # single space, so no separate space-squeezing pass is needed.
         phrase = re.sub('[^A-Za-z0-9]+', ' ', str(phrase).lower())
         found = {
             unit: int(digits)
             for digits, unit in quantity_re.findall(phrase)
         }
         return found if found else {'no_data': 1}

     def get_matrix(parsed_rows, columns):
         """Build the CSR matrix from per-row dicts and a unit -> column map."""
         row_ind, col_ind, data = [], [], []
         for row, parsed in enumerate(parsed_rows):
             for unit, quantity in parsed.items():
                 col = columns.get(unit)
                 if col is None:
                     # Unit not in the vocabulary: dropped.
                     continue
                 row_ind.append(row)
                 col_ind.append(int(col))
                 data.append(quantity)
         return csr_matrix(
             (
                 np.array(data, dtype=float),
                 (np.array(row_ind, dtype=int), np.array(col_ind, dtype=int)),
             ),
             shape=(df.shape[0], len(columns)),
         )

     if not training and (vectorizer_nums is None or scale is None):
         raise ValueError(
             "vectorizer_nums and scale are required when training=False"
         )

     # NOTE(review): a missing name or description makes the concatenation
     # NaN, which then parses as 'no_data' -- confirm this is intended.
     df['text'] = df['name'] + " " + df['item_description']
     text = df['text']

     # Use the swifter accessor when it is registered, plain pandas otherwise.
     # `number` has no side effects, so it is safe wherever swifter runs it.
     parsed = getattr(text, 'swifter', text).apply(number)

     if training:
         # Derive the vocabulary in-process from the parsed rows instead of
         # mutating a closure dict inside a (possibly parallel) swifter apply,
         # where such mutations may not reach this process. Columns are
         # numbered in first-seen order.
         dic = {}
         for parsed_row in parsed:
             for unit in parsed_row:
                 dic.setdefault(unit, len(dic))

         matrix = get_matrix(parsed, dic)

         scaler = MaxAbsScaler(copy=False)
         scaler.fit(matrix)
         matrix = scaler.transform(matrix)

         return matrix, dic, scaler

     matrix = get_matrix(parsed, vectorizer_nums)
     return scale.transform(matrix)

 # Learn the unit vocabulary and fit the scaler on the training split only.
 X_train_num, vectorizer_nums, scaler_num = num_feature(X_train, training=True)
 # Re-use both fitted artefacts so the test split shares columns and scaling.
 X_test_num = num_feature(
     X_test,
     vectorizer_nums=vectorizer_nums,
     scale=scaler_num,
     training=False,
 )
	def num_feature(df, vectorizer_nums=None, scale=None, training=True):
	    """Turn "<number> <unit>" mentions in item text into a scaled sparse matrix.

	    Listings carry quantities such as "10ml", "2 packs" or "512gb". Every
	    distinct unit becomes one column and the number in front of it becomes
	    the cell value, e.g. "250ml 2 packs of xyz" puts 250 in the "ml" column
	    and 2 in the "packs" column. Texts without any quantity get a 1 in the
	    "no_data" column. When a unit occurs several times in one text the last
	    quantity wins.

	    Args:
	        df: DataFrame with "name" and "item_description" columns. A "text"
	            column (name + " " + item_description) is written to it as a
	            side effect.
	        vectorizer_nums: unit -> column index mapping returned by a previous
	            training call. Required when ``training`` is False.
	        scale: fitted scaler exposing ``transform``. Required when
	            ``training`` is False.
	        training: when True, learn the unit vocabulary and fit a
	            ``MaxAbsScaler`` on ``df``; otherwise reuse the given ones.

	    Returns:
	        ``(matrix, vocabulary, scaler)`` when training, else only the result
	        of ``scale.transform(matrix)``. The matrix has one row per row of
	        ``df`` and one column per vocabulary entry; units that are not in
	        the vocabulary are ignored.

	    Raises:
	        ValueError: if ``training`` is False and ``vectorizer_nums`` or
	            ``scale`` is missing.
	    """
	    # Digits, an optional single space, then the unit letters. The previous
	    # pattern used an unescaped "." between the two parts, which demanded an
	    # extra character and therefore skipped inputs like "5g" or "2x".
	    quantity_re = re.compile(r'([0-9]+) ?([A-Za-z]+)')

	    def number(phrase):
	        """Return ``{unit: quantity}`` for one text, or ``{'no_data': 1}``."""
	        # Any run of non-alphanumerics (spaces included) collapses to a
	        # single space, so no separate space-squeezing pass is needed.
	        phrase = re.sub('[^A-Za-z0-9]+', ' ', str(phrase).lower())
	        found = {
	            unit: int(digits)
	            for digits, unit in quantity_re.findall(phrase)
	        }
	        return found if found else {'no_data': 1}

	    def get_matrix(parsed_rows, columns):
	        """Build the CSR matrix from per-row dicts and a unit -> column map."""
	        row_ind, col_ind, data = [], [], []
	        for row, parsed in enumerate(parsed_rows):
	            for unit, quantity in parsed.items():
	                col = columns.get(unit)
	                if col is None:
	                    # Unit not in the vocabulary: dropped.
	                    continue
	                row_ind.append(row)
	                col_ind.append(int(col))
	                data.append(quantity)
	        return csr_matrix(
	            (
	                np.array(data, dtype=float),
	                (np.array(row_ind, dtype=int), np.array(col_ind, dtype=int)),
	            ),
	            shape=(df.shape[0], len(columns)),
	        )

	    if not training and (vectorizer_nums is None or scale is None):
	        raise ValueError(
	            "vectorizer_nums and scale are required when training=False"
	        )

	    # NOTE(review): a missing name or description makes the concatenation
	    # NaN, which then parses as 'no_data' -- confirm this is intended.
	    df['text'] = df['name'] + " " + df['item_description']
	    text = df['text']

	    # Use the swifter accessor when it is registered, plain pandas otherwise.
	    # `number` has no side effects, so it is safe wherever swifter runs it.
	    parsed = getattr(text, 'swifter', text).apply(number)

	    if training:
	        # Derive the vocabulary in-process from the parsed rows instead of
	        # mutating a closure dict inside a (possibly parallel) swifter apply,
	        # where such mutations may not reach this process. Columns are
	        # numbered in first-seen order.
	        dic = {}
	        for parsed_row in parsed:
	            for unit in parsed_row:
	                dic.setdefault(unit, len(dic))

	        matrix = get_matrix(parsed, dic)

	        scaler = MaxAbsScaler(copy=False)
	        scaler.fit(matrix)
	        matrix = scaler.transform(matrix)

	        return matrix, dic, scaler

	    matrix = get_matrix(parsed, vectorizer_nums)
	    return scale.transform(matrix)

	# Learn the unit vocabulary and fit the scaler on the training split only.
	X_train_num, vectorizer_nums, scaler_num = num_feature(X_train, training=True)
	# Re-use both fitted artefacts so the test split shares columns and scaling.
	X_test_num = num_feature(
	    X_test,
	    vectorizer_nums=vectorizer_nums,
	    scale=scaler_num,
	    training=False,
	)