This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import numpy as np | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.model_selection import train_test_split | |
# Build the path to the "data" directory that sits next to this script.
# realpath() resolves symlinks, so the result is anchored to the script's
# actual location rather than to the current working directory.
PATH = os.path.dirname(os.path.realpath(__file__))
PATH = os.path.join(PATH, 'data')
print(PATH)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.preprocessing import LabelEncoder

# Build a mapping from each unique label text in df['code'] to a unique integer.
# The fitted encoder can be reused afterwards both to encode labels
# (transform) and to decode integers back to label text (inverse_transform).
encoder = LabelEncoder().fit(df['code'])

# Encode the label column of the entire dataset with the fitted encoder.
y = encoder.transform(df['code'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the text documents in df['name'] and, in the same
# pass, convert those documents into a sparse matrix of word counts.
# The fitted vectorizer is kept so the same vocabulary can be applied to
# other text later via vectorizer.transform(...).
vectorizer = CountVectorizer()
word_mat = vectorizer.fit_transform(df['name'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.naive_bayes import MultinomialNB

# Instantiate the model as clf (classifier) and train it.
# NOTE: x_train and y_train must already exist when this runs; they are
# expected to come from a train_test_split call executed beforehand.
clf = MultinomialNB()
clf.fit(x_train, y_train)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Split the word-count matrix and the encoded labels into train and test
# sets, holding out 30% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(word_mat, y, test_size=0.3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Split the word-count matrix and the encoded labels into train and test
# sets, holding out 30% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(word_mat, y, test_size=0.3)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.