Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created September 18, 2022 17:40
Show Gist options
  • Save Steboss89/68bd990987c34ec805b54d443e9bfc5f to your computer and use it in GitHub Desktop.
Save Steboss89/68bd990987c34ec805b54d443e9bfc5f to your computer and use it in GitHub Desktop.
Wrap the model and the preprocessing step in a scikit-learn pipeline
def training_process(model: str,
                     vectorizer: str):
    r""" Function to create the training pipeline with cleaner and model

    The pipeline chains three steps, in this order: a `PreprocessTweets`
    cleaner, the selected text vectorizer, and the classifier returned by
    `get_model`. The pipeline is only assembled here; it is not fitted.

    Parameters
    ----------
    model: str, type of model we want to run, see get_model function
    vectorizer: str, type of vectorizer, `countvectorizer` or `tfidf`
        (matched exactly, i.e. case-sensitive)

    Return
    -------
    training_pipeline: sklearn.pipeline with data cleaner and model.
        The steps are named `clean`, `countVectorizer` and `trainModel`.

    Raises
    ------
    SystemExit
        If `vectorizer` is neither `countvectorizer` nor `tfidf`; a message
        is printed and the process exits with status -1.
    """
    # retrieve the model
    # NOTE(review): how get_model reacts to an unknown model name is not
    # visible from here -- check its definition.
    classifier = get_model(model)

    if vectorizer == "countvectorizer":
        print("selected vectorizer CountVectorizer")
        vector = CountVectorizer()
    elif vectorizer == "tfidf":
        print("selected vectorizer TfidfVectorizer")
        vector = TfidfVectorizer(
            ngram_range=(1, 4),       # word n-grams from 1 up to 4 tokens
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True,
            analyzer='word',
            token_pattern=r'\w{1,}',  # a token is one or more word characters
            max_features=1000,        # cap the vocabulary size
        )
    else:
        print(f"Vectorizer {vectorizer} doesn't exist. Please select among:")
        print("countvectorizer or tfidf")
        sys.exit(-1)

    # create the pipeline
    training_pipeline = Pipeline(
        steps=[
            # NOTE(review): "text" is presumably the name of the column that
            # holds the tweets -- confirm against PreprocessTweets.
            ("clean", PreprocessTweets("text")),
            # The step keeps the name "countVectorizer" for both vectorizers,
            # so code addressing the step by name works in either case.
            ("countVectorizer", vector),
            ("trainModel", classifier),
        ]
    )
    return training_pipeline
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment