Created
June 19, 2020 11:35
-
-
Save fherbine/d84b8650c80af778a380d8ff3f253c1d to your computer and use it in GitHub Desktop.
Reproduction (in French) of an exercise from the Crash Course AI episode on NLP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" French cat AI. | |
This code is inspired by the Crash Course AI episode about NLP:
https://www.youtube.com/watch?v=oi0JXuL19TA
""" | |
__author__ = 'fherbine' | |
import requests | |
from bs4 import BeautifulSoup | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem.snowball import SnowballStemmer | |
from tabulate import tabulate | |
# Name of the parser backend handed to every BeautifulSoup() call below.
bs_parser = 'html.parser'

# Single Snowball stemmer instance, configured for French and shared by the
# whole script.
stemmer = SnowballStemmer('french')
def word_frequency(tokenized_words, top_n=10):
    """Print a table of the most frequent tokens.

    Args:
        tokenized_words: Iterable of hashable tokens (strings in this
            script) whose occurrences are counted.
        top_n: Number of most frequent entries to display. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        None. The table is written to standard output.
    """
    fdist = nltk.FreqDist(tokenized_words)
    # most_common() yields (token, count) pairs ordered by descending count,
    # which tabulate renders as a two-column table.
    rows = fdist.most_common(top_n)
    print(tabulate(rows, headers=['word', 'frequency']))
# Download the French Wikipedia articles for "Chat" (cat) and "Felinae".
# An explicit timeout is passed because requests waits indefinitely by
# default, which would hang the script on a stalled connection.
french_cat_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Chat',
    timeout=10,
)
french_felinae_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Felinae',
    timeout=10,
)

# Fail loudly on an HTTP error (4xx/5xx) instead of silently computing word
# frequencies over the text of an error page.
french_cat_wiki.raise_for_status()
french_felinae_wiki.raise_for_status()
# Scraping: parse the raw bytes of each downloaded page into a BeautifulSoup
# document, using the parser backend selected in `bs_parser`.
french_cat_content, french_felinae_content = (
    BeautifulSoup(response.content, bs_parser)
    for response in (french_cat_wiki, french_felinae_wiki)
)
def _paragraph_tokens(soup):
    """Return the whitespace-separated tokens of every <p> element in *soup*.

    Tokenisation is deliberately naive: paragraph texts are joined with
    newlines and split on any whitespace, so punctuation stays attached to
    the neighbouring word.
    """
    paragraphs = (paragraph.text for paragraph in soup.find_all('p'))
    return '\n'.join(paragraphs).split()


# Extract the paragraph text of both articles as flat token lists.
cat_text = _paragraph_tokens(french_cat_content)
felinae_text = _paragraph_tokens(french_felinae_content)
# Drop French stop words, then stem what is left.
#
# The stop-word list holds surface forms, so the filter must run BEFORE
# stemming. Comparing stems against it lets through every stop word whose
# stem differs from the listed form, and discards content words whose stem
# happens to coincide with a stop word.
#
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be installed
# locally (nltk.download('stopwords')).
french_sw = set(stopwords.words('french'))

# Tokens are lower-cased for the membership test because the stop-word list
# is lower-case; the stemmer lower-cases its own output.
cat_text = [
    stemmer.stem(word) for word in cat_text
    if word.lower() not in french_sw
]
felinae_text = [
    stemmer.stem(word) for word in felinae_text
    if word.lower() not in french_sw
]
# Print the frequency table for the cat article first, then for the Felinae
# article.
for tokens in (cat_text, felinae_text):
    word_frequency(tokens)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment