Created
August 19, 2015 12:28
-
-
Save magnusnissel/0c7d0d69f288240ba548 to your computer and use it in GitHub Desktop.
A Python 3 function that tries to guess the number of syllables in an English token
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
""" | |
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
""" | |
# One or more consecutive vowels; each cluster counts as one syllable nucleus.
_VOWEL_CLUSTER_RE = re.compile(r"[aeiou]+")
# A single final "e" preceded by a non-vowel. A doubled "ee" (as in "coffee")
# is deliberately NOT matched, because it is pronounced.
_FINAL_E_RE = re.compile(r"[^aeiou]e\b")
# Ending in two or more non-vowels + "y", or a whole word made of non-vowels
# followed by "y" (e.g. "by", "my").
_FINAL_CONS_Y_RE = re.compile(r"([^aeiou]{2,}y\b)|(\b[^aeiou]+y\b)")
# Vowel clusters that may be one or two syllables ("true" vs. "fluent").
_RISKY_CLUSTERS = frozenset(("ue", "ei"))


def guess_syllables(token_str):
    """Guess the number of syllables in an English word using fixed rules.

    The heuristics are imperfect by design; the second return value counts
    how many ambiguous rules were involved so callers can flag tokens for
    post-correction.

    Args:
        token_str: The word to analyse. It is lower-cased before matching.

    Returns:
        A ``(guess, risk)`` tuple of ints: the estimated syllable count and
        the number of uncertain decisions that went into it.
    """
    token_str = token_str.lower()

    # Start with one syllable per vowel cluster.
    clusters = _VOWEL_CLUSTER_RE.findall(token_str)
    guess = len(clusters)
    risk = sum(1 for cluster in clusters if cluster in _RISKY_CLUSTERS)

    # --- START of token --- #
    # Starts with "rei" => assume "re-i", i.e. the "ei" cluster is really two
    # syllables, so ADD one (fails for "reign" etc.).
    if token_str.startswith("rei"):
        guess += 1

    # --- END of token --- #
    if _FINAL_CONS_Y_RE.search(token_str):
        # Ends with consonants + "y" => treat "y" as a vowel and add a syllable.
        guess += 1
        risk += 1
    elif guess > 1 and _FINAL_E_RE.search(token_str):
        # Ends with a single "e" (not "ee") after a non-vowel => assume the
        # "e" is silent and remove one. With only one vowel cluster the final
        # "e" is the syllable itself (e.g. "the"), so it is kept.
        guess -= 1

    # Open question: is it worth / possible to make a good rule for "ue"
    # (e.g. "true") vs. "u-e" (e.g. "fluent")?
    return guess, risk
def run_demo():
    """Print a syllable guess for every single-word WordNet synset name.

    For each synset returned by ``wordnet.all_synsets()``, the part of its
    name before the first "." is used as the token. Tokens containing an
    underscore (multi-word expressions) are skipped. Each remaining token is
    printed with its guess and risk, separated by tabs.

    If NLTK or its wordnet corpus is unavailable, a hint is printed instead.
    """
    try:
        from nltk.corpus import wordnet as wn
        # NLTK loads corpora lazily; a missing wordnet download is reported
        # as LookupError on first access rather than as ImportError.
        synsets = wn.all_synsets()
    except (ImportError, LookupError):
        print("This demo requires the wordnet data from the NLTK module")
        return

    # Kept outside the try so that errors raised while processing tokens are
    # not mistaken for a missing NLTK installation.
    for syn in synsets:
        token_str = syn.name().split(".")[0]
        if "_" in token_str:
            continue
        guess, risk = guess_syllables(token_str)
        print(token_str, "\t", guess, "\t", risk)
# Run the WordNet demo only when executed as a script, not when imported.
if __name__ == "__main__":
    run_demo()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment