Skip to content

Instantly share code, notes, and snippets.

@magnusnissel
Created August 19, 2015 12:28
Show Gist options
  • Save magnusnissel/0c7d0d69f288240ba548 to your computer and use it in GitHub Desktop.
Save magnusnissel/0c7d0d69f288240ba548 to your computer and use it in GitHub Desktop.
A Python 3 function that tries to guess the number of syllables in an English token
import re
"""
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
"""
# Patterns used by guess_syllables(); compiled once at import time.
# A run of one or more consecutive vowels ("y" is not counted as a vowel here).
_VOWEL_CLUSTER_RE = re.compile(r"[aeiou]+")
# A word ending in a non-vowel character followed by a single "e" ("make").
# A double "ee" ending ("agree") does not match, so it keeps its syllable.
_SILENT_FINAL_E_RE = re.compile(r"[^aeiou]e\b")
# A word ending in two or more non-vowel characters + "y" ("happy"), or a
# word made up only of non-vowel characters + "y" ("my").
_FINAL_CONS_Y_RE = re.compile(r"([^aeiou]{2,}y\b)|(\b[^aeiou]+y\b)")


def guess_syllables(token_str):
    """Guess the number of syllables in an English token using fixed rules.

    The rules are heuristics, so the result can be wrong; the second return
    value counts how many error-prone rules were involved. Postcorrection of
    the results is advised.

    Args:
        token_str: The token to analyse. It is lower-cased before matching.

    Returns:
        A ``(guess, risk)`` tuple of ints: the estimated syllable count and
        the number of risky rules/clusters encountered.

    >>> guess_syllables("make")
    (1, 0)
    >>> guess_syllables("agree")
    (2, 0)
    >>> guess_syllables("happy")
    (2, 1)
    """
    token_str = token_str.lower()
    clusters = _VOWEL_CLUSTER_RE.findall(token_str)

    # Start with one syllable per vowel cluster.
    guess = len(clusters)
    # "ue" and "ei" may be one syllable ("true") or two ("fluent"); no rule
    # here tells them apart, so each such cluster only raises the risk.
    risk = sum(1 for cluster in clusters if cluster in ("ue", "ei"))

    # --- START of token --- #
    # Starts with "rei" => assume the prefix "re-" + "i..." and add a syllable
    # for the split cluster (this over-counts words such as "reign").
    if token_str.startswith("rei"):
        guess += 1

    # --- END of token --- #
    if _FINAL_CONS_Y_RE.search(token_str):
        # Ends with consonants + y => treat the y as a vowel: one more syllable.
        guess += 1
        risk += 1
    elif guess > 1 and _SILENT_FINAL_E_RE.search(token_str):
        # Ends with consonant + single "e" => assume the "e" is silent and
        # drop its syllable. If it is the only vowel ("the"), keep it.
        guess -= 1

    return guess, risk
def run_demo():
    """Print a syllable guess for every single-word WordNet synset name.

    For each synset, the part of its name before the first "." is used as
    the token. Tokens containing an underscore are skipped. Each remaining
    token is printed with its guessed syllable count and risk value.

    If NLTK is not installed, or its WordNet data is unavailable, a short
    hint is printed instead and the function returns.
    """
    try:
        from nltk.corpus import wordnet as wn
        # NLTK loads corpora lazily: missing data surfaces as LookupError on
        # first access to the corpus object, not as ImportError.
        synsets = wn.all_synsets()
    except (ImportError, LookupError):
        print("This demo requires the wordnet data from the NLTK module")
        return

    for syn in synsets:
        token_str = syn.name().split(".")[0]
        if "_" in token_str:
            # Presumably a multi-word entry; the guesser works on single tokens.
            continue
        guess, risk = guess_syllables(token_str)
        print(token_str, "\t", guess, "\t", risk)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_demo()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment