Here’s a Pig Latin dialect that takes into account how words are pronounced:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Demo: translate a few sample sentences word by word with to_piglatin()
# and print each original sentence next to its translation.
import re

# The capture group makes split() keep the runs of non-word characters, so
# punctuation is passed through to_piglatin() and re-joined in place.
# Raw string: "\W" in a plain literal is an invalid escape sequence.
_NONWORD_RUN = re.compile(r"(\W+)")

sentences = [
    "Pig qoph an egg.",
    "Quiet European rhythms.",
    "My nth happy hour.",
    "Herb unit -- a dynasty heir.",
]

for sent in sentences:
    # Split on whitespace first, then split each chunk into word and
    # non-word pieces; every piece is translated and glued back together.
    entsay = " ".join(
        "".join(map(to_piglatin, _NONWORD_RUN.split(nonws)))
        for nonws in sent.split()
    )
    print(u'"{}" → "{}"'.format(sent, entsay))
Output
"Pig qoph an egg." → "igpay ophqay anway eggway." "Quiet European rhythms." → "ietquay uropeaneay ythmsrhay." "My nth happy hour." → "ymay nthway appyhay hourway." "Herb unit -- a dynasty heir." → "herbway itunay -- away ynastyday heirway."
Note:
- the "-way" suffix is used for words that start with a vowel sound
- “qu” in “quiet” is treated as a unit
- “European”, “unit” start with a consonant sound
- “y” in “rhythms”, “dynasty” is a vowel
- “nth”, “hour”, “herb”, “heir” start with a vowel sound
where to_piglatin() is:
# The default pronunciation data comes from NLTK's cmudict corpus:
#   $ pip install nltk
#   $ python -c "import nltk; nltk.download('cmudict')"

# Filled in by _default_pronunciations() the first time it is needed.
_cmudict_cache = None


def _default_pronunciations():
    """Return the cmudict word -> pronunciations mapping, loading it once."""
    global _cmudict_cache
    if _cmudict_cache is None:
        # Imported lazily so that importing this module neither requires
        # nltk nor parses the whole corpus when a caller supplies its own
        # pronunciations mapping.
        from nltk.corpus import cmudict
        _cmudict_cache = cmudict.dict()
    return _cmudict_cache


def to_piglatin(word, pronunciations=None):
    """Translate a single word to Pig Latin based on how it is pronounced.

    word -- the token to translate; it is lowercased first
        (NOTE: plain lower(), Unicode casefold is ignored).
    pronunciations -- mapping of lowercase word -> list of pronunciations,
        each a sequence of phoneme strings in which vowel phonemes end
        with a digit.  Defaults to the cmudict corpus, loaded on first use.

    Only the first listed pronunciation is consulted.  If it starts with a
    vowel sound the result is word + "way".  Otherwise the index of the
    first vowel phoneme is used as the position from which
    to_piglatin_naive() looks for the first vowel letter, with 'y' counted
    as a vowel when the word contains one.  Words that are unknown, or
    whose pronunciation has no vowel phoneme, are translated purely by
    spelling.
    """
    if pronunciations is None:
        pronunciations = _default_pronunciations()
    word = word.lower()

    variants = pronunciations.get(word)
    if not variants:
        # Unknown word: nothing to go on but the spelling.
        return to_piglatin_naive(word)

    phonemes = variants[0]  # use only the first pronunciation
    first_vowel = next(
        (idx for idx, ph in enumerate(phonemes) if ph[-1:].isdigit()),
        None,
    )
    if first_vowel is None:
        # No vowel sound listed; fall back to spelling instead of failing.
        return to_piglatin_naive(word)
    if first_vowel == 0:
        # Starts with a vowel sound, whatever the first letter is.
        return word + "way"

    # Allow 'y' as a vowel for known words that contain it.
    vowels = "aeiouy" if "y" in word else "aeiou"
    return to_piglatin_naive(word, vowels=vowels, start=first_vowel)


def to_piglatin_naive(word, vowels="aeiou", start=0):
    """Translate a word to Pig Latin using spelling only.

    word -- the token to translate; it is lowercased first.
    vowels -- characters treated as vowels.
    start -- index from which to start looking for the first vowel;
        everything before the vowel found is moved to the end.

    Returns the rotated word.  "w" is inserted when the word begins with a
    vowel, and "ay" is appended only when the word is alphanumeric, so
    punctuation tokens and the empty string come back unchanged.  If no
    vowel is found at or after start, the word is left unrotated.
    """
    word = word.lower()
    for i, c in enumerate(word[start:], start=start):
        if c in vowels:
            break
    else:
        # No vowel found (this also covers start at or past the end of
        # the word): move nothing.
        i = len(word)
    # i == 0 with a non-empty word can only mean a vowel in first position.
    glide = "w" if (i == 0 and word) else ""
    suffix = "ay" if word.isalnum() else ""
    return word[i:] + word[:i] + glide + suffix
To split the text into sentences and words, you could use nltk tokenizers. The code could also be modified to respect letter case (uppercase/lowercase) and contractions.