# Appendix III: Code
## Data

```python
""" palindromes.parse: Module for parsing external data sources
"""
import enum
import string
from typing import List, Callable

try:
    import nltk
    from nltk.corpus import brown, cess_esp, indian
    from nltk.tokenize import sent_tokenize
except ImportError:
    print("NLTK is not installed. Please install it using: pip install nltk")
    exit()


class CORPORA(enum.Enum):
    ENGLISH = "english"
    SPANISH = "spanish"
    HINDI = "hindi"


def init():
    nltk.download('brown')
    nltk.download('cess_esp')
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('indian')


def _english() -> List[str]:
    """Retrieves and performs initial cleaning on sentences from the Brown (English) corpus."""
    # The Brown corpus is a collection of texts. Flatten them into single strings
    # before tokenizing into sentences to handle sentence breaks across text boundaries.
    raw_text = " ".join(" ".join(sent) for sent in brown.sents())
    sentences = sent_tokenize(raw_text)
    # Remove standard punctuation.
    translator = str.maketrans('', '', string.punctuation)
    return [s.translate(translator) for s in sentences]


def _spanish() -> List[str]:
    """Retrieves and performs initial cleaning on sentences from the CESS-ESP (Spanish) corpus."""
    # Spanish-specific punctuation to remove.
    spanish_punctuation = string.punctuation + "¡¿"
    translator = str.maketrans('', '', spanish_punctuation)
    sentences = [" ".join(sent) for sent in cess_esp.sents()]
    return [s.translate(translator) for s in sentences]


def _hindi() -> List[str]:
    """Retrieves and reconstructs sentences from the Indian (Hindi) corpus."""
    sentences = []
    current_sentence = []
    # The Hindi corpus uses '।' (a 'purna viram') as a sentence delimiter.
    for word in indian.words('hindi.pos'):
        if word == '।':
            if current_sentence:
                sentences.append(" ".join(current_sentence))
                current_sentence = []
        else:
            current_sentence.append(word)
    # Add the last sentence if the corpus doesn't end with a delimiter.
    if current_sentence:
        sentences.append(" ".join(current_sentence))
    return sentences


def _normalize(
    sentences: List[str], min_length: int, max_length: int
) -> List[str]:
    """
    Applies final normalization and length filtering to a list of sentences.

    - Converts to lowercase.
    - Removes extra whitespace.
    - Filters by character length.
    """
    cleaned_sentences = []
    for sentence in sentences:
        # Consolidate whitespace and convert to lowercase.
        normalized = " ".join(sentence.split()).lower()
        if min_length <= len(normalized) <= max_length:
            cleaned_sentences.append(normalized)
    return cleaned_sentences


# Function dispatch dictionaries
_CORPUS: dict[CORPORA, Callable[[], List[str]]] = {
    CORPORA.ENGLISH: _english,
    CORPORA.SPANISH: _spanish,
    CORPORA.HINDI: _hindi,
}

_LANGUAGE: dict[CORPORA, Callable[[], List[str]]] = {
    CORPORA.ENGLISH: lambda: brown.words(),
    CORPORA.SPANISH: lambda: cess_esp.words(),
    CORPORA.HINDI: lambda: indian.words('hindi.pos'),
}


def corpus(
    language: CORPORA = CORPORA.ENGLISH, min_length: int = 100, max_length: int = 200
) -> List[str]:
    """
    Extracts and cleans sentences from a specified NLTK corpus.

    Args:
        language: The language corpus to use (from the CORPORA enum).
        min_length: The minimum character length for a sentence to be included.
        max_length: The maximum character length for a sentence to be included.

    Returns:
        A list of cleaned and filtered sentences.

    Raises:
        ValueError: If an unsupported language is specified.
    """
    if language not in _CORPUS:
        supported = ", ".join(f"'{lang.value}'" for lang in CORPORA)
        raise ValueError(f"Invalid language. Choose from: {supported}.")
    # 1. Get raw sentences using the language-specific getter.
    raw_sentences = _CORPUS[language]()
    # 2. Apply common normalization and filtering logic.
    return _normalize(raw_sentences, min_length, max_length)


def language(length: int, language: CORPORA = CORPORA.ENGLISH) -> List[str]:
    """
    Extracts alphabetical words of a specific length from a corpus.

    Args:
        length: The desired length of the words.
        language: The language corpus to use.

    Returns:
        A list of words matching the criteria.

    Raises:
        ValueError: If an unsupported language is specified.
    """
    if language not in _LANGUAGE:
        supported = ", ".join(f"'{lang.value}'" for lang in CORPORA)
        raise ValueError(f"Invalid language. Choose from: {supported}.")
    # 1. Get the raw word list using the dispatcher.
    word_list = _LANGUAGE[language]()
    # 2. Filter the list based on length and alphabetic characters.
    return [
        word.lower()
        for word in word_list
        if len(word) == length and word.isalpha()
    ]


init()
```
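
A minimal usage sketch (assuming the module is importable as `palindromes.parse` and that the `nltk.download` calls in `init` succeed; the length bounds are illustrative):

```python
from palindromes import parse

# Sentences of 100 to 200 characters from the Brown corpus (the defaults).
english_sentences = parse.corpus(parse.CORPORA.ENGLISH)

# Shorter Spanish sentences, between 50 and 120 characters.
spanish_sentences = parse.corpus(parse.CORPORA.SPANISH, min_length=50, max_length=120)

# Every five-letter alphabetic word in the Hindi corpus.
hindi_words = parse.language(5, parse.CORPORA.HINDI)

print(len(english_sentences), english_sentences[0])
```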
## Functions

```python
import string
import typing

constants = {
    "delimiter": " ",
    "punctuation": string.punctuation
}
# FORMALIZING FUNCTIONS
## These functions convert the Python data structures into formal entities.
def _reindex(s: str, i: int) -> str:
    """
    Returns the i-th element of s, using the formal model's 1-based indexing.

    Args:
    - s: Any string.
    - i: Character index, starting at 1.
    """
    if i < 1:
        raise ValueError("Character Index starts at 1")
    if i > len(s):
        raise ValueError(f"Character Index ends at {len(s)}")
    return s[int(i - 1)]

def _depunctuate(s: str) -> str:
    """
    Removes all punctuation characters from s.

    Args:
    - s: Any string.
    """
    return "".join([a for a in s if a not in constants["punctuation"]])

# FORMALIZED FUNCTIONS
## These functions implement the operations of the formal model.
def invert(s: str) -> str:
    """Returns the inversion (reversal) of s."""
    return s[::-1]

def reduce(s: str) -> str:
    """
    Removes all delimiters (spaces) from s, yielding its reduction.

    Args:
    - s: Any string.
    """
    return "".join([a for a in s if a != constants["delimiter"]])

def delimiter_count(s: str) -> int:
    """
    Counts the delimiter characters in s.

    Args:
    - s: Any string.
    """
    return len([1 for a in s if a == constants["delimiter"]])

def words(
    s: str,
    language: typing.Callable = lambda _: True
) -> typing.List[str]:
    """
    Splits s on the delimiter and returns the words that belong to the language.

    Args:
    - s: Any string.
    - language: predicate that determines whether a given word belongs to the language.
    """
    result = []
    word = ""
    for a in s:
        if a != constants["delimiter"]:
            word += a
            continue
        # Apply the predicate to the completed word (not to the callable itself),
        # and skip the empty words produced by consecutive delimiters.
        if not is_empty(word) and language(word):
            result.append(word)
        word = ""
    if not is_empty(word) and language(word):
        result.append(word)
    return result
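
# Illustrative behavior of words() above: with the default predicate,
# words("never odd or even") == ["never", "odd", "or", "even"], while a
# predicate such as (lambda w: w != "odd") would filter out the second word.
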
def word_length(
    s: str,
    language: typing.Callable = lambda _: True
) -> int:
    """
    Counts the words in s that belong to the language.

    Args:
    - s: Any string.
    - language: predicate that determines whether a given word belongs to the language.
    """
    return len(words(s, language))

def pivot_char(s: str) -> typing.Optional[str]:
    """
    Returns the pivot character of s, or None if the central characters differ.

    Args:
    - s: Any string.
    """
    # Integer arithmetic: for even lengths the pivot pair straddles the
    # center; for odd lengths both indices name the middle character.
    if len(s) % 2 == 0:
        left_index, right_index = len(s) // 2, (len(s) + 2) // 2
    else:
        left_index, right_index = (len(s) + 1) // 2, (len(s) + 1) // 2
    if _reindex(s, left_index) == _reindex(s, right_index):
        return _reindex(s, left_index)
    return None

def pivot_word(
    s: str,
    language: typing.Callable = lambda _: True
) -> typing.Optional[str]:
    """
    Returns the pivot word of s, or None if the central words are not inversions of one another.

    Args:
    - s: Any string.
    - language: predicate that determines whether a given word belongs to the language.
    """
    w = words(s, language)
    if len(w) % 2 == 0:
        left_index, right_index = len(w) // 2, (len(w) + 2) // 2
    else:
        left_index, right_index = (len(w) + 1) // 2, (len(w) + 1) // 2
    if _reindex(w, left_index) == invert(_reindex(w, right_index)):
        return _reindex(w, left_index)
    return None
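
# Worked example (illustrative): for s = "step on no pets",
# words(s) == ["step", "on", "no", "pets"], an even count, so the pivot
# pair is the 2nd and 3rd words; "on" == invert("no"), hence
# pivot_word(s) == "on". Likewise len(s) == 15 is odd, and the 8th
# character is the space between "on" and "no", so pivot_char(s) == " ".
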
# TRUTH VALUE FUNCTIONS
## SYNTACTIC TRUTH VALUES
def is_empty(s: str) -> bool:
    """
    Determines whether s is the empty string (or missing entirely).

    Args:
    - s: Any string.
    """
    return s == "" or s is None

def is_palindrome(s: str) -> bool:
    """
    Determines whether the reduction of s equals its own inversion.

    Args:
    - s: Any string.
    """
    return reduce(s) == invert(reduce(s))

## SEMANTIC TRUTH VALUES
def is_subvertible(
    s: str,
    language: typing.Callable = lambda _: True
) -> bool:
    """
    Determines whether s has both a pivot character and a pivot word.

    Args:
    - s: Any string.
    - language: predicate that determines whether a given word belongs to the language.
    """
    return pivot_char(s) is not None and pivot_word(s, language) is not None

def is_invertible(
    s: str,
    language: typing.Callable = lambda _: True
) -> bool:
    """
    Determines whether the inversion of s still belongs to the language.

    Args:
    - s: Any string.
    - language: predicate that determines whether a given sentence belongs to the language.
    """
    return language(invert(s))

# EXTENSIONAL FUNCTIONS
def invertible_sentences(c: typing.List[str]) -> typing.List[str]:
    return [z for z in c if is_invertible(z)]

def subvertible_sentences(c: typing.List[str]) -> typing.List[str]:
    return [z for z in c if is_subvertible(z)]

def palindromes(c: typing.List[str]) -> typing.List[str]:
    return [z for z in c if is_palindrome(z)]
```
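
A short demonstration of how these functions compose with the corpus data (a sketch: the `palindromes.functions` import path is an assumption, as is the availability of the `parse` module above):

```python
from palindromes import parse
from palindromes import functions  # assumed module path for the code above

# A classic character palindrome with a word-level pivot.
print(functions.is_palindrome("step on no pets"))  # True
print(functions.pivot_word("step on no pets"))     # "on"
print(functions.words("borrow or rob"))            # ["borrow", "or", "rob"]

# Filter a real corpus down to its palindromic sentences.
sentences = parse.corpus(parse.CORPORA.ENGLISH, min_length=10, max_length=60)
print(len(functions.palindromes(sentences)))
```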
## Graphs
TODO