Appendix III: Code

Contents

Appendix III: Code#

Data#

""" palindromes.parse: Module for parsing external data sources
"""
import enum
import string
from typing import List, Callable

# Hard dependency guard: every public function in this module uses NLTK.
try:
    import nltk
    from nltk.corpus import brown, cess_esp, indian
    from nltk.tokenize import sent_tokenize
except ImportError:
    print("NLTK is not installed. Please install it using: pip install nltk")
    # The original called the site-injected `exit()` helper, which is absent
    # under `python -S` and also terminated with status 0.  Raise SystemExit
    # directly with a nonzero status so a missing dependency reads as failure.
    raise SystemExit(1)



# Closed set of supported corpus languages; each member's value is the
# lowercase language name surfaced in user-facing error messages.
CORPORA = enum.Enum(
    "CORPORA",
    [("ENGLISH", "english"), ("SPANISH", "spanish"), ("HINDI", "hindi")],
)


def init():
    """Download every NLTK resource this module depends on."""
    # punkt/punkt_tab back sent_tokenize; the rest are the corpora themselves.
    for resource in ("brown", "cess_esp", "punkt", "punkt_tab", "indian"):
        nltk.download(resource)


def _english() -> List[str]:
    """Fetch Brown-corpus sentences, re-tokenized and stripped of punctuation."""
    # Flatten the whole corpus into one string first so that sentence
    # re-tokenization is not constrained by the corpus' own text boundaries.
    flattened = " ".join(" ".join(words) for words in brown.sents())
    strip_punct = str.maketrans('', '', string.punctuation)
    return [sentence.translate(strip_punct) for sentence in sent_tokenize(flattened)]


def _spanish() -> List[str]:
    """Fetch CESS-ESP (Spanish) sentences with punctuation removed."""
    # Spanish adds inverted question/exclamation marks on top of ASCII punctuation.
    strip_punct = str.maketrans('', '', string.punctuation + "¡¿")
    return [" ".join(words).translate(strip_punct) for words in cess_esp.sents()]


def _hindi() -> List[str]:
    """Reconstruct sentences from the Indian (Hindi) POS-tagged word stream."""
    # '।' (the 'purna viram') is the Devanagari full stop; the corpus is a
    # flat word stream, so sentences are rebuilt by splitting on it.
    sentences: List[str] = []
    buffer: List[str] = []
    for token in indian.words('hindi.pos'):
        if token != '।':
            buffer.append(token)
        elif buffer:
            sentences.append(" ".join(buffer))
            buffer = []

    # Flush a trailing sentence that lacked a closing delimiter.
    if buffer:
        sentences.append(" ".join(buffer))

    return sentences


def _normalize(
    sentences: List[str], min_length: int, max_length: int
) -> List[str]:
    """
    Applies final normalization and length filtering to a list of sentences.
    
    - Converts to lowercase.
    - Removes extra whitespace.
    - Filters by character length.
    """
    cleaned_sentences = []
    for sentence in sentences:
        # Consolidate whitespace and convert to lowercase.
        normalized = " ".join(sentence.split()).lower()
        if min_length <= len(normalized) <= max_length:
            cleaned_sentences.append(normalized)
    return cleaned_sentences


# Function dispatch dictionaries

# Dispatch table: CORPORA member -> language-specific sentence extractor.
_CORPUS: dict[CORPORA, Callable[[], List[str]]] = {
    CORPORA.ENGLISH: _english,
    CORPORA.SPANISH: _spanish,
    CORPORA.HINDI: _hindi,
}

# Dispatch table: CORPORA member -> raw word-list getter.
# NOTE(review): annotated List[str], but NLTK word readers return lazy corpus
# views; they behave like sequences of str — confirm if the exact type matters.
_LANGUAGE: dict[CORPORA, Callable[[], List[str]]] = {
    CORPORA.ENGLISH: lambda: brown.words(),
    CORPORA.SPANISH: lambda: cess_esp.words(),
    CORPORA.HINDI:   lambda: indian.words('hindi.pos'),
}


def corpus(
    language: CORPORA = CORPORA.ENGLISH, min_length: int = 100, max_length: int = 200
) -> List[str]:
    """
    Extract cleaned, length-filtered sentences from one of the NLTK corpora.

    Args:
        language: Which corpus to read (a CORPORA member).
        min_length: Inclusive lower bound on sentence character length.
        max_length: Inclusive upper bound on sentence character length.

    Returns:
        The cleaned sentences that fall within the length bounds.

    Raises:
        ValueError: If `language` has no registered corpus getter.
    """
    if language not in _CORPUS:
        supported = ", ".join(f"'{member.value}'" for member in CORPORA)
        raise ValueError(f"Invalid language. Choose from: {supported}.")

    # Dispatch to the language-specific getter, then normalize uniformly.
    return _normalize(_CORPUS[language](), min_length, max_length)


def language(length: int, language: CORPORA = CORPORA.ENGLISH) -> List[str]:
    """
    Collect the purely alphabetic words of exactly `length` characters.

    Args:
        length: Required word length.
        language: Which corpus to read (a CORPORA member).

    Returns:
        Matching words, lowercased.

    Raises:
        ValueError: If `language` has no registered word-list getter.
    """
    if language not in _LANGUAGE:
        supported = ", ".join(f"'{member.value}'" for member in CORPORA)
        raise ValueError(f"Invalid language. Choose from: {supported}.")

    matches = []
    for word in _LANGUAGE[language]():
        if word.isalpha() and len(word) == length:
            matches.append(word.lower())
    return matches

# Download the required NLTK resources at import time (network side effect).
init()

Functions#

import string
import typing

# Shared symbols of the formal string model: the word separator and the
# punctuation set that _depunctuate strips.
constants = {
    "delimiter": " ",
    "punctuation": string.punctuation,
}


# FORMALIZING FUNCTIONS
## These functions convert the Python data structures into formal entities.

def _reindex(s: str, i: int)                        -> str:
    """
    Args:
        - s: Any string.
        - i: Character index, starting at 1.
    """
    if i < 1:
        raise ValueError("Character Index starts at 1")
    
    if i > len(s):
        raise ValueError(f"Character Index ends at {len(s)}")
    
    return s[int(i-1)]


def _depunctuate(s: str) -> str:
    """
    Strip every character listed in constants["punctuation"] from *s*.

    Args:
        - s: Any string.
    """
    banned = set(constants["punctuation"])
    return "".join(c for c in s if c not in banned)


# FORMALIZED FUNCTIONS
## These functions implement various functions of the formal model.

def invert(s: str) -> str:
    """Return *s* reversed (the formal inversion operation)."""
    return "".join(reversed(s))


def reduce(s: str) -> str:
    """
    Remove every delimiter character from *s* (the formal reduction).

    Args:
        - s: Any string.
    """
    delimiter = constants["delimiter"]
    return "".join(c for c in s if c != delimiter)


def delimiter_count(s: str) -> int:
    """
    Count occurrences of the delimiter character in *s*.

    Args:
        - s: Any string.
    """
    return sum(1 for a in s if a == constants["delimiter"])


# Backward-compatible alias: the original public name contained a typo
# ("delimter"); keep it working for existing callers.
delimter_count = delimiter_count


def words(
    s                                               : str, 
    language                                        : typing.Callable = lambda _: True
)                                                   -> list:
    """
    Split *s* on the delimiter and keep the words accepted by *language*.

    Args:
        - s: Any string.
        - language: function that when a word is inputted determines if the word belongs to a given language. 

    Note:
        The previous implementation tested the truthiness of the *language*
        callable itself (``if language:``) instead of applying it, so the
        predicate was never consulted.  Words are now filtered through
        ``language(word)``; with the default predicate (always True) the
        behavior is unchanged.
    """
    accepted                                        = []
    word                                            = ""

    for a in s:
        if a != constants["delimiter"]:
            word                                    += a
            continue

        # A delimiter closes the current word; keep it only if the
        # language predicate accepts it.
        if language(word):
            accepted.append(word)

        word                                        = ""

    # Flush a trailing word that was not followed by a delimiter.
    if not is_empty(word) and language(word):
        accepted.append(word)
        
    return accepted


def word_length(
    s                                               : str, 
    language                                        : typing.Callable = lambda _: True
)                                                   -> int: 
    """
    Count the words of *s* that belong to the given language.

    Args:
        - s: Any string.
        - language: function that when a word is inputted determines if the word belongs to a given language. 
    """
    # Delegate the splitting/filtering to words() and measure the result.
    recognized = words(s, language=language)
    return len(recognized)


def pivot_char(s: str)                              -> str:
    """
    Return the central character of *s*, or None if the two middle
    characters of an even-length string differ.

    Args:
        - s: Any string.
    """
    n = len(s)
    # Integer arithmetic yields the same 1-based indices as the original
    # float division, since _reindex truncates its index to int.
    if n % 2 == 0:
        left, right = n // 2, (n + 2) // 2
    else:
        left, right = (n + 1) // 2, (n + 1) // 2

    if _reindex(s, left) != _reindex(s, right):
        return None
    return _reindex(s, left)


def pivot_word(
    s                                               : str, 
    language                                        : typing.Callable = lambda _: True
)                                                   -> str:
    """
    Return the central word of *s* when it equals the inversion of its
    mirror word; otherwise None.

    Args:
        - s: Any string.
        - language: function that when a word is inputted determines if the word belongs to a given language. 
    """
    sentence_words = words(s, language)
    n = len(sentence_words)
    # Same 1-based middle indices as pivot_char, but over the word list.
    if n % 2 == 0:
        left, right = n // 2, (n + 2) // 2
    else:
        left, right = (n + 1) // 2, (n + 1) // 2

    if _reindex(sentence_words, left) != invert(_reindex(sentence_words, right)):
        return None
    return _reindex(sentence_words, left)


# TRUTH VALUE FUNCTIONS

## SYNTACTIC TRUTH VALUES

def is_empty(s: str) -> bool:
    """
    True when *s* carries no content (None or the empty string).

    Args:
        - s: Any string.
    """
    return s is None or s == ""


def is_palindrome(s: str)                           -> bool:
    """
    True when *s*, with delimiters removed, reads the same in both directions.

    Args:
        - s: Any string.
    """
    reduced = reduce(s)
    return reduced == invert(reduced)


## SEMANTIC TRUTH VALUES

def is_subvertible(
    s                                               : str, 
    language                                        : typing.Callable = lambda _: True
)                                                   -> bool:
    """
    True when *s* has both a pivot character and a pivot word.

    Args:
        - s: Any string.
        - language: function that when a word is inputted determines if the word belongs to a given language. 
    """
    # Short-circuit on the cheaper character-level check first.
    if pivot_char(s) is None:
        return False
    return pivot_word(s, language) is not None


def is_invertible(
    s                                               : str,
    language                                        : typing.Callable = lambda _: True
)                                                   -> bool:
    """
    Apply the language predicate to the inversion of *s*.

    Args:
        - s: Any string.
        - language: function that when a word is inputted determines if the word belongs to a given language. 
    """
    inverted = invert(s)
    return language(inverted)


# EXTENSIONAL FUNCTIONS

def invertible_sentences(c: typing.List[str])       -> typing.List[str]:
    """Filter *c* down to the sentences that are invertible."""
    return list(filter(is_invertible, c))


def subvertible_sentences(c: typing.List[str])      -> typing.List[str]:
    """Filter *c* down to the sentences that are subvertible."""
    return list(filter(is_subvertible, c))


def palindromes(c: typing.List[str])                -> typing.List[str]:
    """Filter *c* down to the sentences that are palindromes."""
    return list(filter(is_palindrome, c))

Graphs#

TODO