Section IX: Code
Main
""" palindromes.main: Main module.
"""
import json
# application modules
import estimators
import parse
import graphs
import model
def write(data, file_name):
with open(file_name, "w") as outfile:
json.dump(data, outfile)
def update_posterior(p_values, prior_probs, sentence, likelihood_func):
"""
Updates the prior distribution based on the observed sentence.
Args:
p_values: The values of p for which the prior is defined.
prior_probs: The prior probabilities for each value of p.
sentence: The observed sentence (string).
        likelihood_func: A function likelihood_func(n, z, p) that returns the likelihood of
            observing z delimiters in a sentence of length n given delimiter probability p.
Returns:
A new list representing the updated posterior distribution over p values.
"""
n = len(sentence)
z = sentence.count(' ') # Count of delimiters in the sentence
# Calculate likelihood for the observed sentence length and each value of p
likelihoods = [likelihood_func(n, z, p_val) for p_val in p_values]
# Calculate the denominator P(ζ) using the law of total probability
p_zeta = sum(l * p for l, p in zip(likelihoods, prior_probs))
# Update the prior based on the likelihood and the normalizing constant
    if p_zeta == 0:
        return [0 for _ in prior_probs]
    posterior_probs = [(l * p) / p_zeta for l, p in zip(likelihoods, prior_probs)]
return posterior_probs
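# Illustrative usage sketch (assumptions: a Beta(2, 10) prior and the binomial likelihood
# defined in the estimators module; the sample sentence is arbitrary):
#   p_values, prior_probs = estimators.beta_prior(alpha=2, beta=10)
#   posterior = update_posterior(p_values, prior_probs,
#                                "a sample sentence", estimators.binomial_likelihood)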
def analyze_sentence_integrals(sentences, sentence_length):
"""
Analyzes the Left and Right-Hand Sentence Integrals of sentences in a corpus.
Args:
sentences: The list of sentences.
sentence_length: The desired sentence length.
Returns:
A tuple containing two lists:
- left_integrals: A list of Left-Hand Sentence Integrals.
- right_integrals: A list of Right-Hand Sentence Integrals.
"""
left_integrals = []
right_integrals = []
if not sentences:
return left_integrals, right_integrals
for sentence in sentences:
if len(sentence) == sentence_length:
left_integrals.append(model.lefthand_integral(sentence, sentence_length))
right_integrals.append(model.righthand_integral(sentence, sentence_length))
return left_integrals, right_integrals
def analyze_delimiter_densities(sentences, min_length, max_length):
"""
Iterates over sentence lengths, analyzes Sentence Integrals, and calculates delimiter densities.
Args:
sentences: The list of sentences.
min_length: The minimum sentence length to analyze.
max_length: The maximum sentence length to analyze.
Returns:
A dictionary containing the statistics for each sentence length and a list of delimiter densities.
"""
delimiter_densities = []
for length in range(min_length, max_length + 1):
left_integrals, right_integrals = model.sentence_integrals(sentences, length)
if not left_integrals and not right_integrals:
continue
left_stats = estimators.summarize(left_integrals)
right_stats = estimators.summarize(right_integrals)
# Calculate delimiter densities based on mean integral values
d_left = model.delimiter_density(left_stats["mean"], length)
d_right = model.delimiter_density(right_stats["mean"], length)
delimiter_densities.append({
"sentence_length": length,
"n": left_stats["number of samples"], # Assuming n is the same for both left and right
"left": left_stats["mean"],
"right": right_stats["mean"],
"stdev(left)": left_stats["stdev"],
"stdev(right)": right_stats["stdev"],
"d_left": d_left,
"d_right": d_right,
"stdev(d_left)": None, # Placeholder for now
"stdev(d_right)": None, # Placeholder for now
})
return delimiter_densities
def analyze_languages(min_length, max_length):
"""
Analyzes delimiter densities for English, Spanish, and Hindi corpora.
Args:
min_length: The minimum sentence length to analyze.
max_length: The maximum sentence length to analyze.
Returns:
A list of dictionaries, where each dictionary contains the results of the difference of means tests for a specific sentence length.
"""
languages = ["english", "spanish", "hindi"]
all_corpora_delimiter_data = {}
for language in languages:
        sentences = parse.corpus(min_length, max_length, parse.CORPORA(language))
        all_corpora_delimiter_data[language] = analyze_delimiter_densities(sentences,
                                                                           min_length,
                                                                           max_length)
comparison_results = []
for length in range(min_length, max_length + 1):
# Check if data exists for all languages at this length
if not all(any(d["sentence_length"] == length for d in all_corpora_delimiter_data[lang]) for lang in languages):
continue
english_data = all_corpora_delimiter_data["english"]
spanish_data = all_corpora_delimiter_data["spanish"]
hindi_data = all_corpora_delimiter_data["hindi"]
# Find the data for the current length in each language
english_stats = next((d for d in english_data if d["sentence_length"] == length), None)
spanish_stats = next((d for d in spanish_data if d["sentence_length"] == length), None)
hindi_stats = next((d for d in hindi_data if d["sentence_length"] == length), None)
if not english_stats or not spanish_stats or not hindi_stats:
continue
result = {
"sentence_length": length,
"n": english_stats["n"], # Assuming n is the same across languages for a given length
"comparisons": {}
}
# Perform comparisons and store results
comparisons = [("spanish", "english"), ("spanish", "hindi"), ("hindi", "english")]
for lang1, lang2 in comparisons:
            stats_by_language = {"english": english_stats, "spanish": spanish_stats, "hindi": hindi_stats}
            data1 = stats_by_language[lang1]
            data2 = stats_by_language[lang2]
t_left, p_left = estimators.difference_of_means_test(data1["left"],
data1["stdev(left)"],
data1["n"],
data2["left"],
data2["stdev(left)"],
data2["n"])
t_right, p_right = estimators.difference_of_means_test(data1["right"],
data1["stdev(right)"],
data1["n"],
data2["right"],
data2["stdev(right)"],
data2["n"])
result["comparisons"][f"{lang1}-{lang2}"] = {
"t_left": t_left,
"p_left": p_left,
"t_right": t_right,
"p_right": p_right,
}
comparison_results.append(result)
return comparison_results
def analyze_sentence_lengths(min_length, max_length):
"""
Analyzes sentence lengths for English, Spanish, and Hindi corpora.
Args:
min_length: The minimum sentence length to analyze.
max_length: The maximum sentence length to analyze.
Returns:
A dictionary containing the length frequencies and mean lengths for each corpus.
"""
corpora = [parse.CORPORA.ENGLISH, parse.CORPORA.SPANISH, parse.CORPORA.HINDI]
results = {}
for corpus in corpora:
sentences = parse.corpus(min_length, max_length, corpus)
length_freq = estimators.length_frequencies(sentences)
mean_length = estimators.sample_mean_freq(length_freq)
graphs.length_histogram(length_freq, mean_length)
results[corpus.value] = {
"length_frequencies": length_freq,
"mean_length": mean_length,
}
return results
def analyze_delimiter_posterior(min_length, max_length):
    """
    Updates a Beta prior over the delimiter probability p using the observed sentences
    and plots the resulting posterior distribution.
    Args:
        min_length: The minimum sentence length to analyze.
        max_length: The maximum sentence length to analyze.
    Returns:
        The posterior probabilities over the discretized values of p.
    """
    cleaned_sentences = parse.corpus(min_length, max_length)
    p_values, prior_probs = estimators.beta_prior(alpha=2, beta=10)
    # Iterate through the sentences and update the posterior after each observation
    posterior_probs = prior_probs.copy()
    for sentence in cleaned_sentences:
        if min_length <= len(sentence) <= max_length:
            posterior_probs = update_posterior(p_values,
                                               posterior_probs,
                                               sentence,
                                               estimators.binomial_likelihood)
midpoint = int((min_length + max_length)/2)
graphs.posterior_delimiter_histogram(p_values, posterior_probs, midpoint)
return posterior_probs
def analyze_delimiter_distribution(min_length, max_length):
"""
Analyzes the distribution of delimiter indices in sentences of varying lengths across different corpora.
Args:
min_length: The minimum sentence length to analyze.
max_length: The maximum sentence length to analyze.
Returns:
A dictionary containing the delimiter index frequency distributions for each language and sentence length.
"""
corpora = [parse.CORPORA.ENGLISH, parse.CORPORA.SPANISH, parse.CORPORA.HINDI]
results = {}
for corpus in corpora:
results[corpus.value] = {}
sentences = parse.corpus(min_length, max_length, corpus)
for sentence in sentences:
delimiter_indices = model.delimit(sentence)
length = len(sentence)
if length not in results[corpus.value]:
results[corpus.value][length] = {}
for index in delimiter_indices:
results[corpus.value][length][index] = results[corpus.value][length].get(index, 0) + 1
graphs.delimiter_histogram(results)
return results
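# The returned structure is nested as {language: {sentence_length: {delimiter_index: count}}};
# for example, results["english"][100][17] would hold how often index 17 is a delimiter in
# English sentences of length 100 (the keys shown here are illustrative).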
def analyze_sentence_delimiters(sentence):
"""
Analyzes the delimiter distribution in a sentence.
Args:
sentence: The input sentence (string).
"""
delimiter_indices = model.delimit(sentence)
graphs.delimiter_barchart(delimiter_indices, sentence)
def analyze_conditional_word_probability(length, condition, offset=0):
    """
    Calculates the frequency distribution of characters at a specific position
    in words of a given length that start with a given condition.
    Args:
        length: The desired length of the words.
        condition: The starting string condition (e.g., "da").
        offset: Offset added to the position immediately after the condition (default 0).
    Returns:
        A dictionary representing the frequency distribution of characters
        at the position len(condition) + offset.
    """
    if len(condition) + offset >= length:
        raise ValueError("The conditioned position must fall inside the word.")
    # Get all words of the desired length from the Brown corpus (lowercased)
    # that start with the given condition
words = parse.words(length)
all_words = set(
word.lower()
for word in words
if word.startswith(condition)
)
# Calculate the position after the condition
position = len(condition) + offset
# Create a frequency distribution of characters at the specified position
freq_dist = {}
for word in all_words:
char = word[position]
freq_dist[char] = freq_dist.get(char, 0) + 1
graphs.conditional_character_histogram(freq_dist, length, condition, position)
return freq_dist
if __name__ == "__main__":
length = 4
condition = "wor"
freq_dist = analyze_conditional_word_probability(length, condition, 0)
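# The remaining entry points can be driven in the same way; a minimal, illustrative sketch
# (the sentence-length bounds and the output file name are assumptions):
#   parse.init()
#   write(analyze_sentence_lengths(100, 200), "lengths.json")
#   comparisons = analyze_languages(100, 120)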
Model
""" palindromes.module: Module containing the results and theorems of the formal system.
"""
import string
def invert(sentence):
return sentence[::-1]
def sigma_reduce(sentence):
# Remove punctuation (except spaces) and convert to lowercase
processed_sentence = "".join(
c for c in sentence if c not in string.punctuation or c == " "
)
processed_sentence = " ".join(processed_sentence.split()).lower()
# Calculate the sigma-reduction (remove spaces)
sigma_reduced_sentence = "".join(c for c in processed_sentence if c != " ")
return sigma_reduced_sentence
def delimiter_count(char):
"""
Calculates the delimiter count of a single character.
Args:
char: The character to check.
Returns:
1 if the character is a delimiter (space), 0 otherwise.
"""
return 1 if char == ' ' else 0
def delimiter_density(mean_integral_value, sentence_length):
"""
Calculates the delimiter density (d) based on the mean Sentence Integral value and sentence length.
Args:
mean_integral_value: The mean value of the Sentence Integral (either Left or Right).
sentence_length: The length of the sentences.
Returns:
The estimated delimiter density (d).
"""
if sentence_length < 1:
return None
    # From the earlier approximation: E[Ω₋(ζ, l(ζ))] ≈ d * (l(ζ) + 1) / 2,
    # and the observed mean integral value estimates E[Ω₋(ζ, l(ζ))], so:
d = (2 * mean_integral_value) / (sentence_length + 1)
return d
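# Worked example (illustrative): for sentences of length 9 with a mean Left-Hand
# Integral of 1.0, d = (2 * 1.0) / (9 + 1) = 0.2, i.e. roughly one character in
# five is expected to be a delimiter.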
def delimit(sentence):
"""
Returns a list of delimiter indices in a sentence.
Args:
sentence: The input sentence (string).
Returns:
A list of integers, where each integer is the index of a delimiter in the sentence.
"""
delimiter_indices = []
for i, char in enumerate(sentence):
if delimiter_count(char):
delimiter_indices.append(i + 1) # Add 1 to match our 1-based indexing
return delimiter_indices
def is_palindrome(sentence):
"""
Checks if a sentence is a palindrome based on our formal definition.
Args:
sentence: The input sentence (string).
Returns:
True if the sentence is a palindrome, False otherwise.
"""
sigma_sentence = sigma_reduce(sentence)
inverse_sigma_sentence = invert(sigma_sentence)
return sigma_sentence == inverse_sigma_sentence
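# Example (illustrative): is_palindrome("A man, a plan, a canal: Panama") is True,
# since the sigma-reduction "amanaplanacanalpanama" equals its own inverse, while
# is_palindrome("hello world") is False.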
def filter_palindromes(sentences):
"""
Filters a list of sentences to find palindromes.
Args:
sentences: A list of sentences (strings).
Returns:
A list of palindromes (strings).
"""
return [sentence for sentence in sentences if is_palindrome(sentence)]
def lefthand_integral(sentence, k):
"""
Calculates the Left-Hand Sentence Integral of a sentence up to index k.
Args:
sentence: The input sentence (string).
k: The upper limit of the summation (natural number).
Returns:
The Left-Hand Sentence Integral (float).
"""
l = len(sentence)
total = 0
for i in range(1, min(k + 1, l + 1)):
total += delimiter_count(sentence[i - 1]) * (i / l)
return total
def righthand_integral(sentence, k):
"""
Calculates the Right-Hand Sentence Integral of a sentence up to index k.
Args:
sentence: The input sentence (string).
k: The upper limit of the summation (natural number).
Returns:
The Right-Hand Sentence Integral (float).
"""
l = len(sentence)
total = 0
for i in range(1, min(k + 1, l + 1)):
total += delimiter_count(sentence[i - 1]) * ((l - i + 1) / l)
return total
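# Worked example (illustrative): for ζ = "ab cd ef" (l = 8, delimiters at i = 3 and i = 6),
# lefthand_integral(ζ, 8)  = 3/8 + 6/8 = 1.125 and
# righthand_integral(ζ, 8) = 6/8 + 3/8 = 1.125; the two agree because the delimiter
# positions are symmetric about the sentence's midpoint.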
def sentence_integrals(sentences, sentence_length):
"""
Analyzes the Left and Right-Hand Sentence Integrals of sentences in a corpus.
Args:
sentences: The list of sentences.
sentence_length: The desired sentence length.
Returns:
A tuple containing two lists:
- left_integrals: A list of Left-Hand Sentence Integrals.
- right_integrals: A list of Right-Hand Sentence Integrals.
"""
left_integrals = []
right_integrals = []
for sentence in sentences:
if len(sentence) == sentence_length:
left_integrals.append(lefthand_integral(sentence, sentence_length))
right_integrals.append(righthand_integral(sentence, sentence_length))
return left_integrals, right_integrals
def integral_coefficients(sentence):
"""
Calculates the coefficients (2i - l(ζ) - 1) for each delimiter in a sentence.
Args:
sentence: The input sentence (string).
Returns:
A list of coefficients, one for each delimiter in the sentence.
"""
l = len(sentence)
coefficients = []
for i in range(1, l + 1):
if sentence[i - 1] == ' ': # Assuming space as the delimiter
coefficients.append(2 * i - l - 1)
return coefficients
def integral_distribution(corpus, sentence_length):
"""
Processes a corpus of sentences, filters for sentences of a specific length,
and calculates the coefficients for each sentence.
Args:
corpus: A list of sentences (strings).
sentence_length: The desired sentence length.
Returns:
A list of lists, where each inner list contains the coefficients for a single sentence.
"""
all_coefficients = []
for sentence in corpus:
if len(sentence) == sentence_length:
coefficients = integral_coefficients(sentence)
all_coefficients.append(coefficients)
return all_coefficients
Estimators
""" palindrome.estimators: Module for statistical analysis.
"""
import math
import statistics
import numpy as np
import scipy.stats
from scipy.special import comb
def summarize(data):
"""
Calculates descriptive statistics for a given dataset.
Args:
data: A list of numerical data.
Returns:
A dictionary containing the statistics.
"""
if not data:
return {
"number of samples": 0,
"mean": None,
"median": None,
"stdev": None,
"skewness": None,
"min": None,
"max": None,
"mode": None
}
try:
mode = statistics.mode(data)
except statistics.StatisticsError:
mode = None # Handle cases with no unique mode
stats = {
"number of samples": len(data),
"mean": statistics.mean(data),
"median": statistics.median(data),
"min": min(data),
"max": max(data),
"mode": mode,
}
if len(data) > 1:
stats["stdev"] = statistics.stdev(data)
stats["skewness"] = scipy.stats.skew(data, bias=False) # Using Pearson's moment coefficient of skewness
else:
stats["stdev"] = None
stats["skewness"] = None
return stats
def sample_mean_freq(data_dict):
"""
Calculates the sample mean of a dictionary, weighted by the values.
Args:
data_dict: A dictionary where keys are data points and values are their frequencies.
Returns:
The weighted sample mean.
"""
total_sum = sum(key * value for key, value in data_dict.items())
total_count = sum(value for value in data_dict.values())
if total_count == 0:
return None # Handle the case of an empty dictionary
return total_sum / total_count
def length_frequencies(corpus):
"""
Calculates the frequency of each sentence length in a corpus.
Args:
corpus: A list of sentences (strings).
Returns:
A dictionary where keys are sentence lengths and values are their frequencies.
"""
freq_dict = {}
for sentence in corpus:
length = len(sentence)
freq_dict[length] = freq_dict.get(length, 0) + 1
return freq_dict
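# Worked example (illustrative): length_frequencies(["ab c", "de f", "xyz"]) returns
# {4: 2, 3: 1}, and sample_mean_freq({4: 2, 3: 1}) = (4*2 + 3*1) / 3 ≈ 3.67.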
def difference_of_means_test(mean_1, stdev_1, n1, mean_2, stdev_2, n2):
"""
Performs a two-sample t-test (difference of means test) assuming unequal variances.
Args:
mean_1: Mean of the first sample.
stdev_1: Standard deviation of the first sample.
n1: Number of observations in the first sample.
mean_2: Mean of the second sample.
stdev_2: Standard deviation of the second sample.
n2: Number of observations in the second sample.
Returns:
A tuple containing the t-statistic and the p-value.
"""
print("performing tests")
if stdev_1 is None or stdev_2 is None or n1 < 2 or n2 < 2:
return None, None
# Calculate the t-statistic
t_statistic = (mean_1 - mean_2) / math.sqrt((stdev_1**2 / n1) + (stdev_2**2 / n2))
# Calculate the degrees of freedom using the Welch-Satterthwaite equation
df = ((stdev_1**2 / n1) + (stdev_2**2 / n2))**2 / (
(stdev_1**4) / (n1**2 * (n1 - 1)) + (stdev_2**4) / (n2**2 * (n2 - 1))
)
# Calculate the p-value (two-tailed test)
p_value = 2 * (1 - scipy.stats.t.cdf(abs(t_statistic), df))
return t_statistic, p_value
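# Note: this is Welch's unequal-variance t-test; as a sanity check, the (t, p) pair should
# match scipy.stats.ttest_ind_from_stats(mean_1, stdev_1, n1, mean_2, stdev_2, n2,
# equal_var=False).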
def uniform_prior(num_points=1000):
"""
Creates a uniform prior distribution for the delimiter probability p.
Args:
num_points: The number of points to use for discretization.
Returns:
A tuple of two arrays:
- x: The values of p (from 0 to 1).
- prior: The corresponding prior probabilities for each value of p.
"""
x = np.linspace(0, 1, num_points)
prior = np.ones_like(x) / num_points # Uniform distribution
return x, prior
def beta_prior(alpha, beta, num_points=1000):
"""
Creates a Beta distribution prior for the delimiter probability p.
Args:
alpha: The alpha parameter of the Beta distribution.
beta: The beta parameter of the Beta distribution.
num_points: The number of points to use for discretization.
Returns:
A tuple of two arrays:
- x: The values of p (from 0 to 1).
- prior: The corresponding prior probabilities for each value of p.
"""
x = np.linspace(0, 1, num_points)
prior = scipy.stats.beta.pdf(x, alpha, beta)
return x, prior
def binomial_likelihood(n, z, p):
"""
Calculates the binomial likelihood of observing z delimiters in a sentence of length n.
Args:
n: The length of the sentence (integer).
z: The number of delimiters in the sentence (integer).
p: The prior probability of a character being a delimiter.
Returns:
The likelihood of observing z delimiters in a sentence of length n.
"""
return comb(n, z) * (p ** z) * ((1 - p) ** (n - z))
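# Sanity check (illustrative): binomial_likelihood(10, 2, 0.2)
#   = C(10, 2) * 0.2**2 * 0.8**8 = 45 * 0.04 * 0.16777... ≈ 0.302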
Graphs
""" palindromes.graphs: Module for visualizing palindromic structures.
"""
import matplotlib.pyplot as plt
def conditional_character_histogram(freq_dist, length, condition, index):
"""
Plots a histogram of the frequency distribution.
Args:
freq_dist: The frequency distribution dictionary.
length: The length of the words analyzed.
        condition: The starting condition used.
        index: The 0-based position of the character being analyzed.
    """
sorted_freq = dict(sorted(freq_dist.items()))
plt.figure(figsize=(10, 5))
plt.bar(sorted_freq.keys(), sorted_freq.values())
i = len(condition)
plt.title(f"α[{index + 1}] Frequency Distribution | l(α) = {length} and α[:{i}] = '{condition}')")
plt.xlabel("Character")
plt.ylabel("Frequency")
plt.show()
def integral_histograms(left_integrals, right_integrals, sentence_length, num_bins=20):
"""
Generates histograms for the Left and Right-Hand Sentence Integrals.
Args:
left_integrals: A list of Left-Hand Sentence Integrals.
right_integrals: A list of Right-Hand Sentence Integrals.
sentence_length: The length of the sentences analyzed.
num_bins: The number of bins for the histograms.
"""
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(left_integrals, bins=num_bins, range=(0, 10))
plt.title(f"Left-Hand Integrals (Length = {sentence_length})")
plt.xlabel("Integral Value")
plt.ylabel("Frequency")
plt.subplot(1, 2, 2)
plt.hist(right_integrals, bins=num_bins, range=(0, 10))
plt.title(f"Right-Hand Integrals (Length = {sentence_length})")
plt.xlabel("Integral Value")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
def coefficient_histogram(all_coefficients, sentence_length):
"""
Generates a histogram of the delimiter coefficients.
Args:
all_coefficients: A list of lists, where each inner list contains the coefficients for a sentence.
sentence_length: The length of the sentences analyzed.
"""
# Flatten the list of lists into a single list
flat_coefficients = [item for sublist in all_coefficients for item in sublist]
plt.hist(flat_coefficients, bins=range(-sentence_length + 1, sentence_length, 2)) # Bins for odd/even coefficients
plt.title(f"Delimiter Coefficient Distribution (Sentence Length = {sentence_length})")
plt.xlabel("Coefficient (2i - l(ζ) - 1)")
plt.ylabel("Frequency")
plt.show()
def length_histogram(length_freq_dict, mean_length):
"""
Generates a histogram of sentence lengths for a given corpus.
Args:
length_freq_dict: A dictionary where keys are sentence lengths and values are their frequencies.
mean_length: The sample mean of the sentence lengths.
"""
lengths = list(length_freq_dict.keys())
frequencies = list(length_freq_dict.values())
plt.figure(figsize=(10, 5))
plt.bar(lengths, frequencies, width=0.85)
plt.axvline(mean_length, color='red', linestyle='dashed', linewidth=1, label=f"Mean: {mean_length:.2f}")
plt.title("Sentence Length Distribution")
plt.xlabel("Sentence Length (Characters)")
plt.ylabel("Frequency")
plt.legend()
plt.show()
def posterior_delimiter_histogram(p_values, posterior_probs, num_bins=20):
"""
Generates a histogram of the posterior delimiter probabilities.
Args:
p_values: The values of p for which the prior is defined.
posterior_probs: The posterior probabilities for each p_value.
num_bins: The number of bins for the histogram.
"""
plt.figure(figsize=(10, 5))
plt.hist(p_values, weights=posterior_probs, bins=num_bins)
plt.title(f"Posterior Delimiter Probability Distribution")
plt.xlabel("p")
plt.ylabel("Probability Density")
plt.show()
def delimiter_histogram(distribution_data):
"""
Generates histograms of delimiter index distributions for each language and sentence length.
Args:
distribution_data: A dictionary containing the delimiter index frequency distributions.
"""
for language, length_data in distribution_data.items():
for length, index_freq in length_data.items():
indices = list(index_freq.keys())
frequencies = list(index_freq.values())
plt.figure(figsize=(10, 5))
plt.bar(indices, frequencies)
plt.title(f"Delimiter Index Distribution ({language}, Length = {length})")
plt.xlabel("Delimiter Index")
plt.ylabel("Frequency")
plt.show()
def delimiter_barchart(delimiter_indices, sentence):
    """
    Generates a bar chart of delimiter indices for a single sentence.
    Args:
        delimiter_indices: A list of delimiter indices.
        sentence: The sentence being visualized; its length bounds the x-axis.
    """
if not delimiter_indices:
return # Handle empty list
plt.figure(figsize=(10, 5))
plt.bar(delimiter_indices, [1] * len(delimiter_indices), width=0.05) # Adjust width as needed
    plt.xlim(0, len(sentence))  # Bound the x-axis by the sentence length
plt.title("Delimiter Index Distribution")
plt.xlabel("Delimiter Index")
plt.ylabel("Frequency")
plt.suptitle(sentence, fontsize=10)
plt.show()
Parse
""" palindromes.parse: Module for parsing external data sources
"""
import enum
import nltk
from nltk.corpus import brown, cess_esp, indian
from nltk.tokenize import sent_tokenize
import string
class CORPORA(enum.Enum):
ENGLISH = "english"
SPANISH = "spanish"
HINDI = "hindi"
def init():
# Download necessary NLTK data if you haven't already
nltk.download('brown')
nltk.download('cess_esp')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('indian')
def _clean_corpus(language, min_length, max_length):
# Updated to use the CORPORA enum
if language == CORPORA.ENGLISH:
corpus = brown
all_sentences = corpus.sents()
flattened_sentences = [" ".join(sentence) for sentence in all_sentences]
tokenized_sentences = []
for text in flattened_sentences:
tokenized_sentences.extend(sent_tokenize(text))
cleaned_sentences = []
for sentence in tokenized_sentences:
cleaned_sentence = "".join(
c for c in sentence if c not in string.punctuation or c == ' '
)
cleaned_sentence = " ".join(cleaned_sentence.split()).lower()
if min_length <= len(cleaned_sentence) <= max_length:
cleaned_sentences.append(cleaned_sentence)
return cleaned_sentences
elif language == CORPORA.SPANISH:
corpus = cess_esp.sents()
cleaned_sentences = []
for sentence in corpus:
cleaned_sentence = " ".join(word for word in sentence if word not in string.punctuation and word not in "¡¿")
cleaned_sentence = " ".join(cleaned_sentence.split()).lower()
if min_length <= len(cleaned_sentence) <= max_length:
cleaned_sentences.append(cleaned_sentence)
return cleaned_sentences
elif language == CORPORA.HINDI:
corpus = indian
cleaned_sentences = []
words = corpus.words('hindi.pos')
# Build sentences based on full stop delimiter ('।')
sentence = ""
for word in words:
if word == '।':
if min_length <= len(sentence) <= max_length:
cleaned_sentences.append(sentence.strip())
sentence = ""
else:
if len(sentence) > 0:
sentence += " "
sentence += word
# Add the last sentence if it meets the length criteria
if len(sentence) > 0 and min_length <= len(sentence) <= max_length:
cleaned_sentences.append(sentence.strip())
return cleaned_sentences
else:
raise ValueError("Invalid language specified. Choose from 'english', 'spanish', or 'hindi'.")
def corpus(min_length = 100, max_length = 200, language = CORPORA.ENGLISH):
return _clean_corpus(language, min_length, max_length)
def words(length, language=CORPORA.ENGLISH):
    """
    Returns all purely alphabetic words of the given length from the selected corpus.
    """
    if language == CORPORA.ENGLISH:
        return [
            word
            for word
            in brown.words()
            if len(word) == length and word.isalpha()
        ]
    raise ValueError("Word extraction is currently only implemented for the English corpus.")
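# Example usage sketch (assumes the NLTK data has been downloaded; the length bounds
# shown are arbitrary):
#   init()
#   english_sentences = corpus(100, 120, CORPORA.ENGLISH)
#   four_letter_words = words(4)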