# -*- coding: utf-8 -*-
"""Text helpers: symbol indexing, sentence chunking and numeral spelling."""
import codecs
import re
import textwrap

try:
    from num2words import num2words
except ImportError:
    # Only transliterate_number() needs num2words; keep the symbol table,
    # TextCleaner and the sentence splitter importable without it.
    num2words = None

# IPA Phonemizer: https://github.com/bootphon/phonemizer

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> position in `symbols`. If a symbol occurs more than once, the
# index of its last occurrence wins.
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Callable that maps a string to the list of its symbol indices."""

    def __init__(self, dummy=None):
        # `dummy` is accepted but never read.
        self.word_index_dictionary = dicts
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every character of `text` found in the table.

        Characters missing from the table are skipped; the full input text is
        printed once for each such character.
        """
        indexes = []
        for char in text:
            try:
                indexes.append(self.word_index_dictionary[char])
            except KeyError:
                print(text)
        return indexes


# == Sentence Splitter

# A sentence boundary is whitespace preceded by '.', '!' or '?'. The
# lookbehind keeps the punctuation attached to the sentence it ends.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _split_long_sentence(sentence, max_len):
    """Chop `sentence` until the leftover fits; return (pieces, leftover)."""
    pieces = []
    rest = sentence
    while len(rest) > max_len:
        window = rest[:max_len]
        # Prefer the last space so words and tokens such as "3.14" stay whole.
        # Every '.' still inside a sentence candidate is followed by a
        # non-space character, so a period is only the second choice.
        cut = window.rfind(' ')
        if cut != -1:
            piece = rest[:cut].strip()
            rest = rest[cut + 1:].lstrip()
        else:
            cut = window.rfind('.')
            if cut != -1:
                # Keep the period with the piece it terminates.
                piece = rest[:cut + 1].strip()
                rest = rest[cut + 1:].lstrip()
            else:
                # No break point at all: hard cut at max_len.
                piece = window.strip()
                rest = rest[max_len:].lstrip()
        if not piece.endswith('.'):
            piece += '.'  # Ensure period termination
        pieces.append(piece)
    return pieces, rest


def split_into_sentences(text, max_len=200):
    """
    Split `text` into chunks of roughly at most `max_len` characters.

    The text is first split into sentences at whitespace that follows '.',
    '!' or '?'. Consecutive sentences are packed into one chunk (joined by a
    single space) as long as the chunk stays within `max_len`. A sentence
    longer than `max_len` is cut at the last space inside the first `max_len`
    characters, else at the last period, else exactly at `max_len`. Every
    piece produced by such a cut is terminated with a period; for a hard cut
    this makes the piece `max_len + 1` characters long.

    Args:
        text (str): The input string.
        max_len (int): The maximum desired length for each chunk.

    Returns:
        list: A list of strings, where each string is a sentence chunk.

    Raises:
        ValueError: If `text` is non-empty and `max_len` is smaller than 1
            (such a limit could never be satisfied).
    """
    if not text:
        return []
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    # re.split keeps every fragment, so a trailing part without closing
    # punctuation is already the last candidate.
    stripped_parts = (part.strip() for part in _SENTENCE_BOUNDARY.split(text))
    sentences = [part for part in stripped_parts if part]

    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # length of " ".join(pending)

    for sentence in sentences:
        # One extra character for the joining space, if there is one.
        added_len = len(sentence) + (1 if pending else 0)
        if pending_len + added_len <= max_len:
            pending.append(sentence)
            pending_len += added_len
            continue

        # The sentence does not fit: close the current chunk first.
        if pending:
            chunks.append(" ".join(pending))

        # The sentence itself may exceed max_len; whatever is left after
        # chopping starts the next chunk.
        pieces, leftover = _split_long_sentence(sentence, max_len)
        chunks.extend(pieces)
        pending = [leftover]
        pending_len = len(leftover)

    if pending:
        chunks.append(" ".join(pending))

    return chunks


def store_ssml(text=None, voice=None):
    '''Concatenate the sentences of `text` and write them to `_tmp_ssml.txt`.

    text : list of sentences (None is treated as an empty list)
    voice: https://github.com/MycroftAI/mimic3-voices (not read by this code)

    The file is written in the current working directory, encoded as
    "utf-8-sig". The sentence count, first sentence and final string are
    printed for debugging.
    '''
    # Function-scope import: the module does not import numpy at top level.
    import numpy as np

    sentences = [] if text is None else text
    first = sentences[0] if sentences else ''
    print('\n___________________________\n', len(sentences), first,
          '\n___________________________________\n')

    parts = []
    for short_text in sentences:
        # NOTE(review): rate and volume are computed but never written out,
        # because every markup fragment around the sentence is an empty string
        # in this file. Presumably SSML prosody tags were intended -- confirm.
        rate = min(max(.87, len(short_text) / 76), 1.14)
        volume = int(74 * np.random.rand() + 24)
        parts.append(short_text)
    _s = ''.join(parts)

    print(len(sentences), '\n\n\n\n\n\n\n', _s)
    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)


# Input language code -> (num2words language, exponent phrase, decimal word).
# NOTE(review): the 'ron' entry reuses the 'hun' exponent phrase; this looks
# like a copy-paste slip -- confirm the intended Romanian wording.
_NUMBER_PHRASES = {
    'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
    'ron': ('ro', ' tízszer a erejéig ', ' virgulă '),
    'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
    'deu': ('de', ' mal zehn hoch ', ' komma '),
}

# Integer, optional decimal part, optional exponent.
_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?')


def transliterate_number(number_string, lang='en'):
    """
    Replace every numeral in `number_string` by its spelled-out form.

    Handles integers, decimals (digits after the point are read one by one)
    and scientific notation. Text around the numerals is left untouched. A
    numeral that cannot be converted is kept as written.

    Args:
        number_string (str): Text that may contain numerals.
        lang (str): Language code. Codes listed in `_NUMBER_PHRASES` use
            their own connector phrases; any other code is cut to its first
            two characters and uses the English connectors.

    Returns:
        str: The text with numerals spelled out.

    Raises:
        ImportError: If a numeral has to be converted but the `num2words`
            package is not installed.
    """
    if lang in _NUMBER_PHRASES:
        lang, exponential_pronoun, comma = _NUMBER_PHRASES[lang]
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        number_part = match.group(0)
        if num2words is None:
            raise ImportError(
                "transliterate_number() requires the 'num2words' package")
        try:
            lowered = number_part.lower()
            if 'e' in lowered:
                base, exponent = lowered.split('e')
                return (num2words(float(base), lang=lang)
                        + exponential_pronoun
                        + num2words(int(exponent), lang=lang))
            if '.' in number_part:
                integer_part, decimal_part = number_part.split('.')
                spoken_digits = " ".join(
                    num2words(int(digit), lang=lang) for digit in decimal_part)
                return num2words(int(integer_part), lang=lang) + comma + spoken_digits
            return num2words(int(number_part), lang=lang)
        except (ValueError, OverflowError):
            # Conversion failed (e.g. value too large): keep the original.
            return number_part

    return _NUMBER_PATTERN.sub(replace_number, number_string)


def discard_leading_numeral(text):
    """Discard a leading numeral (integer or float) from a string.

    Whitespace before and after the numeral is removed with it.

    Args:
        text: The input string.

    Returns:
        The string with the leading numeral removed, or the original
        string if it doesn't start with a numeral.
    """
    match = re.match(r"\s*\d+(?:\.\d*)?\s*", text)
    if match is None:
        return text
    return text[match.end():]