# -*- coding: utf-8 -*-
import re
import codecs
import textwrap
from num2words import num2words
# IPA Phonemizer: https://github.com/bootphon/phonemizer

# Building blocks of the symbol inventory. Order matters: a symbol's position
# in `symbols` is the integer id it is mapped to in `dicts`.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols: pad first, then punctuation, ASCII letters, IPA symbols.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. If a symbol occurs more than once in the
# inventory, the position of its last occurrence is the one that is kept.
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Callable that converts a string into a list of symbol indices.

    The lookup table is the module-level `dicts` mapping (symbol -> index).
    """

    def __init__(self, dummy=None):
        # `dummy` is accepted for call compatibility but is never read.
        print(len(dicts))
        self.word_index_dictionary = dicts

    def __call__(self, text):
        """Return the index of every character of `text` found in the table.

        Characters missing from the table are left out of the result; for
        each such character the complete input `text` is printed.
        """
        table = self.word_index_dictionary
        indexes = []
        for char in text:
            try:
                index = table[char]
            except KeyError:
                print(text)
            else:
                indexes.append(index)
        return indexes


# == Sentence Splitter

import re

def _split_long_sentence(sentence, max_len):
    """Cut a sentence longer than max_len into period-terminated pieces.

    Returns a tuple `(pieces, remainder)`: `pieces` are the finished chunks
    and `remainder` is the trailing part, at most `max_len` characters long.
    """
    pieces = []
    rest = sentence
    while len(rest) > max_len:
        window = rest[:max_len]

        # Prefer the last space so that no token is cut. A period inside a
        # sentence candidate is always followed by a non-space character
        # (e.g. "3.14", "example.com"), so it is only a fallback.
        cut = window.rfind(' ')
        if cut == -1:
            cut = window.rfind('.')

        if cut == -1:
            # No break point at all (one very long token): hard split at
            # max_len. The appended period makes this piece max_len + 1 long.
            pieces.append(window.strip() + '.')
            rest = rest[max_len:].lstrip()
            continue

        # Keep a period that is the break point; drop a space.
        keep_to = cut + 1 if rest[cut] == '.' else cut
        piece = rest[:keep_to].strip()
        if not piece.endswith('.'):
            piece += '.'  # Ensure period termination
        pieces.append(piece)
        rest = rest[cut + 1:].lstrip()
    return pieces, rest


def split_into_sentences(text, max_len=200):
    """
    Splits a string into chunks of about max_len characters.

    The text is first split into sentences after '.', '!' or '?' followed by
    whitespace. Consecutive sentences are packed (joined with one space) into
    a chunk as long as the chunk stays within max_len. A single sentence that
    is longer than max_len is cut at the last space inside the limit (falling
    back to the last period, then to a hard cut) and every piece produced by
    such a cut is terminated with a period.

    Args:
        text (str): The input string.
        max_len (int): The maximum desired length for each chunk. Only a
            hard cut of a token without any space or period can exceed it,
            by the one appended period.

    Returns:
        list: A list of strings, where each string is a sentence chunk.
        Empty or whitespace-only input gives an empty list.

    Raises:
        ValueError: If max_len is smaller than 1 while there is text to
            split (such a limit can never be satisfied).
    """
    if not text:
        return []

    # The lookbehind keeps the punctuation attached to its sentence. A
    # trailing fragment without final punctuation is returned by re.split as
    # the last element, so it needs no special handling.
    sentence_candidates = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sentence_candidates:
        return []

    if max_len < 1:
        raise ValueError("max_len must be at least 1, got %r" % (max_len,))

    chunks = []
    pending = []  # Sentences that form the chunk currently being built
    pending_length = 0

    for sentence in sentence_candidates:
        # One extra character for the joining space when the chunk is not empty.
        added_length = len(sentence) + (1 if pending else 0)

        if pending_length + added_length <= max_len:
            pending.append(sentence)
            pending_length += added_length
            continue

        # The sentence does not fit: close the chunk built so far.
        if pending:
            chunks.append(" ".join(pending).strip())

        # The sentence itself may exceed max_len; whatever is left after
        # cutting it down starts the next chunk.
        pieces, remainder = _split_long_sentence(sentence, max_len)
        chunks.extend(pieces)
        pending = [remainder]
        pending_length = len(remainder)

    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks


def store_ssml(text=None,
               voice=None):
    '''Build an SSML document and write it to `_tmp_ssml.txt`.

    Every sentence is wrapped in its own volume/rate `<prosody>`, `<voice>`
    and `<s>` elements. The rate grows with the sentence length
    (`len / 76`, clamped to [0.87, 1.14]); the volume is a random integer
    in [24, 97]. The document is also printed for debugging.

           text : list of sentences (None or an empty list gives an
                  empty `<speak></speak>` document)
           voice: https://github.com/MycroftAI/mimic3-voices

    The file is written in the current working directory, UTF-8 with BOM.
    '''
    # Imported locally: the module's top-of-file imports do not provide `np`.
    import numpy as np

    sentences = [] if text is None else list(text)
    first_sentence = sentences[0] if sentences else ''
    print('\n___________________________\n', len(sentences),
          first_sentence, '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:
        rate = min(max(.87, len(short_text) / 76), 1.14)
        volume = int(74 * np.random.rand() + 24)
        # NOTE(review): short_text is inserted verbatim; '&' or '<' in it
        # would yield malformed markup - confirm whether callers pre-escape.
        parts.append(
            f"<prosody volume='{volume}'>"
            f"<prosody rate='{rate}'>"
            f"<voice name='{voice}'>"
            f"<s>{short_text}</s>"
            "</voice>"
            "</prosody>"
            "</prosody>"
        )
    parts.append('</speak>')
    _s = ''.join(parts)
    print(len(sentences), '\n\n\n\n\n\n\n', _s)

    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)


def transliterate_number(number_string, lang='en'):
    """
    Converts the numerals inside a string to words in the specified language,
    handling decimals, scientific notation, and preserving text
    before and after the numeral.

    Args:
        number_string (str): Text that may contain numerals such as
            "12", "3.14" or "1.5e-3".
        lang (str): Language code. 'rmc-script_latin', 'ron', 'hun' and 'deu'
            have dedicated wording; any other code is cut to its first two
            characters and uses the English connecting words.

    Returns:
        str: The text with every numeral spelled out. A numeral whose
        conversion raises ValueError or OverflowError is left as it was.
    """
    # lang -> (num2words language code, exponent phrase, decimal-separator word)
    # NOTE(review): the 'hun' phrases are kept as found; ' virgula ' does not
    # look Hungarian - to be confirmed by a native speaker.
    dedicated_wording = {
        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
        # Previously carried the Hungarian exponent phrase by copy-paste.
        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
        'deu': ('de', ' mal zehn hoch ', ' komma '),
    }
    if lang in dedicated_wording:
        lang, exponential_pronoun, comma = dedicated_wording[lang]
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        """Spell out one matched numeral, keeping its surrounding text."""
        prefix = match.group(1) or ""
        number_part = match.group(2)
        suffix = match.group(5) or ""

        try:
            if 'e' in number_part.lower():
                # Scientific notation: "<base> <exponent phrase> <exponent>".
                base, exponent = number_part.lower().split('e')
                words = (num2words(float(base), lang=lang)
                         + exponential_pronoun
                         + num2words(int(exponent), lang=lang))
            elif '.' in number_part:
                # Decimal: integer part, separator word, then digit by digit.
                integer_part, decimal_part = number_part.split('.')
                words = num2words(int(integer_part), lang=lang) + comma + " ".join(
                    num2words(int(digit), lang=lang) for digit in decimal_part)
            else:
                words = num2words(int(number_part), lang=lang)
        except (ValueError, OverflowError):
            # Return original if conversion fails (malformed or too large).
            return match.group(0)
        return prefix + words + suffix

    # group 1: text before, group 2: the numeral, group 5: text after.
    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
    return re.sub(pattern, replace_number, number_string)


def discard_leading_numeral(text):
    """Strip a numeral (integer or float) from the start of a string.

    Whitespace before and after the numeral is removed together with it.

    Args:
        text: The input string.

    Returns:
        The text that follows the leading numeral, or the unchanged input
        when the string does not begin with a numeral.
    """
    leading = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if leading is None:
        return text
    remainder = text[leading.end():]
    return remainder.lstrip()