Search code examples
Tags: python · azure · azure-functions · language-translation · gemini

How can I ensure that Azure Text-to-Speech properly pronounces word-for-word translations?


I'm working on an app using Azure, Gemini, Python, and Dart, and I want to make sure the pronunciation between languages is spot on. For example, I want to translate between German and Spanish: the goal is for 'hallo' -> 'hola' to be pronounced correctly in both languages. The same goes for English and Spanish 'hello' -> 'hola'. Azure does well with sentences, but struggles with word-for-word translations.

Here's my code:

  • translation_service.py

class TranslationService:
    """Translate text via a Gemini chat session and synthesize audio with Azure TTS."""

    def __init__(self):
        """Load credentials, configure Gemini, and prime the translation chat.

        Raises:
            ValueError: if GEMINI_API_KEY is missing from the environment.
        """
        load_dotenv()
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables")

        genai.configure(api_key=api_key)

        self.generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        self.model = GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=self.generation_config
        )

        self.tts_service = EnhancedTTSService()

        # Prime the chat with a few-shot example so Gemini answers in the exact
        # layout the downstream extraction regexes expect.
        self.chat_session = self.model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        """
                       
                        Text
"
(Could be any phrase or word)
"

German Translation:
Conversational-native:
"Ich suche einen Job, damit ich finanziell unabhängig sein kann."
word by word Conversational-native German-Spanish:
"Ich (Yo) suche (busco) einen (un) Job (trabajo), damit (para que) ich (yo) finanziell (económicamente) unabhängig (independiente) sein (ser) kann (pueda)."


English Translation:

Conversational-native:
"I'm looking for a job so I can be financially independent."
word by word Conversational-native English-Spanish:
"I'm (Yo estoy) looking for (buscando) a job (un trabajo) so (para que) I (yo) can be (pueda ser) financially (económicamente) independent (independiente)."



                        """
                    ]
                }
            ]
        )

    def _restore_accents(self, text: str) -> str:
        """Restore accented characters from ASCII workarounds.

        "a´" -> "á" (any vowel, either case) and "n~" -> "ñ" / "N~" -> "Ñ".
        """
        accent_map = {
            "a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú",
            "A": "Á", "E": "É", "I": "Í", "O": "Ó", "U": "Ú",
        }

        # Vowel followed by a stray acute mark -> precomposed accented vowel.
        text = re.sub(r"([aeiouAEIOU])´", lambda m: accent_map[m.group(1)], text)
        # Tilde workaround for eñe.
        text = text.replace("n~", "ñ").replace("N~", "Ñ")
        return text

    async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
        """Translate *text* with Gemini, synthesize audio, and build a Translation.

        Raises:
            Exception: wraps any failure from the Gemini or TTS pipeline.
        """
        try:
            # NOTE(review): send_message is synchronous and blocks the event
            # loop for the duration of the Gemini call -- consider
            # run_in_executor if this runs inside a busy loop.
            response = self.chat_session.send_message(text)
            generated_text = response.text

            print(f"Generated text from Gemini: {generated_text[:100]}...")

            audio_filename = await self.tts_service.text_to_speech(
                text=generated_text
            )

            if audio_filename:
                print(f"Successfully generated audio: {audio_filename}")
            else:
                print("Audio generation failed")

            return Translation(
                original_text=text,
                translated_text=generated_text,
                source_language=source_lang,
                target_language=target_lang,
                audio_path=audio_filename,
                translations={"main": generated_text},
                word_by_word=self._generate_word_by_word(text, generated_text),
                # _generate_grammar_explanations is assumed to be defined
                # elsewhere in this class -- TODO confirm.
                grammar_explanations=self._generate_grammar_explanations(generated_text)
            )

        except Exception as e:
            print(f"Error in process_prompt: {str(e)}")
            # Chain the original exception so the root cause stays visible.
            raise Exception(f"Translation processing failed: {str(e)}") from e

    def _generate_word_by_word(self, original: str, translated: str) -> dict[str, dict[str, str]]:
        """Map each source word to the target word at the same position.

        Purely positional: surplus words on either side are dropped and no POS
        tagging is attempted ("pos" is always "unknown").
        """
        return {
            src: {"translation": tgt, "pos": "unknown"}
            for src, tgt in zip(original.split(), translated.split())
        }

    def _auto_fix_spelling(self, text: str) -> str:
        """Best-effort spelling correction that preserves each token's casing.

        NOTE(review): the original code unconditionally used ``self.spell``,
        which is never created in ``__init__`` and therefore always raised
        ``AttributeError``. Without a configured checker the text is now
        returned unchanged.
        """
        spell = getattr(self, "spell", None)
        if spell is None:
            return text

        words = re.findall(r"\b\w+\b|[^\w\s]", text)
        corrected_words = []

        for word in words:
            # Pass punctuation tokens through untouched.
            if not re.match(r"\w+", word):
                corrected_words.append(word)
                continue

            if spell.unknown([word]):
                correction = spell.correction(word)
                if correction:
                    # Mirror the casing of the original token.
                    if word.isupper():
                        correction = correction.upper()
                    elif word[0].isupper():
                        correction = correction.capitalize()
                    word = correction

            corrected_words.append(word)

        # NOTE: rejoining with single spaces detaches punctuation
        # ("Hello," -> "Hello ,"); acceptable for TTS input.
        return " ".join(corrected_words)
  • tts_service.py

import asyncio
import os
import re
from datetime import datetime
from typing import Optional

from azure.cognitiveservices.speech import (
    ResultReason,
    SpeechConfig,
    SpeechSynthesisOutputFormat,
    SpeechSynthesizer,
)
from azure.cognitiveservices.speech.audio import AudioOutputConfig

class EnhancedTTSService:
    """Azure Speech wrapper that renders mixed-language text to an MP3 file."""

    # Detection code -> full BCP-47 locale. Azure SSML expects a locale like
    # "en-US" in xml:lang, not a bare two-letter code.
    LOCALE_MAP = {'en': 'en-US', 'es': 'es-ES', 'de': 'de-DE'}

    def __init__(self):
        """Read Azure credentials from the environment and build the speech config.

        Raises:
            ValueError: if AZURE_SPEECH_KEY or AZURE_SPEECH_REGION is missing.
        """
        self.subscription_key = os.getenv("AZURE_SPEECH_KEY")
        self.region = os.getenv("AZURE_SPEECH_REGION")

        if not self.subscription_key or not self.region:
            raise ValueError("Azure Speech credentials not found in environment variables")

        # Create speech config
        self.speech_config = SpeechConfig(
            subscription=self.subscription_key, 
            region=self.region
        )
        self.speech_config.set_speech_synthesis_output_format(
            SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )

        # Multilingual voices so one voice can switch languages mid-utterance.
        self.voice_mapping = {
             'en': 'en-US-JennyMultilingualNeural',
             'es': 'es-ES-ArabellaMultilingualNeural',
             'de': 'de-DE-SeraphinaMultilingualNeural'
        }

    def _get_temp_directory(self) -> str:
        """Create and return the temporary directory path"""
        if os.name == 'nt':  # Windows
            temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
        else:  # Unix/Linux
            temp_dir = '/tmp/tts_audio'
        os.makedirs(temp_dir, exist_ok=True)
        return temp_dir

    def _detect_language(self, text: str) -> str:
        """Heuristically detect 'de', 'es' or 'en' from characteristic characters.

        NOTE(review): Spanish text with no accented characters falls through to
        'en'; tolerable with multilingual voices, but worth confirming.
        """
        if re.search(r'[äöüßÄÖÜ]', text):
            return 'de'
        elif re.search(r'[áéíóúñ¿¡]', text):
            return 'es'
        return 'en'

    def _generate_ssml(self, text: str) -> str:
        """Generate valid SSML with proper escaping and language tags."""
        # Escape XML specials; '&' first so entities are not double-escaped.
        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

        # Pick voice and locale from the detected primary language.
        primary_lang = self._detect_language(text)
        voice_name = self.voice_mapping.get(primary_lang, self.voice_mapping['en'])
        # Fix: emit the full BCP-47 locale in xml:lang instead of the bare
        # two-letter detection code.
        locale = self.LOCALE_MAP.get(primary_lang, 'en-US')

        ssml = f"""<?xml version='1.0'?>
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{locale}'>
    <voice name='{voice_name}'>
        <prosody rate="0.95" pitch="0%">
            {text}
        </prosody>
    </voice>
</speak>"""
        return ssml

    async def text_to_speech(self, text: str, output_path: Optional[str] = None) -> Optional[str]:
        """Convert text to speech; return the generated file's basename, or None on failure.

        Args:
            text: plain text to synthesize (wrapped in SSML internally).
            output_path: target MP3 path; auto-generated in a temp dir if omitted.
        """
        synthesizer = None
        try:
            print(f"\nStarting TTS process for text: {text[:100]}...")  # First 100 chars

            # Generate output path if not provided
            if not output_path:
                temp_dir = self._get_temp_directory()
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                output_path = os.path.join(temp_dir, f"speech_{timestamp}.mp3")

            # Configure audio output
            audio_config = AudioOutputConfig(filename=output_path)

            # Create synthesizer for this request
            synthesizer = SpeechSynthesizer(
                speech_config=self.speech_config,
                audio_config=audio_config
            )

            # Generate and validate SSML
            ssml = self._generate_ssml(text)
            print(f"Generated SSML length: {len(ssml)} characters")

            # The SDK call is blocking; run it in a worker thread so the event
            # loop stays responsive.
            print("Starting speech synthesis...")
            result = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: synthesizer.speak_ssml_async(ssml).get()
            )

            # Handle result
            if result.reason == ResultReason.SynthesizingAudioCompleted:
                print("Speech synthesis completed successfully")
                return os.path.basename(output_path)

            elif result.reason == ResultReason.Canceled:
                print(f"Speech synthesis canceled: {result.cancellation_details.reason}")
                print(f"Error details: {result.cancellation_details.error_details}")
                return None

            return None

        except Exception as e:
            print(f"Exception in text_to_speech: {str(e)}")
            return None

        finally:
            # Best-effort cleanup; never let teardown mask the real outcome.
            if synthesizer:
                try:
                    synthesizer.stop_speaking_async()
                except Exception:
                    pass

This is an example of how the correct pronunciation should sound:

German-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/8sftiJ01aUreR3LDYRWn

English-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/9MM1LqTqGH1CvddGhA1l

Now let’s do a word-for-word translation, where we’ll focus on pronouncing the Spanish "ñ," "h," and "ll" properly.

Here’s the Spanish sentence:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/aRFlpZc99Dw18Uexi8uS

English-Spanish (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/eY9ZhlTi


Currently I have this pronunciation with the same examples

German-Spanish and English-Spanish (hello example) (which is incorrect because the word-for-word pronunciation is not accurate)

https://jmp.sh/iExSVBGk

Let’s go back to the word-for-word breakdown, again emphasizing Spanish pronunciation for the tricky letters:

"ñ" (sounds like “ny” in canyon, e.g., piña, niña) "h" (silent in Spanish, e.g., hospital) "ll" (varies regionally but often sounds like “y” in yes, e.g., lloviendo). So here’s the sentence again:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish and English-Spanish (which is incorrect because the word-for-word pronunciation is not accurate)

https://jmp.sh/PxKHNWjx


This is the service I use with Azure (the `tts_service.py` shown above).

I’ve tried the 'langid' library, but it seems like it doesn’t work for me. My goal is to be able to hear the correct pronunciation of the English-Spanish and German-Spanish word pairs during word-for-word translation.

Thank you.


Solution

  • The key was explicit language tagging in SSML combined with strategic pauses. Azure TTS needs clear language context for each word/phrase, especially when mixing languages. Here's the implementation:

    
        def generate_german_spanish_wordforword_ssml(
            self,
            word_pairs: list[tuple[str, str]],
        ) -> str:
            """Generate an SSML fragment alternating German/Spanish word pairs.

            Each source word is tagged de-DE and each target word es-ES inside
            one multilingual voice, with pauses so the pairs stay distinct.
            Returns a fragment (no surrounding <speak> element).
            """

            def escape(s: str) -> str:
                # Fix: escape all XML specials, not just '&' ('&' first so
                # entities are not double-escaped).
                return (s.strip()
                        .replace("&", "&amp;")
                        .replace("<", "&lt;")
                        .replace(">", "&gt;"))

            ssml = """
            <voice name="en-US-JennyMultilingualNeural">
                <prosody rate="0.8">"""
            
            for source_word, target_word in word_pairs:
                source_word = escape(source_word)
                target_word = escape(target_word)
                
                ssml += f"""
                    <lang xml:lang="de-DE">{source_word}</lang>
                    <break time="300ms"/>
                    <lang xml:lang="es-ES">{target_word}</lang>
                    <break time="500ms"/>"""
            
            ssml += """
                    <break time="1000ms"/>
                </prosody>
            </voice>"""
            
            return ssml
    
    
        def generate_english_spanish_wordforword_ssml(
            self,
            word_pairs: list[tuple[str, str]],
        ) -> str:
            """Generate an SSML fragment alternating English/Spanish word pairs.

            Each source word is tagged en-US and each target word es-ES inside
            one multilingual voice, with pauses so the pairs stay distinct.
            Returns a fragment (no surrounding <speak> element).
            """

            def escape(s: str) -> str:
                # Fix: escape all XML specials, not just '&' ('&' first so
                # entities are not double-escaped).
                return (s.strip()
                        .replace("&", "&amp;")
                        .replace("<", "&lt;")
                        .replace(">", "&gt;"))

            ssml = """
            <voice name="en-US-JennyMultilingualNeural">
                <prosody rate="0.8">"""
            
            for source_word, target_word in word_pairs:
                source_word = escape(source_word)
                target_word = escape(target_word)
                
                ssml += f"""
                    <lang xml:lang="en-US">{source_word}</lang>
                    <break time="300ms"/>
                    <lang xml:lang="es-ES">{target_word}</lang>
                    <break time="500ms"/>"""
            
            ssml += """
                    <break time="1000ms"/>
                </prosody>
            </voice>"""
            
            return ssml
    
    

    Then, I modified the translation service to parse Gemini's output into clean word pairs:

        def _extract_word_pairs(self, text: str) -> list[tuple[str, str]]:
            word_pairs = []
            word_by_word_pattern = r'\* word by word.*?\n"([^"]+)"'
            word_by_word_match = re.search(word_by_word_pattern, text, re.DOTALL)
            
            if word_by_word_match:
                word_by_word_text = word_by_word_match.group(1)
                # Improved regex to capture multi-word phrases including those with apostrophes
                parts = re.findall(r'([^()]+?)\s*\(([^)]+)\)', word_by_word_text)
                for source, target in parts:
                    # Clean and normalize both phrases
                    source = re.sub(r'\s+', ' ', source.strip().replace("'", ""))
                    target = target.strip()
                    if source and target:
                        word_pairs.append((source, target))
            return word_pairs
    
    

    And then I've updated and added new code on tts_service.py

    
        def _is_german_word(self, word: str) -> bool:
            # List of common German words that might appear in the English section
            german_words = {"dir", "ich", "du", "sie", "er", "es", "wir", "ihr", "ist", "sind", "haben", 
                        "sein", "werden", "kann", "könnte", "möchte", "muss", "darf", "soll"}
            return word.lower() in german_words
    
        def _is_english_word(self, word: str) -> bool:
            # List of common English words to verify
            english_words = {"the", "a", "an", "in", "on", "at", "to", "for", "with", "by"}
            return word.lower() in english_words
    
    
        def generate_enhanced_ssml(
            self,
            text: Optional[str] = None,
            word_pairs: Optional[list[tuple[str, str, bool]]] = None,
            source_lang: str = "de",
            target_lang: str = "es",
        ) -> str:
            """Build a complete <speak> document from sentences and word pairs.

            Args:
                text: up to 8 newline-separated sentences -- four German
                    variants (native/colloquial/informal/formal) followed by
                    the four English ones. Missing lines are treated as empty.
                word_pairs: (source, target, is_german) triples used for the
                    word-by-word replay of each sentence.
                source_lang, target_lang: accepted for interface compatibility;
                    currently unused (language is fixed per section).

            Fix over the original: sentence sections are emitted even when no
            word pairs are available -- previously everything was nested under
            ``if word_pairs:``, so the text-only fallback produced an empty
            <speak/> document.
            """
            ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">"""

            if text:
                # Split into exactly 8 slots (pad with empty strings) and
                # XML-escape each sentence.
                sentences = (text.split("\n") + [""] * 8)[:8]
                sentences = [t.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
                             for t in sentences]

                (german_native, german_colloquial, german_informal, german_formal,
                 english_native, english_colloquial, english_informal, english_formal) = sentences

                # Partition pairs by language flag; empty lists when no pairs.
                pairs = word_pairs or []
                german_pairs = [(src, tgt) for src, tgt, is_german in pairs if is_german]
                english_pairs = [(src, tgt) for src, tgt, is_german in pairs if not is_german]

                # (sentence, voice, pairs, lang) for every section, in the
                # original emission order.
                sections = [
                    (german_native, "de-DE-SeraphinaMultilingualNeural", german_pairs, "de-DE"),
                    (german_colloquial, "de-DE-SeraphinaMultilingualNeural", german_pairs, "de-DE"),
                    (german_informal, "de-DE-KatjaNeural", german_pairs, "de-DE"),
                    (german_formal, "de-DE-SeraphinaMultilingualNeural", german_pairs, "de-DE"),
                    (english_native, "en-US-JennyMultilingualNeural", english_pairs, "en-US"),
                    (english_colloquial, "en-US-JennyMultilingualNeural", english_pairs, "en-US"),
                    (english_informal, "en-US-JennyNeural", english_pairs, "en-US"),
                    (english_formal, "en-US-JennyMultilingualNeural", english_pairs, "en-US"),
                ]
                for sentence, voice, pairs_for_lang, lang in sections:
                    if sentence:
                        ssml += self._generate_language_section(
                            sentence, pairs_for_lang, voice=voice, lang=lang
                        )

            # Collapse runs of consecutive 500ms breaks into a single one.
            ssml = re.sub(r'(<break time="500ms"\s*/>\s*)+', '<break time="500ms"/>', ssml)
            ssml += "</speak>"
            return ssml
    
        def _generate_language_section(
            self,
            sentence: str,
            word_pairs: list[tuple[str, str]],
            voice: str,
            lang: str
        ) -> str:
            """Generate complete language section with phrase handling.

            Speaks the full sentence once with `voice`, then (when word pairs
            are available) replays it word-for-word: each source word/phrase is
            tagged with `lang` and followed by its Spanish translation tagged
            es-ES, inside one multilingual voice.

            Args:
                sentence: the (already XML-escaped) sentence to speak.
                word_pairs: (source, target) pairs; sources may be multi-word.
                voice: Azure voice name for the full-sentence reading.
                lang: BCP-47 tag for the source words (e.g. "de-DE").

            Returns:
                An SSML fragment (no surrounding <speak> element).
            """
            section = f"""
            <voice name="{voice}">
                <prosody rate="1.0">
                    <lang xml:lang="{lang}">{sentence}</lang>
                    <break time="1000ms"/>
                </prosody>
            </voice>"""
    
            if word_pairs:
                section += """
            <voice name="en-US-JennyMultilingualNeural">
                <prosody rate="0.8">"""
                
                # Create phrase map and sort by phrase length
                # (longest first so multi-word phrases win over their prefixes;
                # last pair wins when two sources collide case-insensitively).
                phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
                phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
                words = sentence.split()
                index = 0
                
                while index < len(words):
                    matched = False
                    
                    # Try to match multi-word phrases first
                    for phrase_key in phrases:
                        phrase_words = phrase_key.split()
                        if index + len(phrase_words) > len(words):
                            continue
                        
                        # NOTE(review): `words` keeps punctuation, so "Job,"
                        # will not equal a pair key "job" -- confirm pairs
                        # carry punctuation or strip it here.
                        candidate = ' '.join(words[index:index+len(phrase_words)]).lower()
                        if candidate == phrase_key:
                            original_phrase, translation = phrase_map[phrase_key]
                            section += f"""
                <lang xml:lang="{lang}">{original_phrase}</lang>
                <break time="300ms"/>
                <lang xml:lang="es-ES">{translation}</lang>
                <break time="500ms"/>"""
                            index += len(phrase_words)
                            matched = True
                            break
                            
                    # Single word fallback
                    if not matched:
                        # Trailing/leading .,!? is stripped, so the spoken word
                        # loses its punctuation here (unlike the phrase branch).
                        word = words[index].strip(".,!?")
                        translation = next((tgt for src, tgt in word_pairs if src.lower() == word.lower()), None)
                        section += f"""
                <lang xml:lang="{lang}">{word}</lang>
                <break time="300ms"/>"""
                        if translation:
                            section += f"""
                <lang xml:lang="es-ES">{translation}</lang>
                <break time="500ms"/>"""
                        else:
                            # Keep the rhythm even when no translation exists.
                            section += """<break time="500ms"/>"""
                        index += 1
    
                section += """
                <break time="1000ms"/>
                </prosody>
            </voice>"""
            
            return section
    
        def _generate_sentence_section(
            self,
            sentence: str,
            word_pairs: list[tuple[str, str]],
            voice: str,
            lang: str,
        ) -> str:
            """Build the SSML for one sentence plus its word-by-word replay.

            Near-duplicate of `_generate_language_section`; the differences are
            that an empty `sentence` returns "" and the single-word fallback
            speaks the original token with punctuation intact while matching
            against the punctuation-stripped form.

            Returns:
                An SSML fragment (no surrounding <speak> element), or "" when
                `sentence` is empty.
            """
            if not sentence:
                return ""
            
            # Generate the main sentence SSML
            ssml = f"""
                <voice name="{voice}">
                    <prosody rate="1.0">
                        <lang xml:lang="{lang}">{sentence}</lang>
                        <break time="1000ms"/>
                    </prosody>
                </voice>"""
            
            if word_pairs:
                ssml += """
                    <voice name="en-US-JennyMultilingualNeural">
                        <prosody rate="0.8">"""
                
                # Create phrase map and sort by phrase length (longest first)
                # so multi-word phrases win over their single-word prefixes.
                phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
                phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
                words = sentence.split()
                index = 0
                
                while index < len(words):
                    matched = False
                    
                    # Try to match multi-word phrases first
                    for phrase_key in phrases:
                        phrase_words = phrase_key.split()
                        phrase_len = len(phrase_words)
                        
                        if index + phrase_len <= len(words):
                            # NOTE(review): `words` keeps punctuation, so a
                            # token like "Job," will not match the key "job".
                            current_phrase = ' '.join(words[index:index+phrase_len]).lower()
                            if current_phrase == phrase_key:
                                original_phrase, translation = phrase_map[phrase_key]
                                ssml += f"""
                                    <lang xml:lang="{lang}">{original_phrase}</lang>
                                    <break time="300ms"/>
                                    <lang xml:lang="es-ES">{translation}</lang>
                                    <break time="500ms"/>"""
                                index += phrase_len
                                matched = True
                                break
                                
                    # Fallback to single-word matching
                    if not matched:
                        # Match without punctuation but speak the original token.
                        current_word = words[index].strip(".,!?").lower()
                        original_word = words[index]
                        translation = next((tgt for src, tgt in word_pairs if src.lower() == current_word), None)
                        
                        ssml += f"""
                            <lang xml:lang="{lang}">{original_word}</lang>
                            <break time="300ms"/>"""
                        if translation:
                            ssml += f"""
                                <lang xml:lang="es-ES">{translation}</lang>
                                <break time="500ms"/>"""
                        else:
                            # Keep the rhythm even when no translation exists.
                            ssml += """<break time="500ms"/>"""
                        
                        index += 1
                
                ssml += """
                            <break time="1000ms"/>
                        </prosody>
                    </voice>"""
            
            return ssml
    
    
    

    In translation_service.py I've updated this code to find the solution

    
    
        def _format_for_tts(self, word_pairs: list[tuple[str, str]], source_lang: str, target_lang: str) -> str:
            lang_map = {
                'en': 'en-US',
                'de': 'de-DE',
                'es': 'es-ES'
            }
    
            # Make sure to use the correct source language code for each word
            ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
            <voice name="en-US-JennyMultilingualNeural">"""
    
            for source_word, target_word in word_pairs:
                source_word = source_word.strip()
                target_word = target_word.strip()
                
                # Use the correct source language code based on the source_lang parameter
                source_lang_code = lang_map.get(source_lang, 'en-US')
                target_lang_code = lang_map.get(target_lang, 'es-ES')
    
                ssml += f"""
                <lang xml:lang="{source_lang_code}">{source_word}</lang>
                <break time="500ms"/>
                <lang xml:lang="{target_lang_code}">{target_word}</lang>
                <break time="500ms"/>"""
    
            ssml += """
            </voice>
        </speak>"""
            return ssml
    
        async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
            """Translate *text*, synthesize word-pair audio, and build a Translation.

            Prefers the word-pair TTS path (explicit per-word language tags);
            falls back to sentence-only SSML when no pairs could be extracted.

            Raises:
                Exception: wraps any failure in the translation/TTS pipeline.
            """
            try:
                response = self.chat_session.send_message(text)
                generated_text = response.text

                print(f"Generated text from Gemini: {generated_text[:100]}...")

                translations, word_pairs = self._extract_text_and_pairs(generated_text)

                audio_filename = None

                if translations and word_pairs:
                    # Preferred path: per-word language tagging.
                    audio_filename = await self.tts_service.text_to_speech_word_pairs(
                        word_pairs=word_pairs,
                        source_lang=source_lang,
                        target_lang=target_lang,
                        complete_text="\n".join(translations)
                    )
                elif translations:
                    # Fallback: sentence-level SSML only.
                    # NOTE(review): text_to_speech appears to wrap its input in
                    # SSML again -- confirm it accepts a prebuilt document.
                    formatted_ssml = self.tts_service.generate_enhanced_ssml(
                        text="\n".join(translations),
                        source_lang=source_lang,
                        target_lang=target_lang
                    )
                    audio_filename = await self.tts_service.text_to_speech(formatted_ssml)

                if audio_filename:
                    print(f"Successfully generated audio: {audio_filename}")
                else:
                    print("Audio generation failed")

                return Translation(
                    original_text=text,
                    translated_text=generated_text,
                    source_language=source_lang,
                    target_language=target_lang,
                    audio_path=audio_filename,  # already None when synthesis failed
                    translations={"main": translations[0] if translations else generated_text},
                    word_by_word=self._generate_word_by_word(text, generated_text),
                    grammar_explanations=self._generate_grammar_explanations(generated_text)
                )

            except Exception as e:
                print(f"Error in process_prompt: {str(e)}")
                # Chain the original exception so the root cause stays visible.
                raise Exception(f"Translation processing failed: {str(e)}") from e
    
    
        def _extract_text_and_pairs(self, generated_text: str) -> tuple[list[str], list[tuple[str, str, bool]]]:
            """
            Extract both native, colloquial, informal, and formal texts and word pairs from generated text.
            Returns: tuple of ([texts], [(source_word, target_word, is_german)])
            """
            translations = []
            word_pairs = []
            
            # Patterns for German translations
            german_patterns = [
                {
                    'text_pattern': r'German Translation:.*?\* Conversational-native:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-native German-Spanish:\s*"([^"]+)"',
                    'is_german': True
                },
                {
                    'text_pattern': r'\* Conversational-colloquial:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-colloquial German-Spanish:\s*"([^"]+)"',
                    'is_german': True
                },
                {
                    'text_pattern': r'\* Conversational-informal:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-informal German-Spanish:\s*"([^"]+)"',
                    'is_german': True
                },
                {
                    'text_pattern': r'\* Conversational-formal:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-formal German-Spanish:\s*"([^"]+)"',
                    'is_german': True
                }
            ]
            
            # Patterns for English translations
            english_patterns = [
                {
                    'text_pattern': r'English Translation:.*?\* Conversational-native:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-native English-Spanish:\s*"([^"]+)"',
                    'is_german': False
                },
                {
                    'text_pattern': r'English Translation:.*?\* Conversational-colloquial:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-colloquial English-Spanish:\s*"([^"]+)"',
                    'is_german': False
                },
                {
                    'text_pattern': r'English Translation:.*?\* Conversational-informal:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-informal English-Spanish:\s*"([^"]+)"',
                    'is_german': False
                },
                {
                    'text_pattern': r'English Translation:.*?\* Conversational-formal:\s*"([^"]+)"',
                    'pairs_pattern': r'\* word by word Conversational-formal English-Spanish:\s*"([^"]+)"',
                    'is_german': False
                }
            ]
            
            # Combine patterns
            all_patterns = german_patterns + english_patterns
            
            # Extract translations and word pairs
            for pattern_set in all_patterns:
                # Extract text
                text_match = re.search(pattern_set['text_pattern'], generated_text, re.DOTALL | re.IGNORECASE)
                if text_match:
                    translations.append(text_match.group(1).strip())
                
                # Extract word pairs
                pairs_match = re.search(pattern_set['pairs_pattern'], generated_text, re.IGNORECASE)
                if pairs_match:
                    pairs_text = pairs_match.group(1)
                    # More robust word pair extraction
                    pair_matches = re.findall(r'(\S+)\s*\(([^)]+)\)', pairs_text)
                    for source, target in pair_matches:
                        source = source.strip()
                        target = target.strip()
                        if source and target:
                            word_pairs.append((source, target, pattern_set['is_german']))
            
            # Remove duplicates while preserving order
            seen_pairs = set()
            unique_pairs = []
            for pair in word_pairs:
                pair_tuple = (pair[0], pair[1], pair[2])
                if pair_tuple not in seen_pairs:
                    seen_pairs.add(pair_tuple)
                    unique_pairs.append(pair)
            
            return translations, unique_pairs