Search code examples
python regex python-2.7 mime

How to remove the first four lines and the last 12 lines from a file in Python?


        # Excerpt from a larger function: host, port, uri, body and boundary
        # are defined outside the lines shown here.
        h = httplib.HTTPSConnection(host, port)
        h.set_debuglevel(0)

        # Headers for the multipart/form-data POST; the boundary must match
        # the one used to build `body`.
        headers = {

        "Content-Type": "multipart/form-data; boundary=%s" % (boundary,),

        "Connection": "Keep-Alive",

        }

        h.request('POST', uri, body, headers)
        res = h.getresponse()
        #print res.read()
        # Prepend MIME headers to the response body so that the email parser
        # below can treat it as a multipart message.
        # NOTE(review): the continuation lines of this literal keep their
        # source indentation, so the second header line starts with spaces
        # at runtime -- confirm the parser handles that as intended.
        data = """MIME-Version: 1.0
        Content-Type: multipart/mixed; boundary=--Nuance_NMSP_vutc5w1XobDdefsYG3wq
        """ + res.read()

        msg = email.message_from_string(data)
        #print msg

        # Visit every part of the parsed message, numbering them from 1.
        for index, part in enumerate(msg.walk(), start=1):
            content_type = part.get_content_type()
            #print content_type
            payload = part.get_payload()
            # The HTTP response headers are printed once per visited part.
            print res.getheaders()

            # Only non-empty audio parts are written to disk.
            if content_type == "audio/x-wav" and len(payload):
                # NOTE(review): 'output.pcm' contains no {} placeholder, so
                # .format(index) leaves the name unchanged and every audio
                # part is written to the same file.
                with open('output.pcm'.format(index), 'wb') as f_pcm:
                    # Prints the return value of write(), not the payload.
                    print f_pcm.write(payload)

I am sending a request to the server, and the server sends a response back to the client, which I save as a .txt file as shown above. The file contains a text header at the top and another text section at the bottom; everything in between is binary data.

How can I parse the response so that the text is written to a separate .txt file and the binary data to a .pcm file?


Solution

  • The recommended approach is to use Python's email library to decode the MIME multipart response, as in the following script:

    import ssl
    import os
    import json
    import email
    import uuid
    from io import BytesIO
    import httplib
    
    
    # Absolute location of this script; its directory is the base folder.
    _script_path = os.path.abspath(__file__)
    input_folder = os.path.dirname(_script_path)
    # Generated files are placed beneath an 'output' directory in that folder.
    output_folder = os.path.join(input_folder, 'output')
    
    def get_filename(ext, base, sub_folder):
        """Build the path of an output file.

        Args:
            ext: File extension without the leading dot.
            base: File name without extension.
            sub_folder: Directory name beneath the module-level output_folder.

        Returns:
            The path ``<output_folder>/<sub_folder>/<base>.<ext>``.
        """
        target_folder = os.path.join(output_folder, sub_folder)
        return os.path.join(target_folder, '{}.{}'.format(base, ext))
    
    def compare_files(file1, file2):
        """Print whether two files have identical contents.

        Both files are opened in binary mode, read completely and compared
        byte for byte.

        Args:
            file1: Path of the first file.
            file2: Path of the second file.

        Returns:
            None. A report headed ``Same:`` or ``Different:`` followed by the
            two paths is written to standard output.
        """
        with open(file1, 'rb') as f_file1, open(file2, 'rb') as f_file2:
            identical = f_file1.read() == f_file2.read()

        verdict = 'Same' if identical else 'Different'
        # A single parenthesised argument produces the same output with the
        # Python 2 print statement and with the Python 3 print function.
        print('{}:\n  {}\n  {}'.format(verdict, file1, file2))
    
    class Part(object):
        """Represent a part in a multipart message.

        Attributes (all taken from the constructor arguments):
            name: Value of the ``name`` field in Content-Disposition.
            paramName: Optional value of the ``paramName`` field; the field is
                omitted when this is None or empty.
            contentType: Value of the part's Content-Type header.
            data: Body of the part, as text or as a byte string.
        """

        def __init__(self, name, contentType, data, paramName=None):
            super(Part, self).__init__()
            self.name = name
            self.paramName = paramName
            self.contentType = contentType
            self.data = data

        @staticmethod
        def _to_bytes(value):
            """Return text encoded as UTF-8; pass any other value through."""
            # type(u'') is `unicode` on Python 2 and `str` on Python 3, i.e.
            # exactly the text type that BytesIO.write() refuses to accept.
            if isinstance(value, type(u'')):
                return value.encode('utf-8')
            return value

        def encode(self):
            """Serialise the part as bytes: headers, a blank line, then data.

            Returns:
                A byte string holding the Content-Disposition and Content-Type
                header lines (CRLF terminated), an empty line and the data.
                No line ending is appended after the data.
            """
            if self.paramName:
                disposition = 'Content-Disposition: form-data; name="%s"; paramName="%s"\r\n' % (self.name, self.paramName)
            else:
                disposition = 'Content-Disposition: form-data; name="%s"\r\n' % (self.name,)

            body = BytesIO()
            body.write(self._to_bytes(disposition))
            body.write(self._to_bytes("Content-Type: %s\r\n" % (self.contentType,)))
            body.write(b"\r\n")
            body.write(self._to_bytes(self.data))
            return body.getvalue()
    
    class Request(object):
        """Collect parts and serialise them as a multipart/form-data body.

        Attributes:
            parameters: List of the parts added so far, in insertion order.
                Each entry must provide an ``encode()`` method returning the
                part as a byte string.
        """

        def __init__(self):
            super(Request, self).__init__()
            self.parameters = []

        def add_json_parameter(self, name, paramName, data):
            """Append a part with content type application/json (UTF-8)."""
            self.parameters.append(Part(name=name, paramName=paramName, contentType="application/json; charset=utf-8", data=data))

        def add_audio_parameter(self, name, paramName, data):
            """Append a part with the 16-bit, 16 kHz PCM audio content type."""
            self.parameters.append(Part(name=name, paramName=paramName, contentType="audio/x-wav;codec=pcm;bit=16;rate=16000", data=data))

        def encode(self):
            """Serialise all parts using a freshly generated boundary.

            Returns:
                A tuple ``(body, boundary)``: the request body as a byte
                string and the boundary as text, ready to be placed in the
                Content-Type header of the request.
            """
            boundary = uuid.uuid4().hex
            # The boundary consists of hexadecimal digits only, so encoding it
            # as ASCII cannot fail. Writing bytes (not text) is what BytesIO
            # requires.
            delimiter = ("--%s" % (boundary,)).encode("ascii")
            body = BytesIO()

            for parameter in self.parameters:
                body.write(delimiter + b"\r\n")
                body.write(parameter.encode())
                body.write(b"\r\n")

            # The closing delimiter carries two extra dashes.
            body.write(delimiter + b"--\r\n")
            return body.getvalue(), boundary
    
    
    def _ensure_parent_folder(path):
        """Create the directory that will contain path if it does not exist."""
        folder = os.path.dirname(path)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)


    def get_tts(required_text, LNG):
        """Request synthesised speech for a text and save the server's reply.

        The text is posted to the TTS servlet as a multipart/form-data
        request. The reply is parsed as a MIME multipart message: every
        non-empty ``audio/x-wav`` part is written to a ``.pcm`` file and
        compared with a reference copy (which is created the first time),
        and every ``application/json`` part is written to a ``.json`` file.
        File names are derived from the first 80 characters of the text, with
        every non-alphanumeric character replaced by an underscore.

        Args:
            required_text: Text to be spoken; surrounding whitespace is
                stripped.
            LNG: Language code, ``"ENG"`` or ``"GED"``. It is also passed to
                get_filename() as the output sub-folder.

        Raises:
            ValueError: If LNG is not one of the supported language codes.
        """
        # Resolve the language before doing any work, so that an unknown code
        # fails with a clear message instead of an unbound local variable.
        language_parameters = {
            "ENG": {'lang' : 'eng_GBR', 'location' : '47.4925, 19.0513'},
            "GED": {'lang' : 'deu-DEU', 'location' : '48.396231, 9.972909'},
        }
        if LNG not in language_parameters:
            raise ValueError("Unsupported language code %r; expected one of: %s" % (
                LNG, ", ".join(sorted(language_parameters))))
        parameters = language_parameters[LNG]

        required_text = required_text.strip()
        output_filename = "".join([x if x.isalnum() else "_" for x in required_text[:80]])

        host = "mtldev08.nuance.com"
        port = 443
        uri = "/NmspServlet/"

        # NOTE(review): the application key and id are hard-coded below;
        # consider loading credentials from configuration instead.
        RequestData = """{
            "appKey": "9c9fa7201e90d3d96718bc3f36ce4cfe1781f2e82f4e5792996623b3b474fee2c77699eb5354f2136063e1ff19c378f0f6dd984471a38ca5c393801bffb062d6",
            "appId": "NMDPTRIAL_AutomotiveTesting_NCS61HTTP",
            "uId": "Alexander",
            "inCodec": "PCM_16_8K",
            "outCodec": "PCM_16_8K",
            "cmdName": "NVC_TTS_CMD",
            "appName": "Python",
            "appVersion": "1",
            "language": "%(lang)s",
            "carrier": "carrier",
            "deviceModel": "deviceModel",
            "cmdDict": {
                "tts_voice": "Serena",
                "tts_language": "%(lang)s",
                "locale": "canada",
                "application_name": "Testing Python Script",
                "organization_id": "NUANCE",
                "phone_OS": "4.0",
                "phone_network": "wifi",
                "audio_source": "SpeakerAndMicrophone",
                "location": "%(location)s",
                "application_session_id": "1234567890",
                "utterance_number": "5",
                "ui_langugage": "en",
                "phone_submodel": "nmPhone2,1",
                "application_state_id": "45"        
            }
        }""" % (parameters)

        # json.dumps takes care of quoting and escaping the requested text.
        TEXT_TO_READ = json.dumps({"tts_type": "text", "tts_input": required_text})

        request = Request()
        request.add_json_parameter("RequestData", None, RequestData)
        request.add_json_parameter("TtsParameter", "TEXT_TO_READ", TEXT_TO_READ)

        # Certificate verification can be disabled for testing with:
        #ssl._create_default_https_context = ssl._create_unverified_context
        body, boundary = request.encode()
        headers = {
            "Content-Type": "multipart/form-data; boundary=%s" % (boundary,),
            "Connection": "Keep-Alive",
        }

        h = httplib.HTTPSConnection(host, port)
        try:
            h.request('POST', uri, body, headers)
            res = h.getresponse()
            response_body = res.read()
        finally:
            # Release the socket whether or not the request succeeded.
            h.close()

        # Top-level MIME headers are prepended so that the email parser
        # treats the response body as a multipart message. The header text is
        # spelled out with explicit newlines so that it does not depend on
        # how this source file is indented.
        # NOTE(review): the boundary is hard-coded -- confirm that the server
        # always uses this value.
        data = (
            "MIME-Version: 1.0\n"
            "Content-Type: multipart/mixed; boundary=--Nuance_NMSP_vutc5w1XobDdefsYG3wq\n"
        ) + response_body

        msg = email.message_from_string(data)

        for part in msg.walk():
            content_type = part.get_content_type()
            payload = part.get_payload()

            if content_type == "audio/x-wav" and payload:
                ref_filename = get_filename('pcm', output_filename + '_ref', LNG)
                cur_filename = get_filename('pcm', output_filename, LNG)
                # Both files live in the same folder, which may not exist yet
                # on the first run.
                _ensure_parent_folder(cur_filename)

                # The first result for a text becomes its reference copy.
                if not os.path.exists(ref_filename):
                    with open(ref_filename, 'wb') as f_pcm:
                        f_pcm.write(payload)

                with open(cur_filename, 'wb') as f_pcm:
                    f_pcm.write(payload)

                compare_files(ref_filename, cur_filename)

            elif content_type == "application/json":
                json_filename = get_filename('json', output_filename, LNG)
                _ensure_parent_folder(json_filename)
                with open(json_filename, 'w') as f_json:
                    f_json.write(payload)
    
    
    # Input file with one request per line in the form <LNG>|<text>.
    filename = r'input.txt'

    with open(filename) as f_input:
        for line in f_input:
            line = line.strip()
            if not line:
                # Blank lines (such as a trailing empty line) hold no request.
                continue
            # Split at the first '|' only, so the text may itself contain '|'.
            LNG, text = line.split('|', 1)
            # A single parenthesised argument prints identically on Python 2.
            print("Getting {}: {}".format(LNG, text))
            get_tts(text, LNG)
    

    This assumes your input.txt file has the following format:

    ENG|I am tired
    GED|Ich gehe nach hause
    

    This will produce an output pcm and json file per line of text. It works with multiple files/languages.