Tags: python, nlp, nltk, tokenize

How to ignore punctuation in-between words using word_tokenize in NLTK?


I'm looking to ignore punctuation in-between words when using NLTK's word_tokenize.

If I have a sentence:

test = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email [email protected]'

The word_tokenize method splits "S&P?" into

'S', '&', 'P', '?'

Is there a way to have this library ignore punctuation between words or letters? Expected output: 'S&P', '?'
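
For reference, the default behavior can be reproduced like this (a quick sketch; word_tokenize needs the punkt tokenizer models downloaded):

from nltk.tokenize import word_tokenize

word_tokenize('Should I trade on the S&P?')
# ['Should', 'I', 'trade', 'on', 'the', 'S', '&', 'P', '?']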


Solution

  • Let me know how this works with your sentences.
    I added an additional test with a bunch of punctuation.
    The final portion of the regular expression is adapted from the WordPunctTokenizer regexp.

    from nltk.tokenize import RegexpTokenizer
    
    # One optional punctuation character is allowed between word characters, so
    # tokens like S&P stay together; anything else falls through to the
    # single non-whitespace character alternative.
    punctuation = r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]?'
    tokenizer = RegexpTokenizer(r'\w+' + punctuation + r'\w+?|[^\s]+?')
    
    # result: 
    In [156]: tokenizer.tokenize(test)
    Out[156]: ['Should', 'I', 'trade', 'on', 'the', 'S&P', '?']
    
    # additional test:
    In [225]: tokenizer.tokenize('"I am tired," she said.')
    Out[225]: ['"', 'I', 'am', 'tired', ',', '"', 'she', 'said', '.']
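    # Limitation (illustrative input, not from the original answer): only one
    # punctuation character is allowed between word characters, so longer
    # punctuated runs such as phone numbers get fragmented:
    In [226]: tokenizer.tokenize('call 333-445-6635')
    Out[226]: ['call', '333-4', '45-6', '635']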
    

    Edit: the requirements changed a bit, so we can slightly modify the PottsTweetTokenizer for this purpose.

    import re
    import html.entities

    emoticon_string = r"""
        (?:
          [<>]?
          [:;=8]                     # eyes
          [\-o\*\']?                 # optional nose
          [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
          |
          [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
          [\-o\*\']?                 # optional nose
          [:;=8]                     # eyes
          [<>]?
        )"""
    # Twitter symbols/cashtags:  # Added by awd, 20140410.
    # Based upon Twitter's regex described here: <https://blog.twitter.com/2013/symbols-entities-tweets>.
    cashtag_string = r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)"""
    
    # The components of the tokenizer:
    regex_strings = (
        # Phone numbers:
        r"""
        (?:
          (?:            # (international)
            \+?[01]
            [\-\s.]*
          )?            
          (?:            # (area code)
            [\(]?
            \d{3}
            [\-\s.\)]*
          )?    
          \d{3}          # exchange
          [\-\s.]*   
          \d{4}          # base
        )"""
        ,
        # Emoticons:
        emoticon_string
        ,
        # HTML tags:
        r"""(?:<[^>]+>)"""
        ,
        # URLs (t.co links; note the escaped dot):
        r"""(?:https?://t\.co/[a-zA-Z0-9]+)"""
        ,
        # Twitter username:
        r"""(?:@[\w_]+)"""
        ,
        # Twitter hashtags:
        r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
        ,
        # Twitter symbols/cashtags:
        cashtag_string
        ,
        # email addresses
        r"""(?:[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-])""",
        # Remaining word types:
        r"""
        (?:[a-z][^\s]+[a-z])           # Words with punctuation (modification here).
        |
        (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
        |
        (?:[\w_]+)                     # Words without apostrophes or dashes.
        |
        (?:\.(?:\s*\.){1,})            # Ellipsis dots. 
        |
        (?:\S)                         # Everything else that isn't whitespace.
        """
        )
    word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
    # The emoticon and cashtag strings get their own regex so that we can preserve case for them as needed:
    emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
    cashtag_re = re.compile(cashtag_string, re.VERBOSE | re.I | re.UNICODE)
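    
    # Illustrative sanity check (the sample strings are assumptions, not from
    # the question): an emoticon and a cashtag should each match its regex.
    assert emoticon_re.search(':D') is not None
    assert cashtag_re.search('$AAPL') is not None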
    
    # These are for regularizing HTML entities to Unicode:
    html_entity_digit_re = re.compile(r"&#\d+;")
    html_entity_alpha_re = re.compile(r"&\w+;")
    amp = "&amp;"
    
    class CustomTweetTokenizer(object):
        def __init__(self, *, preserve_case: bool=False):
            self.preserve_case = preserve_case
    
        def tokenize(self, tweet: str) -> list:
            """
            Argument: tweet -- any string object.
            Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=True
            """
            # Fix HTML character entities:
            tweet = self._html2unicode(tweet)
            # Tokenize:
            matches = word_re.finditer(tweet)
            if self.preserve_case:
                return [match.group() for match in matches]
            return [self._normalize_token(match.group()) for match in matches]
    
        @staticmethod
        def _normalize_token(token: str) -> str:
    
            if emoticon_re.search(token):
                # Avoid changing emoticons like :D into :d
                return token
            if token.startswith('$') and cashtag_re.search(token):
                return token.upper()
            return token.lower()
    
        @staticmethod
        def _html2unicode(tweet: str) -> str:
            """
            Internal method that seeks to replace all the HTML entities in
            tweet with their corresponding unicode characters.
            """
            # First the digits:
            ents = set(html_entity_digit_re.findall(tweet))
            if len(ents) > 0:
                for ent in ents:
                    entnum = ent[2:-1]
                    try:
                        entnum = int(entnum)
                        tweet = tweet.replace(ent, chr(entnum))
                    except ValueError:
                        pass
            # Now the alpha versions:
            ents = set(html_entity_alpha_re.findall(tweet))
            ents = filter((lambda x: x != amp), ents)
            for ent in ents:
                entname = ent[1:-1]
                try:
                    tweet = tweet.replace(ent, chr(html.entities.name2codepoint[entname]))
                except KeyError:
                    pass
            # Replace &amp; once, outside the loop, so it is handled even when
            # no other alpha entities are present:
            tweet = tweet.replace(amp, " and ")
            return tweet
    

    To test it out:

    tknzr = CustomTweetTokenizer(preserve_case=True)
    tknzr.tokenize(test)
    
    # result:
    ['Should',
     'I',
     'trade',
     'on',
     'the',
     'S&P',
     '?',
     'This',
     'works',
     'with',
     'a',
     'phone',
     'number',
     '333-445-6635',
     'and',
     'email',
     '[email protected]']
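
    The normalization path can be checked too; a minimal sketch (the input
    string and the expected output are illustrative assumptions, not from the
    original question):

    tknzr = CustomTweetTokenizer(preserve_case=False)
    tknzr.tokenize('LOL :D check $tsla &amp; more')
    
    # result: the emoticon keeps its case, the cashtag is uppercased,
    # &amp; becomes "and", and everything else is lowercased:
    ['lol', ':D', 'check', '$TSLA', 'and', 'more']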