Search code examples
pythonpython-3.xpython-dateutil

Get the format in dateutil.parse


Is there a way to get the "format" after parsing a date in dateutil? For example, something like:

>>> x = parse("2014-01-01 00:12:12")
datetime.datetime(2014, 1, 1, 0, 12, 12)

x.get_original_string_format()
YYYY-MM-DD HH:MM:SS # %Y-%m-%d %H:%M:%S

# Or, passing the date-string directly
get_original_string_format("2014-01-01 00:12:12")
YYYY-MM-DD HH:MM:SS # %Y-%m-%d %H:%M:%S

Update: I'd like to add a bounty to this question to see if someone could add an answer that would do the equivalent on getting the string-format of a common date-string passed. It can use dateutil if you want, but it doesn't have to. Hopefully we'll get some creative solutions here.


Solution

  • My idea was to:

    1. Create an object that has a list of candidate specifiers you think might be in the date pattern (the more you add, the more possibilities you will get out the other end)
    2. Parse the date string
    3. Create a list of possible specifiers for each element in the string, based on the date and the list of candidates you supplied.
    4. Recombine them to produce a list of 'possibles'.

    If you get only a single candidate, you can be pretty sure it is the right format. But you will often get many possibilities (especially with dates, months, minutes and hours all in the 0-10 range).

    Example class:

    import re
    from itertools import product
    from dateutil.parser import parse
    from collections import defaultdict, Counter
    
    # Default strftime directives tried against each element of a date string.
    # Order matters: it determines the order in which candidates are produced.
    COMMON_SPECIFIERS = [
        '%a',  # abbreviated weekday name
        '%A',  # full weekday name
        '%d',  # zero-padded day of the month
        '%b',  # abbreviated month name
        '%B',  # full month name
        '%m',  # zero-padded month number
        '%Y',  # year with century
        '%H',  # zero-padded hour, 24-hour clock
        '%p',  # locale's AM/PM marker
        '%M',  # zero-padded minute
        '%S',  # zero-padded second
        '%Z',  # time zone name (empty for naive datetimes)
    ]
    
    
    class FormatFinder:
        """Guess which strftime format(s) could have produced a date string.

        The date string is parsed with ``dateutil.parser.parse`` and split into
        alternating "date" and "delimiter" elements. Every date element is
        matched against the text that each candidate specifier produces for
        the parsed date; the matches are then recombined into full format
        strings.
        """

        def __init__(self,
                     valid_specifiers=COMMON_SPECIFIERS,
                     date_element=r'([\w]+)',
                     delimiter_element=r'([\W]+)',
                     ignore_case=False):
            """Configure the finder.

            Args:
                valid_specifiers: iterable of strftime directives to consider.
                date_element: regex (one capture group) matching a date element.
                delimiter_element: regex (one capture group) matching the text
                    between date elements.
                ignore_case: when true, date elements are compared with the
                    strftime output case-insensitively.
            """
            # Copy so that later mutation of the caller's list (or of the shared
            # module-level default) cannot silently change this instance.
            self.specifiers = list(valid_specifiers)
            self.pattern = re.compile(date_element + r"|" + delimiter_element)
            self.ignore_case = ignore_case

        def _lookup_key(self, token):
            """Return the dictionary key under which *token* is matched."""
            # Case-folding both the strftime output and the input token makes
            # any casing match (not only all-upper / all-lower spellings).
            return token.casefold() if self.ignore_case else token

        def find_candidate_patterns(self, date_string):
            """Yield every format string that is consistent with *date_string*.

            Elements that no specifier reproduces (and all delimiters) are
            emitted verbatim. Combinations that would use the same specifier
            more than once are skipped.

            This is a generator: parsing (and any exception raised by the
            parser) happens when iteration starts.
            """
            date = parse(date_string)
            tokens = self.pattern.findall(date_string)

            # Map "text produced by strftime" -> specifiers that produce it.
            candidate_specifiers = defaultdict(list)
            for specifier in self.specifiers:
                key = self._lookup_key(date.strftime(specifier))
                candidate_specifiers[key].append(specifier)

            options_for_each_element = []
            for token, delimiter in tokens:
                if not token:
                    # Only the delimiter alternative matched; keep it literally.
                    options_for_each_element.append([delimiter])
                    continue
                key = self._lookup_key(token)
                if key in candidate_specifiers:
                    options_for_each_element.append(candidate_specifiers[key])
                else:
                    # No specifier yields this text (e.g. the word "at").
                    options_for_each_element.append([token])

            for parts in product(*options_for_each_element):
                counts = Counter(parts)
                # any() instead of max(): max() raises ValueError when the
                # specifier list is empty.
                if any(counts[specifier] > 1 for specifier in self.specifiers):
                    # this is a candidate with the same item used more than once
                    continue
                yield "".join(parts)
    

    And some sample tests:

    def test_it_returns_value_from_question_1():
        """The first example from the question is among the candidates."""
        s = "2014-01-01 00:12:12"
        sut = FormatFinder()
        # The result is a generator; the membership test below consumes it
        # until the expected format is found.
        candidates = sut.find_candidate_patterns(s)
        assert "%Y-%m-%d %H:%M:%S" in candidates
    
    
    def test_it_returns_value_from_question_2():
        """An unambiguous date string yields exactly one candidate format."""
        # Materialise the generator so it can be both searched and counted.
        found = list(FormatFinder().find_candidate_patterns('Jan. 04, 2017'))
        assert "%b. %d, %Y" in found
        assert len(found) == 1
    
    
    def test_it_can_ignore_case():
        """With ignore_case=True, differently-cased names still match."""
        # NB: the original author noted that, in their locale, the AM/PM
        # marker is upper case, so the lower-case 'am' below only matches
        # when case is ignored.
        finder = FormatFinder(ignore_case=True)
        patterns = finder.find_candidate_patterns("JANUARY 12, 2018 02:12 am")
        assert "%B %d, %Y %H:%M %p" in patterns
    
    
    def test_it_returns_parts_that_have_no_date_component_verbatim():
        """Words that no specifier can produce are copied into the format."""
        # 'at' is tokenised as a date element, but none of the specifiers
        # renders to 'at', so it must appear unchanged in the result.
        date_text = "January 12, 2018 at 02:12 AM"
        patterns = FormatFinder().find_candidate_patterns(date_text)
        assert "%B %d, %Y at %H:%M %p" in patterns
    

    To make it a bit clearer, here are some examples of using this code in an IPython shell:

    In [2]: ff = FormatFinder()
    
    In [3]: list(ff.find_candidate_patterns("2014-01-01 00:12:12"))
    Out[3]:
    ['%Y-%d-%m %H:%M:%S',
     '%Y-%d-%m %H:%S:%M',
     '%Y-%m-%d %H:%M:%S',
     '%Y-%m-%d %H:%S:%M']
    
    In [4]: list(ff.find_candidate_patterns("Jan. 04, 2017"))
    Out[4]: ['%b. %d, %Y']
    
    In [5]: list(ff.find_candidate_patterns("January 12, 2018 at 02:12 AM"))
    Out[5]: ['%B %d, %Y at %H:%M %p', '%B %M, %Y at %H:%d %p']
    
    In [6]: ff_without_case = FormatFinder(ignore_case=True)
    
    In [7]: list(ff_without_case.find_candidate_patterns("JANUARY 12, 2018 02:12 am"))
    Out[7]: ['%B %d, %Y %H:%M %p', '%B %M, %Y %H:%d %p']