Search code examples
pythonhivesqlalchemyprestopyhive

python - presto - timestamps and decimal(38,18) returned as strings?


Why are presto timestamp/decimal(38,18) data types returned a string (enclosed in u'') instead of python datetime/numeric types?

presto jdbc:

select typeof(col1),typeof(col2),typeof(col3),typeof(col4),typeof(col5),typeof(col6) from hive.x.y

result is

timestamp timestamp bigint decimal(38,18) varchar varchar

desc hive.x.y
#result is
for_dt  timestamp   NO  NO  NO  NO  1
for_d   timestamp   NO  NO  NO  NO  2
for_h   bigint  NO  NO  NO  NO  3
value   decimal(38,18)  NO  NO  NO  NO  4
metric  varchar(2147483647) NO  NO  NO  NO  5
lat_lon varchar(2147483647) NO  NO  NO  NO  6

attempt 1
#python
from sqlalchemy.engine import create_engine
engine = create_engine('presto://u:p@host:port',connect_args={'protocol': 'https', 'requests_kwargs': {'verify': 'mypem'}})
result = engine.execute('select * from hive.x.y limit 1')
print(result.fetchall())
#result is
[(u'2010-02-18 03:00:00.000', u'2010-02-18 00:00:00.000', 3, u'-0.191912651062011660', u'hey', u'there')]

attempt 2
#python
from pyhive import presto
import requests
from requests.auth import HTTPBasicAuth

req_kw = {
'verify': 'mypem',
'auth': HTTPBasicAuth('u', 'p')
}

cursor = presto.connect(
host='host',
port=port,
protocol='https',
username='u',
requests_kwargs=req_kw,
).cursor()

query = '''select * from x.y limit 1'''
cursor.execute(query)
print cursor.fetchall()
#result is
[(u'2010-02-18 03:00:00.000', u'2010-02-18 00:00:00.000', 3, u'-0.191912651062011660', u'hey', u'there')]

Solution

  • The output you are getting from your sql query comes from the database in that format.

    You have two choices

    1. Map the Data Yourself (Write Your Own ORM)
    2. Learn to use the ORM

    Option 1

    Note I've just hardcoded your query result in here for my testing.

    from sqlalchemy.engine import create_engine
    from datetime import datetime
    from decimal import Decimal
    
    # 2010-02-18 03:00:00.000
    dateTimeFormat = "%Y-%m-%d %H:%M:%S.%f"
    
    class hivexy:
        def __init__(self, for_dt, for_d, for_h, value, metric, lat_lon):
            self.for_dt = for_dt
            self.for_d = for_d
            self.for_h = for_h
            self.value = value
            self.metric = metric
            self.lat_lon = lat_lon
    
        # Pretty Printing on print(hivexy)
        def __str__(self):
            baseString =  ("for_dt: {}\n"
                           "for_d: {}\n"
                           "for_h: {}\n"
                           "value: {}\n"
                           "metric: {}\n"
                           "lat_lon: {}\n")
            return baseString.format(for_dt, for_d, for_h, value, metric, lat_lon)
    
    #engine = create_engine('presto://u:p@host:port',connect_args={'protocol': 'https', 'requests_kwargs': {'verify': 'mypem'}})
    #results = engine.execute("select * from 'hive.x.y' limit 1")
    results = [(u'2010-02-18 03:00:00.000', u'2010-02-18 00:00:00.000', 3, u'-0.191912651062011660', u'hey', u'there')]
    
    hiveObjects = []
    
    for row in results:
        for_dt = datetime.strptime(row[0], dateTimeFormat)
        for_d = datetime.strptime(row[1], dateTimeFormat)
        for_h = row[2]
        value = Decimal(row[3])
        metric = row[4]
        lat_lon = row[5]
    
        hiveObjects.append(hivexy(for_dt, for_d, for_h, value, metric, lat_lon))
    
    for hiveObject in hiveObjects:
        print(hiveObject)
    

    Option 2 This uses reflection - it queries the database metadata for field types so you don't have to do all that stuff in option 1.

    from sqlalchemy import *
    from sqlalchemy.engine import create_engine
    from sqlalchemy.schema import *
    
    engine = create_engine('presto://u:p@host:port',connect_args={'protocol': 'https', 'requests_kwargs': {'verify': 'mypem'}})
    
    # Reflection - SQLAlchemy will get metadata from database including field types
    hiveXYTable = Table('hive.x.y', MetaData(bind=engine), autoload=True)
    s = select([hiveXYTable]).limit(1)
    results = engine.execute(s)
    
    for row in results:
        print(row)