Search code examples
r reticulate interactive-brokers

How do I make a data frame or tibble in R from data passed using a Python API in a list or environment object?


Using reticulate, I'm getting data from a Python API through Interactive Brokers. I'd like to convert the data passed from my Python API to a data frame or a tibble, but I'm completely baffled about how to get it done. I also cannot figure out how to recreate the data in a reprex, so I'm just providing the text as it appears on my R console.

Here's the data that gets passed to me in the openOrders object from the Python API. It looks like this when I enter the object name at the R prompt:

> openOrders
[[1]]
Order(orderId=16, clientId=501, permId=115804563, action='Buy', totalQuantity=1.0)

[[2]]
Order(orderId=17, clientId=501, permId=115804564, action='SELL', totalQuantity=1.0)

[[3]]
Order(orderId=18, clientId=501, permId=115804565, action='SELL', totalQuantity=1.0)

When I dput openOrders and view the file, I get:

> dput(openOrders) %>% print()
list(<environment>, <environment>, <environment>)
[[1]]

Order(orderId=25, clientId=501, permId=306800005, action='Buy', totalQuantity=1.0)

[[2]]
Order(orderId=26, clientId=501, permId=306800006, action='SELL', totalQuantity=1.0)

[[3]]
Order(orderId=27, clientId=501, permId=306800007, action='SELL', totalQuantity=1.0)

R tells me the data type is a list:

> typeof(openOrders)
[1] "list"

I can access individual entries in the list:

> openOrders[1]
[[1]]

Order(orderId=16, clientId=501, permId=115804563, action='Buy', totalQuantity=1.0)

I can access individual data elements by name in the list using the following:

> openOrders[[1]]$orderId
[1] 16

When I ask for the names in the list, there are 134 of them (here's an excerpt):

> names(openOrders[[1]])
  [1] "account"                        "action"                         "activeStartTime"            
    ...         
[133] "volatilityType"                 "whatIf"

But I cannot figure out how to get the data into a data frame or tibble. Ideally, the data frame or tibble would look like this:

> openOrders

    orderId clientId permId     action totalQuantity
[1] 16      501      115804563  'Buy'  1.0 
[2] 17      501      115804564  'SELL' 1.0
[3] 18      501      115804565  'SELL' 1.0

I've tried the enframe function, as suggested in another post, and I get:

> enframe(openOrders)
# A tibble: 3 x 2
   name value     
  <int> <list>    
1     1 <ib_ns..O>
2     2 <ib_ns..O>
3     3 <ib_ns..O>

I also tried the following from another post and got the error:

x <- as.data.frame(do.call(rbind, openOrders))
Warning: Error in <Anonymous>: environments cannot be coerced to other types

Here's the python code:

from ib_insync import *
import pandas as pd
import numpy as np

# Identify open orders 
def ibOpenOrders():
  """Return the result of ``ib.openOrders()`` on the module-level ``ib`` client.

  ``ib.sleep(0)`` is called after the lookup and before returning.
  """
  open_orders = ib.openOrders()
  ib.sleep(0)
  return open_orders

ib = IB()

And here is my code in R:

# Attach reticulate and point it at the Python 3.7 interpreter.
library(reticulate)
use_python("/usr/local/bin/python3.7")

# Source the Python script, then call its ibOpenOrders() from R.
source_python("iBrokersCallsReprex.py")
openOrders <- ibOpenOrders()

And to get the data for dput, I set up the data in a pandas data frame on the python side before returning it to R. I changed the python code to:

def ibOpenOrders():
  """Return ``ib.openOrders()`` passed through ``util.df``.

  ``ib.sleep(0)`` is called between the lookup and the ``util.df`` conversion.
  """
  open_orders = ib.openOrders()
  ib.sleep(0)
  return util.df(open_orders)

By doing that, dput returns the following:

structure(list(orderId = c(68, 69, 70), clientId = c(500, 500, 
500), permId = c(306801738, 306801739, 306801740), action = c("Buy", 
"SELL", "SELL"), totalQuantity = c(1, 1, 1), orderType = c("LMT", 
"LMT", "STP"), lmtPrice = c(9646.25, 9656.25, 1.79769313486232e+308
), auxPrice = c(1.79769313486232e+308, 1.79769313486232e+308, 
9626.25), tif = c("", "", ""), activeStartTime = c("", "", ""
), activeStopTime = c("", "", ""), ocaGroup = c("", "", ""), 
ocaType = c(0, 0, 0), orderRef = c("", "", ""), transmit = c(FALSE, 
FALSE, TRUE), parentId = c(0, 68, 68), blockOrder = c(FALSE, 
FALSE, FALSE), sweepToFill = c(FALSE, FALSE, FALSE), displaySize =  c(0, 
0, 0), triggerMethod = c(0, 0, 0), outsideRth = c(FALSE, 
FALSE, FALSE), hidden = c(FALSE, FALSE, FALSE), goodAfterTime = c("", 
"", ""), goodTillDate = c("", "", ""), rule80A = c("", "", 
""), allOrNone = c(FALSE, FALSE, FALSE), minQty = c(2147483647, 
2147483647, 2147483647), percentOffset = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308),      overridePercentageConstraints = c(FALSE, 
FALSE, FALSE), trailStopPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), trailingPercent = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), faGroup = c("", 
"", ""), faProfile = c("", "", ""), faMethod = c("", "", 
""), faPercentage = c("", "", ""), designatedLocation = c("", 
"", ""), openClose = c("O", "O", "O"), origin = c(0, 0, 0
), shortSaleSlot = c(0, 0, 0), exemptCode = c(-1, -1, -1), 
discretionaryAmt = c(0, 0, 0), eTradeOnly = c(TRUE, TRUE, 
TRUE), firmQuoteOnly = c(TRUE, TRUE, TRUE), nbboPriceCap = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), optOutSmartRouting = c(FALSE, 
FALSE, FALSE), auctionStrategy = c(0, 0, 0), startingPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), stockRefPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), delta = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), stockRangeLower = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), stockRangeUpper = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), randomizePrice = c(FALSE, 
FALSE, FALSE), randomizeSize = c(FALSE, FALSE, FALSE), volatility = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), volatilityType = c(2147483647, 
2147483647, 2147483647), deltaNeutralOrderType = c("", "", 
""), deltaNeutralAuxPrice = c(1.79769313486232e+308, 1.79769313486232e+308, 
1.79769313486232e+308), deltaNeutralConId = c(0, 0, 0), deltaNeutralSettlingFirm = c("", 
"", ""), deltaNeutralClearingAccount = c("", "", ""), deltaNeutralClearingIntent = c("", 
"", ""), deltaNeutralOpenClose = c("", "", ""), deltaNeutralShortSale = c(FALSE, 
FALSE, FALSE), deltaNeutralShortSaleSlot = c(0, 0, 0), deltaNeutralDesignatedLocation = c("", 
"", ""), continuousUpdate = c(FALSE, FALSE, FALSE), referencePriceType = c(2147483647, 
2147483647, 2147483647), basisPoints = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), basisPointsType = c(2147483647, 
2147483647, 2147483647), scaleInitLevelSize = c(2147483647, 
2147483647, 2147483647), scaleSubsLevelSize = c(2147483647, 
2147483647, 2147483647), scalePriceIncrement = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), scalePriceAdjustValue = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), scalePriceAdjustInterval = c(2147483647, 
2147483647, 2147483647), scaleProfitOffset = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), scaleAutoReset = c(FALSE, 
FALSE, FALSE), scaleInitPosition = c(2147483647, 2147483647, 
2147483647), scaleInitFillQty = c(2147483647, 2147483647, 
2147483647), scaleRandomPercent = c(FALSE, FALSE, FALSE), 
scaleTable = c("", "", ""), hedgeType = c("", "", ""), hedgeParam = c("", 
"", ""), account = c("", "", ""), settlingFirm = c("", "", 
""), clearingAccount = c("", "", ""), clearingIntent = c("", 
"", ""), algoStrategy = c("", "", ""), algoParams = list(
    list(), list(), list()), smartComboRoutingParams = list(
    list(), list(), list()), algoId = c("", "", ""), whatIf = c(FALSE, 
FALSE, FALSE), notHeld = c(FALSE, FALSE, FALSE), solicited = c(FALSE, 
FALSE, FALSE), modelCode = c("", "", ""), orderComboLegs = list(
    list(), list(), list()), orderMiscOptions = list(list(), 
    list(), list()), referenceContractId = c(0, 0, 0), peggedChangeAmount = c(0, 
0, 0), isPeggedChangeAmountDecrease = c(FALSE, FALSE, FALSE
), referenceChangeAmount = c(0, 0, 0), referenceExchangeId = c("", 
"", ""), adjustedOrderType = c("", "", ""), triggerPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), adjustedStopPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), adjustedStopLimitPrice = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), adjustedTrailingAmount = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), adjustableTrailingUnit = c(0, 
0, 0), lmtPriceOffset = c(1.79769313486232e+308, 1.79769313486232e+308, 
1.79769313486232e+308), conditions = list(list(), list(), 
    list()), conditionsCancelOrder = c(FALSE, FALSE, FALSE
), conditionsIgnoreRth = c(FALSE, FALSE, FALSE), extOperator = c("", 
"", ""), softDollarTier = list(<environment>, <environment>, 
    <environment>), cashQty = c(1.79769313486232e+308, 1.79769313486232e+308, 
1.79769313486232e+308), mifid2DecisionMaker = c("", "", ""
), mifid2DecisionAlgo = c("", "", ""), mifid2ExecutionTrader = c("", 
"", ""), mifid2ExecutionAlgo = c("", "", ""), dontUseAutoPriceForHedge = c(FALSE, 
FALSE, FALSE), isOmsContainer = c(FALSE, FALSE, FALSE), discretionaryUpToLimitPrice = c(FALSE, 
FALSE, FALSE), autoCancelDate = c("", "", ""), filledQuantity = c(1.79769313486232e+308, 
1.79769313486232e+308, 1.79769313486232e+308), refFuturesConId = c(0, 
0, 0), autoCancelParent = c(FALSE, FALSE, FALSE), shareholder = c("", 
"", ""), imbalanceOnly = c(FALSE, FALSE, FALSE), routeMarketableToBbo = c(FALSE, 
FALSE, FALSE), parentPermId = c(0, 0, 0), usePriceMgmtAlgo = c(FALSE, 
FALSE, FALSE)), class = "data.frame", row.names = c(NA, -3L
), pandas.index = <environment>)

On the R side, here is the structure of openOrders:

> str(openOrders)

tibble [9 × 130] (S3: tbl_df/tbl/data.frame)
 $ orderId                       : num [1:9] 140 141 142 133 134 132 148 149 150
 $ clientId                      : num [1:9] 500 500 500 500 500 500 500 500 500
 $ permId                        : num [1:9] 1.78e+09 1.78e+09 1.78e+09 1.78e+09 1.78e+09 ...
 $ action                        : chr [1:9] "BUY" "SELL" "SELL" "SELL" ...
 $ isPeggedChangeAmountDecrease  : logi [1:9] FALSE FALSE FALSE FALSE FALSE FALSE ...
  [list output truncated]
 - attr(*, "pandas.index")=RangeIndex(start=0, stop=9, step=1)

Here's what I get when I print the pandas data frame on the python side:

def ibOpenOrders():
  """Debugging variant: print the head of the frame built from the open orders.

  No return statement appears in this excerpt; the lines that follow it are
  pasted console output.
  """
  openOrders = ib.openOrders()
  ib.sleep(0)
  # Left disabled by the author: calls .head() on the raw ib.openOrders() result.
  #print (openOrders.head())
  # Convert with util.df, then print the first rows of the result.
  df = util.df(openOrders)
  print (df.head())
   orderId  clientId  ...  parentPermId usePriceMgmtAlgo
0       13       400  ...             0            False
1       14       400  ...             0            False
2       12       400  ...             0            False
3        7       400  ...             0            False
4        5       400  ...             0            False

And, here's what I get when I print just the softDollarTier attribute on the python side:

print (openOrders.softDollarTier)
[18 rows x 130 columns]
0     SoftDollarTier(name='', val='', displayName='')
1     SoftDollarTier(name='', val='', displayName='')
2     SoftDollarTier(name='', val='', displayName='')
3     SoftDollarTier(name='', val='', displayName='')

And here's what I get on the R side for that same attribute when I access it directly.

> head(openOrders$softDollarTier)
[[1]]
SoftDollarTier(name='', val='', displayName='')

[[2]]
SoftDollarTier(name='', val='', displayName='')

[[3]]
SoftDollarTier(name='', val='', displayName='')

Any ideas?


Solution

  • As discussed, as long as the Python method returns an actual Pandas data frame, reticulate will convert it to an R data frame. To demonstrate with a reproducible example of various atomic types:

    Python

    import numpy as np
    import pandas as pd
    
    # Character pool for the random 3-character strings.
    alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    # Category labels sampled for the 'group' column.
    data_tools = ['sas', 'stata', 'spss', 'python', 'r', 'julia']

    ### DATA BUILD
    def build_py_df():
        """Build a 500-row DataFrame with one column per basic type.

        The NumPy global seed is fixed at 6520 first, and the columns are
        drawn in the order group, int, num, char, bool, date.
        """
        np.random.seed(6520)
        n_rows = 500
        pool = list(alpha)

        # Dict entries are evaluated top to bottom, so the sequence of RNG
        # draws matches the column order listed here.
        columns = {
            'group': np.random.choice(data_tools, n_rows),
            'int': np.random.randint(1, 10, n_rows),
            'num': np.random.randn(n_rows),
            'char': [''.join(np.random.choice(pool, 3)) for _ in range(n_rows)],
            'bool': np.random.choice([True, False], n_rows),
            'date': np.random.choice(pd.date_range('2000-01-01', '2019-05-31'), n_rows),
        }
        return pd.DataFrame(columns)
    
    df = build_py_df()
    
    print(df.head(10))
    

    Output

    #     group  int       num char   bool       date
    # 0       r    8 -0.604529  eNR   True 2008-09-01
    # 1   stata    7  0.875878  0G9   True 2004-07-13
    # 2    spss    4 -0.857370  mrH  False 2017-11-29
    # 3   stata    6 -2.144899  MFj   True 2003-03-03
    # 4   stata    3 -0.408117  Gsh   True 2008-11-28
    # 5   stata    2  1.324790  gR0   True 2004-04-15
    # 6   julia    6  0.682228  jhR   True 2004-09-18
    # 7  python    6 -0.993106  cqT   True 2002-03-27
    # 8   julia    5 -0.346687  GfC   True 2007-04-30
    # 9       r    7  0.925665  d1a   True 2006-01-01
    

    R

    # Attach reticulate so source_python() is available.
    library(reticulate)

    # Source the Python script that defines build_py_df().
    source_python("/path/to/Python/script.py")

    # Call the Python function from R and preview the first ten rows.
    py_df <- build_py_df()
    head(py_df, 10)
    

    Output

    #     group int        num char  bool       date
    # 1       r   8 -0.6045292  eNR  TRUE 2008-09-01
    # 2   stata   7  0.8758784  0G9  TRUE 2004-07-13
    # 3    spss   4 -0.8573697  mrH FALSE 2017-11-29
    # 4   stata   6 -2.1448990  MFj  TRUE 2003-03-03
    # 5   stata   3 -0.4081175  Gsh  TRUE 2008-11-28
    # 6   stata   2  1.3247895  gR0  TRUE 2004-04-15
    # 7   julia   6  0.6822280  jhR  TRUE 2004-09-18
    # 8  python   6 -0.9931057  cqT  TRUE 2002-03-27
    # 9   julia   5 -0.3466866  GfC  TRUE 2007-04-30
    # 10      r   7  0.9256647  d1a  TRUE 2006-01-01
    

    For metadata

    str(py_df)
    # 'data.frame': 500 obs. of  6 variables:
    #  $ group: chr  "r" "stata" "spss" "stata" ...
    #  $ int  : num  8 7 4 6 3 2 6 6 5 7 ...
    #  $ num  : num  -0.605 0.876 -0.857 -2.145 -0.408 ...
    #  $ char : chr  "eNR" "0G9" "mrH" "MFj" ...
    #  $ bool : logi  TRUE TRUE FALSE TRUE TRUE TRUE ...
    #  $ date : POSIXct, format: "2008-09-01" "2004-07-13" "2017-11-29" "2003-03-03" ...
    #  - attr(*, "pandas.index")=RangeIndex(start=0, stop=500, step=1)
    
    attributes(py_df)
    # $`names`
    # [1] "group" "int"   "num"   "char"  "bool"  "date" 
    
    # $class
    # [1] "data.frame"
    
    # $row.names
    # [1]   1   2   3   4   5   6  ...
    # (remaining row names omitted)
    
    # $pandas.index
    # RangeIndex(start=0, stop=500, step=1)
    
    attributes(py_df)$pandas.index
    # RangeIndex(start=0, stop=500, step=1)