I was testing a CSV download of my app events using the API. I noticed that the CSV had different event counts for different calls for the same time period. All data (for each download) was correct for my app and the requested time periods. Does anyone knows if they sample the data to create the file for download?
Edited to include sample call, code for extraction, and result for 2 calls for the same time period.
Call
str_init = '20191101'
str_end = '20191102'
# Call data extraction for Flurry from IOS app
get_csv_from_flurry(str_init, str_end, 'IOS')
Code for Extraction
from datetime import datetime
from dateutil import parser
import requests
import json
import time
from functions.ribon_path import ribon_root_path_join
from functions.ribon_s3_integration import ribon_upload_to_s3
"""
Make CSV extraction from flurry based on initial date (yyyy-mm-dd), end date (yyyy-mm-dd) and platform
Save Uncompressed CSV locally for processing
Save compressed file (parquet) to S3 for backup
"""
def get_csv_from_flurry(str_ini, str_end, str_platform):
# Convert time period to datetime format
dt_ini = parser.parse(str_ini)
dt_end = parser.parse(str_end)
def unix_time_millis(dt):
# Convert date periods to unix milisecon epoch
epoch = datetime.utcfromtimestamp(0)
return (dt - epoch).total_seconds() * 1000.0
epoch_ini = unix_time_millis(dt_ini)
epoch_end = unix_time_millis(dt_end)
#print(epoch_ini)
#print(epoch_fim)
if str_platform == 'IOS' :
Flurry_apiKey = 'XXX'
else :
Flurry_apiKey = 'YYY'
# Build the parameters of the post request to the flurry API
url = 'https://rawdata.flurry.com/pulse/v1/rawData'
payload = {"data": {
"type":"rawData",
"attributes":{
"startTime": epoch_ini,
"endTime": epoch_end,
"outputFormat": "CSV",
"apiKey": Flurry_apiKey
}
}
}
headers = {"accept": "application/vnd.api+json",
"authorization": "Bearer ZZZ",
"cache-control": "no-cache",
"content-type": "application/vnd.api+json"
}
#print(payload)
# Make the request
print('Make Request to Flurry')
r = requests.post(url, data=json.dumps(payload), headers=headers)
#print(r.content)
# Test the return, get the status, download url and request id
test = r.json()
#print(teste['data']['attributes']['s3URI'])
#print(teste['data']['id'])
r_s3URI = test['data']['attributes']['s3URI']
r_id = test['data']['id']
# Check if the download link is ready
url = 'https://rawdata.flurry.com/pulse/v1/rawData/' + r_id + '?fields[rawData]=requestStatus,s3URI'
#print(url)
payload = {}
headers = {"accept": "application/vnd.api+json",
"authorization": "Bearer ZZZ",
"cache-control": "no-cache",
"content-type": "application/vnd.api+json"
}
print('Request OK')
# Check each minute if the download link is ready
print('Start Pooling to Check if the File is Ready for Download')
while r_s3URI == None:
time.sleep(60)
# Make the request
r = requests.get(url, data=json.dumps(payload), headers=headers)
print(r.content)
test = r.json()
#print(test['data']['attributes']['s3URI'])
r_s3URI = test['data']['attributes']['s3URI']
# When the download is ready, get the file and save
# Set local folder to save file
flurry_filename = str_ini + '_' + str_end + '_' + str_platform + '.csv.gz'
flurry_path_gz = ribon_root_path_join('data', 'Flurry_Download', flurry_filename)
# Download the file
print('Start Flurry Download')
myfile = requests.get(r_s3URI)
open(flurry_path_gz, 'wb').write(myfile.content)
With the help from Flurry Support, I found out the differences. For API downloads older than 15 days, the API calls are giving the same numbers every time. API calls for dates up to 15 days most times get different results (newer calls with more records). The older the call the smaller the difference, so I agree with the support that this can be accounted for late arriving events. Flurry is not online and works by queuing data on the mobile and dumping that to the server.