Search code examples
rsql-serverjdbcohdsi-omop

How to fix connection error seen when running Synthea ETL tool but not seen running DQD?


I can run DQD using the connection parameters shown below but I get an SSL error if I try to use connect() and I get the same error trying to use the synthea etl tool.

Full script is below.

Any help would be greatly appreciated.

Connection Parameters

dbms <- "sql server"
user <- "synthea_user" 
password <- "sneeker" 
server <- "localhost" 
port <- "1433"
pathToDriver <- pathToDriver
extraSettings <- ";databaseName=master;integratedSecurity=true;encrypt=false"

Error

Error in rJava::.jcall(jdbcDriver, "Ljava/sql/Connection;", "connect",  : 
  com.microsoft.sqlserver.jdbc.SQLServerException: The driver could not establish a secure connection to SQL Server by using Secure Sockets Layer (SSL) encryption. Error: "PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target". ClientConnectionId:1799f03c-5ddf-4498-9670-6fcf87f557ac

Full Script

# local files
pathToDriver <- "D:\\NACHC\\SYNTHEA\\DQD\\resources\\jar\\sqlserver-jar"  # location of the mssql-jdbc-10.2.0.jre8.jar
outputFolder <- "D:\\NACHC\\SYNTHEA\\DQD\\output"                         # location where output file will be written
outputFile <- "results.json"                                              # file for results json

# database connectivity
dbms <- "sql server"
user <- "synthea_user" 
password <- "sneeker" 
server <- "localhost" 
port <- "1433"
pathToDriver <- pathToDriver
extraSettings <- ";databaseName=master;integratedSecurity=true;encrypt=false"

# database schemas
cdmDatabaseSchema <- "synthea_micro.dbo"                       # your omop instance
resultsDatabaseSchema <- "synthea_micro_dqd_results.dbo"       # instance where results will be written
cdmSourceName <- "SYNTHEA_MICRO_Test_Database"                 # a human readable name for your CDM source

# config parameters
numThreads <- 1       # number of threads to run, 3 seems to work well on Redshift
sqlOnly <- FALSE      # set to TRUE if you just want to get the SQL scripts and not actually run the queries
verboseMode <- TRUE   # set to TRUE if you want to see activity written to the console
writeToTable <- TRUE  # set to FALSE if you want to skip writing to a SQL table in the results schema

# dqd parameters
checkLevels <- c("TABLE", "FIELD", "CONCEPT")  # which DQ check levels to run
checkNames <- c()                              # which DQ checks to run, names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
tablesToExclude <- c()                         # which CDM tables to exclude?


# ---
#
# run-dqd.r
#
# Run the init-parameters.r script before running this script.   
#
# Script to run the Data Quality Dashboard (DQD)
# 
# The output of this script is a single JSON file that can then be rendered with the run-dqd.r script 
# to view the results as a web page in a browser.  
# 
# Prior to running this script you will need to download and install the jdbc driver For MS Sql Server.  
# This is entered as the pathToDriver variable below.  
#
# ---

# ---
# 
# CREATE THE CONNECTION OBJECT AND RUN THE JOB
# 
# ---

# create connection details object
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = dbms, 
  user = user, 
  password = password, 
  server = server, 
  port = port, 
  pathToDriver = pathToDriver,
  extraSettings = extraSettings 
)


# conn <- connect(cd)
# disconnect(conn)

# (OPTIONAL): if writing to table and using Redshift, bulk loading can be initialized
# Sys.setenv (
#   "AWS_ACCESS_KEY_ID" = "",
#   "AWS_SECRET_ACCESS_KEY" = "",
#   "AWS_DEFAULT_REGION" = "",
#   "AWS_BUCKET_NAME" = "",
#   "AWS_OBJECT_KEY" = "",
#   "AWS_SSE_TYPE" = "AES256",
#   "USE_MPP_BULK_LOAD" = TRUE
# )

# ---
#
# run the job
#
# ---

DataQualityDashboard::executeDqChecks (
  connectionDetails = connectionDetails, 
  cdmDatabaseSchema = cdmDatabaseSchema, 
  resultsDatabaseSchema = resultsDatabaseSchema,
  cdmSourceName = cdmSourceName, 
  numThreads = numThreads,
  sqlOnly = sqlOnly, 
  outputFolder = outputFolder, 
  outputFile = outputFile,
  verboseMode = verboseMode,
  writeToTable = writeToTable,
  checkLevels = checkLevels,
  tablesToExclude = tablesToExclude,
  checkNames = checkNames,
  cdmVersion = "5.4"
)

# (OPTIONAL) inspect logs
# ParallelLogger::launchLogViewer(
#   logFileName = file.path(outputFolder, cdmSourceName, 
#   sprintf("log_DqDashboard_%s.txt", cdmSourceName))
# )

# (OPTIONAL) if you want to write the JSON file to the results table separately
# jsonFilePath <- ""
# DataQualityDashboard::writeJsonResultsToTable(
#   connectionDetails = connectionDetails, 
#   resultsDatabaseSchema = resultsDatabaseSchema, 
#   jsonFilePath = jsonFilePath
# )

print("TRYING TO CONNECT USING connect(cd)")
conn <- connect(cd)
disconnect(conn)

Solution

  • This are the connection settings that needed to be used to get this to work.

    extraSettings <- ";databaseName=synthea_etl;integratedSecurity=false;encrypt=false;trustServerCertificate=true;sslProtocol=TLSv1"
    

    This is the full script.

    # run the following once
    devtools::install_github("OHDSI/ETL-Synthea", INSTALL_opts = "--no-multiarch")
    
    library(ETLSyntheaBuilder)
    
    
    # database connectivity
    dbms <- "sql server"
    user <- "synthea_etl" 
    password <- "Sneaker01" 
    server <- "localhost" 
    port <- "1433"
    pathToDriver <- "D:\\NACHC\\SYNTHEA\\DQD\\resources\\jar\\sqlserver-jar" 
    extraSettings <- ";databaseName=synthea_etl;integratedSecurity=false;encrypt=false;trustServerCertificate=true;sslProtocol=TLSv1"
    
    cd <- DatabaseConnector::createConnectionDetails(
      dbms = dbms, 
      user = user, 
      password = password, 
      server = server, 
      port = port, 
      pathToDriver = pathToDriver,
      extraSettings = extraSettings 
    )
    
    # test the connection
    conn <- connect(cd)
    disconnect(conn)
    
    cdmSchema      <- "synthea_etl.dbo"
    cdmVersion     <- "5.4"
    syntheaVersion <- "2.7.0"
    syntheaSchema  <- "synthea_etl.dbo"
    syntheaFileLoc <- "C:\\Users\\gresh\\Downloads\\synthea-etl\\synthea_sample_data_csv_apr2020\\csv"
    vocabFileLoc   <- "C:\\fhir-to-omop\\terminology\\vocabulary_download_v5_{8c94604f-71b9-47ea-aef2-04be2a7d52b1}_1646823567698"
    
    # ETLSyntheaBuilder::CreateCDMTables(connectionDetails = cd, cdmSchema = cdmSchema, cdmVersion = cdmVersion)
    
    ETLSyntheaBuilder::CreateSyntheaTables(connectionDetails = cd, syntheaSchema = syntheaSchema, syntheaVersion = syntheaVersion)
    
    ETLSyntheaBuilder::LoadSyntheaTables(connectionDetails = cd, syntheaSchema = syntheaSchema, syntheaFileLoc = syntheaFileLoc)
    
    # ETLSyntheaBuilder::LoadVocabFromCsv(connectionDetails = cd, cdmSchema = cdmSchema, vocabFileLoc = vocabFileLoc)
    
    conn <- connect(cd)
    dbExecute(conn, paste("delete from ", cdmSchema, ".provider", sep=""))
    dbExecute(conn, paste("delete from ", cdmSchema, ".person", sep=""))
    disconnect(conn)
    
    ETLSyntheaBuilder::LoadEventTables(connectionDetails = cd, cdmSchema = cdmSchema, syntheaSchema = syntheaSchema, cdmVersion = cdmVersion, syntheaVersion = syntheaVersion)