Title: | A Certe R Package for Connecting to Databases |
---|---|
Description: | A Certe R Package for connecting to internal Certe cBases and local DuckDB databases. This package is part of the 'certedata' universe. |
Authors: | Matthijs S. Berends [aut, cre], Erwin E. A. Hassing [aut], Certe Medical Diagnostics & Advice Foundation [cph, fnd] |
Maintainer: | Matthijs S. Berends <[email protected]> |
License: | GPL-2 |
Version: | 1.12.8 |
Built: | 2024-11-22 10:50:20 UTC |
Source: | https://github.com/certe-medical-epidemiology/certedb |
Connect to an internal (yet remote) Certe database server using DBI::dbConnect()
, or any other database.
db_connect(driver, ..., print = TRUE) db_close(conn, ..., print = TRUE)
db_connect(driver, ..., print = TRUE) db_close(conn, ..., print = TRUE)
driver |
database driver to use, such as |
... |
arguments passed on to |
print |
a logical to indicate whether info about the connection should be printed |
conn |
connection to close, such as the output of |
# create a local duckdb database db <- db_connect(duckdb::duckdb(), "~/my_duck.db") db db |> db_write_table("my_iris_table", values = iris) db |> db_list_tables() db |> db_has_table("my_iris_table") if (require(dplyr, warn.conflicts = FALSE)) { db |> tbl("my_iris_table") |> filter(Species == "setosa", Sepal.Width > 3) |> collect() |> as_tibble() } db |> db_drop_table("my_iris_table") db |> db_list_tables() db |> db_close() # remove the database unlink("~/my_duck.db")
# create a local duckdb database db <- db_connect(duckdb::duckdb(), "~/my_duck.db") db db |> db_write_table("my_iris_table", values = iris) db |> db_list_tables() db |> db_has_table("my_iris_table") if (require(dplyr, warn.conflicts = FALSE)) { db |> tbl("my_iris_table") |> filter(Species == "setosa", Sepal.Width > 3) |> collect() |> as_tibble() } db |> db_drop_table("my_iris_table") db |> db_list_tables() db |> db_close() # remove the database unlink("~/my_duck.db")
Thes functions can be used to download local or remote database data, e.g. Spectre data from DiveLine on a Diver server (from Dimensional Insight). The get_diver_data()
function sets up an ODBC connection (using db_connect()
), which requires their quite limited DI-ODBC driver.
get_diver_data( date_range = this_year(), where = NULL, post_where = NULL, review_qry = interactive(), antibiogram_type = "sir", distinct = TRUE, auto_transform = TRUE, snake_case = TRUE, preset = read_secret("db.preset_default"), add_cols = NULL, date_column = NULL, diver_cbase = NULL, diver_project = read_secret("db.diver_project"), diver_dsn = if (diver_testserver == FALSE) read_secret("db.diver_dsn") else read_secret("db.diver_dsn_test"), diver_testserver = FALSE, diver_tablename = "data", info = interactive(), limit = Inf, only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = FALSE, only_requested = FALSE, in_background = FALSE, ... ) certedb_query(query, where = NULL, auto_transform = TRUE, info = interactive()) get_duckdb_data( date_range = this_year(), where = NULL, preset = NULL, review_qry = interactive(), duckdb_path = read_secret("db.duckdb_path"), duckdb_table = read_secret("db.duckdb_table"), auto_transform = TRUE, info = interactive() ) certedb_getmmb( dates = NULL, where = NULL, select_preset = "mmb", preset = "mmb", select = NULL, add_cols = NULL, info = interactive(), first_isolates = FALSE, eucast_rules = "all", keep_synonyms = getOption("AMR_keep_synonyms", FALSE), mic = FALSE, disk = FALSE, zipcodes = FALSE, ziplength = 4, tat_hours = FALSE, only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = FALSE, only_show_query = FALSE, review_where = interactive(), auto_transform = TRUE, query = NULL, ... ) certedb_getmmb_tat( dates = NULL, where = NULL, add_cols = NULL, info = interactive(), only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = TRUE, only_show_query = FALSE, ... ) get_glims10_data( date_range = this_year(), where = NULL, review_qry = interactive(), antibiogram_type = "sir", distinct = TRUE, auto_transform = TRUE, diver_tablename = "glims10", info = interactive(), limit = Inf, ... )
get_diver_data( date_range = this_year(), where = NULL, post_where = NULL, review_qry = interactive(), antibiogram_type = "sir", distinct = TRUE, auto_transform = TRUE, snake_case = TRUE, preset = read_secret("db.preset_default"), add_cols = NULL, date_column = NULL, diver_cbase = NULL, diver_project = read_secret("db.diver_project"), diver_dsn = if (diver_testserver == FALSE) read_secret("db.diver_dsn") else read_secret("db.diver_dsn_test"), diver_testserver = FALSE, diver_tablename = "data", info = interactive(), limit = Inf, only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = FALSE, only_requested = FALSE, in_background = FALSE, ... ) certedb_query(query, where = NULL, auto_transform = TRUE, info = interactive()) get_duckdb_data( date_range = this_year(), where = NULL, preset = NULL, review_qry = interactive(), duckdb_path = read_secret("db.duckdb_path"), duckdb_table = read_secret("db.duckdb_table"), auto_transform = TRUE, info = interactive() ) certedb_getmmb( dates = NULL, where = NULL, select_preset = "mmb", preset = "mmb", select = NULL, add_cols = NULL, info = interactive(), first_isolates = FALSE, eucast_rules = "all", keep_synonyms = getOption("AMR_keep_synonyms", FALSE), mic = FALSE, disk = FALSE, zipcodes = FALSE, ziplength = 4, tat_hours = FALSE, only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = FALSE, only_show_query = FALSE, review_where = interactive(), auto_transform = TRUE, query = NULL, ... ) certedb_getmmb_tat( dates = NULL, where = NULL, add_cols = NULL, info = interactive(), only_real_patients = TRUE, only_conducted_tests = TRUE, only_validated = TRUE, only_show_query = FALSE, ... ) get_glims10_data( date_range = this_year(), where = NULL, review_qry = interactive(), antibiogram_type = "sir", distinct = TRUE, auto_transform = TRUE, diver_tablename = "glims10", info = interactive(), limit = Inf, ... )
date_range |
date range, can be length 1 or 2 (or more to use the min/max) to filter on the column specified in the YAML file, see presets. Defaults to |
where , post_where
|
arguments to filter data on, will be passed on to |
review_qry |
a logical to indicate whether the query must be reviewed first, defaults to |
antibiogram_type |
antibiotic transformation mode. Leave blank to strip antibiotic results from the data, |
distinct |
logical to apply |
auto_transform |
logical to apply |
snake_case |
logical to convert column names to snake case, only when |
preset |
a preset to choose from |
add_cols |
extra column to include in the selection. Can be named to set a new column name. Use gl or di to quickly pick from a list. |
date_column |
column name of data set to query. Normally this should be set in a preset, but this argument can be used to override that. |
diver_cbase , diver_project , diver_dsn , diver_testserver
|
properties to set in |
diver_tablename |
name of the database table to download data from. This is hard-coded by DI and should normally never be changed. |
info |
settings for old |
limit |
maximum number of rows to return. |
only_real_patients |
logical to include only real patients, i.e., remove test and Q&A samples |
only_conducted_tests |
logical to include only tests that were not stopped |
only_validated |
settings for old |
only_requested |
logical to include only requested tests |
in_background |
run data collection in the background using |
... |
not used anymore, previously settings for old |
query |
a data.frame to view the query of, or a character string to run as query in |
duckdb_path |
path to the local DuckDB database |
duckdb_table |
name of the DuckDB database to retrieve data from |
dates |
date range, can be length 1 or 2 (or more to use the min/max) to filter on the column |
select_preset |
settings for old |
select |
settings for old |
first_isolates |
settings for old |
eucast_rules |
settings for old |
keep_synonyms |
a logical to indicate if old, previously valid taxonomic names must be preserved and not be corrected to currently accepted names. The default is |
mic |
settings for old |
disk |
settings for old |
zipcodes |
settings for old |
ziplength |
settings for old |
tat_hours |
settings for old |
only_show_query |
settings for old |
review_where |
a logical to indicate whether the query must be reviewed first, defaults to |
These functions return a 'certedb tibble' from Diver or the certemmb
MySQL database, which prints information in the tibble header about the used source and current user.
Use certedb_query()
to retrieve the original query that was used to download the data.
Using certedb_query("your qry here")
is identical to using certedb_getmmb(query = "your qry here")
.
## Not run: # peek-preview of a cBase: get_diver_data(diver_cbase = "models/MedEpi/GLIMS10_Resistenties.cbase", date_range = NULL, limit = 10, review_qry = FALSE, auto_transform = FALSE) # these two work identical: get_diver_data(date_range = 2024, where = BepalingCode == "PXNCOV") get_diver_data(2024, BepalingCode == "PXNCOV") # use gl$ to pull a list with column names while you type get_diver_data(2024, gl$BepalingCode == "PXNCOV") # gl$ can also be used for adding columns, which can also be named get_diver_data(2024, gl$BepalingCode == "PXNCOV", add_cols = gl$AfnameJaar) get_diver_data(2024, gl$BepalingCode == "PXNCOV", add_cols = c(jaar = gl$AfnameJaar)) # for the `where`, use `&` or `|`: get_diver_data(last_month(), gl$BepalingCode == "PXNCOV" & gl$Zorglijn == "2e lijn") get_diver_data(c(2020:2024), where = gl$BepalingCode == "PXNCOV" | gl$Zorglijn == "2e lijn") # use %like%, %unlike%, %like_case% or %unlike_case% for regular expressions get_diver_data(2024, where = gl$MateriaalNaam %like% "Bloed") get_diver_data(2024, where = gl$MateriaalNaam %unlike% "Bloed") get_diver_data(2024, where = gl$MateriaalNaam %like_case% "bloed") get_diver_data(2024, where = gl$MateriaalNaam %unlike_case% "Bloed") get_diver_data(2024, where = gl$BepalingNaam %like% "Noro" & gl$PatientLeeftijd >= 75) # R objects will be converted materialen <- c("A", "B", "C") get_diver_data(2024, where = gl$MateriaalNaam %in% materialen) leeftijden <- 65:85 get_diver_data(2024, where = gl$PatientLeeftijd %in% leeftijden) # USING DIVER INTEGRATOR LANGUAGE -------------------------------------- # See the website for an overview of allowed functions: # https://www.dimins.com/online-help/workbench_help/Content/ODBC/di-odbc-sql-reference.html # Use Diver Integrator functions within EVAL(): get_diver_data(2024, where = EVAL('regexp(value("MateriaalCode"),"^B")')) get_diver_data( 2024, where = EVAL('rolling(12, value("OntvangstDatum"), date("2024/11/27"))') ) ## End(Not run) # USING DUCKDB DATABASE ------------------------------------------------ # create a local duckdb database and write a table db <- db_connect(duckdb::duckdb(), "~/my_duck.db") db |> db_write_table("iris", values = iris) db |> db_close() df <- get_duckdb_data(date_range = NULL, where = Species == "setosa" & Sepal.Width > 3, # not needed in production environment: duckdb_path = "~/my_duck.db", duckdb_table = "iris", review_qry = FALSE, info = TRUE) df certedb_query(df) # remove the database unlink("~/my_duck.db")
## Not run: # peek-preview of a cBase: get_diver_data(diver_cbase = "models/MedEpi/GLIMS10_Resistenties.cbase", date_range = NULL, limit = 10, review_qry = FALSE, auto_transform = FALSE) # these two work identical: get_diver_data(date_range = 2024, where = BepalingCode == "PXNCOV") get_diver_data(2024, BepalingCode == "PXNCOV") # use gl$ to pull a list with column names while you type get_diver_data(2024, gl$BepalingCode == "PXNCOV") # gl$ can also be used for adding columns, which can also be named get_diver_data(2024, gl$BepalingCode == "PXNCOV", add_cols = gl$AfnameJaar) get_diver_data(2024, gl$BepalingCode == "PXNCOV", add_cols = c(jaar = gl$AfnameJaar)) # for the `where`, use `&` or `|`: get_diver_data(last_month(), gl$BepalingCode == "PXNCOV" & gl$Zorglijn == "2e lijn") get_diver_data(c(2020:2024), where = gl$BepalingCode == "PXNCOV" | gl$Zorglijn == "2e lijn") # use %like%, %unlike%, %like_case% or %unlike_case% for regular expressions get_diver_data(2024, where = gl$MateriaalNaam %like% "Bloed") get_diver_data(2024, where = gl$MateriaalNaam %unlike% "Bloed") get_diver_data(2024, where = gl$MateriaalNaam %like_case% "bloed") get_diver_data(2024, where = gl$MateriaalNaam %unlike_case% "Bloed") get_diver_data(2024, where = gl$BepalingNaam %like% "Noro" & gl$PatientLeeftijd >= 75) # R objects will be converted materialen <- c("A", "B", "C") get_diver_data(2024, where = gl$MateriaalNaam %in% materialen) leeftijden <- 65:85 get_diver_data(2024, where = gl$PatientLeeftijd %in% leeftijden) # USING DIVER INTEGRATOR LANGUAGE -------------------------------------- # See the website for an overview of allowed functions: # https://www.dimins.com/online-help/workbench_help/Content/ODBC/di-odbc-sql-reference.html # Use Diver Integrator functions within EVAL(): get_diver_data(2024, where = EVAL('regexp(value("MateriaalCode"),"^B")')) get_diver_data( 2024, where = EVAL('rolling(12, value("OntvangstDatum"), date("2024/11/27"))') ) ## End(Not run) # USING DUCKDB DATABASE ------------------------------------------------ # create a local duckdb database and write a table db <- db_connect(duckdb::duckdb(), "~/my_duck.db") db |> db_write_table("iris", values = iris) db |> db_close() df <- get_duckdb_data(date_range = NULL, where = Species == "setosa" & Sepal.Width > 3, # not needed in production environment: duckdb_path = "~/my_duck.db", duckdb_table = "iris", review_qry = FALSE, info = TRUE) df certedb_query(df) # remove the database unlink("~/my_duck.db")
get_diver_data()
Work with presets for get_diver_data()
. This automates selecting, filtering, and joining cBases.
presets() get_preset(preset)
presets() get_preset(preset)
preset |
name of the preset |
The function presets()
returns a data.frame with available presets, as defined in the secrets YAML file under db.presets
.
The function get_preset()
will return all the details of a preset as a list.
This YAML information should be put into the YAML file that is read using read_secret()
. Afterwards, the name of the preset can be used as get_diver_data(preset = "...")
.
The most basic YAML form:
name_new_preset: cbase: "location/to/name.cbase" date_col: "ColumnNameDate"
The most extensive YAML form:
name_new_preset: cbase: "location/to/name.cbase" date_col: "ColumnNameDate" filter: ColumnName1 %in% c("Filter1", "Filter2") & ColumnName2 %in% c("Filter3", "Filter4") select: ColumnName1, ColumnName2, ColumnNameReceiptDate, new_name = OldName join: cbase: "location/to/another.cbase" by: ColumnName1, ColumnName2 type: "left" filter: ColumnName1 == "abc" select: ColumnName1, col_name_2 = ColumnName2, ColumnName3, everything(), !starts_with("abc") wide_names_from: ColumnName3 join2: cbase: "location/to/yet_another.cbase" by: ColumnName1, ColumnName2 type: "left" select: ColumnName1, ColumnName2, ColumnName3 wide_names_from: ColumnName3 wide_name_trans: gsub("_", "..", .x)
For all presets, cbase
and date_col
are required.
The YAML keys run in this order:
Download cBase and filter (applied in WHERE
statement)
Select
Join(s)
Post-processing
After this, the arguments in get_diver_data()
will run:
Post-WHERE if post_where
is set, using filter()
Distinct if distinct = TRUE
, using distinct()
Auto-transform if autotransform = TRUE
, using auto_transform()
cbase
)This cBase must be a filepath and must exist on the Diver server. For joins, this can also be another type of file, see Joins
select
)Input for select
will be passed on to select()
, meaning that column names can be used, but also tidyselect
functions such as everything()
.
filter
)Input for filter
will be passed on to filter()
.
join
)For joins, you must set at least cbase
, by
, and type
("left", "right", "inner", etc., see here).
Other files than a cBase in the field cbase
will be imported using certetoolbox::import()
, such as an Excel or CSV file. This can be any file on any local or remote location (even live internet files). For example:
join: cbase: "location/to/excel_file.xlsx" by: ColumnName1 type: "left" join: cbase: "https://github.com/certe-medical-epidemiology/certegis/blob/main/data/geo_gemeenten.rda" by: ColumnName1 type: "left"
An unlimited number of joins can be used, but all so-called 'keys' must be unique and start with join
, e.g. joinA
/ joinB
or join
/ join2
/ join3
.
If wide_names_from
is set, the dataset is first transformed to long format using the columns specified in by
, and then reshaped to wide format with values in wide_names_from
.
Use wide_name_trans
to transform the values in wide_names_from
before the reshaping to wide format is applied. Use .x
for the column values. As this will be applied before the data is transformed to a wide format, it allows to refine the values in wide_names_from
using e.g., case_when()
.
post-processing
)Any data transformation can be done after the data have been downloaded and processed according to all previous steps. Use x
to indicate the data set.
name_new_preset: cbase: "location/to/name.cbase" date_col: "ColumnNameDate" post-processing: | x |> mutate(Column1 = case_when(Column2 = "A" ~ 1, Column3 = "B" ~ 2, TRUE ~ 3))
As shown, in YAML the |
character can be used to start a multi-line statement.
The post-processing
field can also be an R file path, which will be sourced:
name_new_preset: cbase: "location/to/name.cbase" date_col: "ColumnNameDate" post-processing: "location/to/file.R" name_new_preset: cbase: "location/to/name.cbase" date_col: "ColumnNameDate" post-processing1: "location/to/file 1.R" post-processing2: "location/to/file 2.R"
Note that all post-processing steps will run directly after the querying the data and thus before auto-transformation if autotransform = TRUE
in get_diver_data()
.
Use this Shiny app to search a cBase. There is also an RStudio add-in.
shiny_explore( preset = read_secret("db.preset_default_shiny"), diver_cbase = NULL, diver_project = read_secret("db.diver_project"), diver_dsn = if (diver_testserver == FALSE) read_secret("db.diver_dsn") else read_secret("db.diver_dsn_test"), diver_testserver = FALSE, diver_tablename = "data" )
shiny_explore( preset = read_secret("db.preset_default_shiny"), diver_cbase = NULL, diver_project = read_secret("db.diver_project"), diver_dsn = if (diver_testserver == FALSE) read_secret("db.diver_dsn") else read_secret("db.diver_dsn_test"), diver_testserver = FALSE, diver_tablename = "data" )
preset |
a preset to choose from |
diver_cbase , diver_project , diver_dsn , diver_testserver
|
properties to set in |
diver_tablename |
name of the database table to download data from. This is hard-coded by DI and should normally never be changed. |