--- title: "Calibration of quantiles and a joint calibration of totals and quanitles for surveys" author: "Maciej Beręsewicz" output: html_vignette: df_print: kable toc: true number_sections: true vignette: > %\VignetteIndexEntry{Calibration of quantiles and a joint calibration of totals and quanitles for surveys} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: chunk_output_type: console bibliography: references.bib link-citations: true --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` # Setup ```{r setup} library(jointCalib) library(survey) library(laeken) library(ebal) ``` # Examples ## Basic example ### Setup Generate small dataset ```{r} set.seed(123) N <- 100 x1 <- rbinom(n = 100, prob = 0.7, size = 1) x2 <- rlnorm(n = 100)*1000 pop<-data.frame(x1, x2) sample_df <- pop[sample(1:N, 20), ] sample_df$d <- as.integer(N/nrow(sample_df)) colSums(sample_df[, c("x1", "x2")]*sample_df$d) with(sample_df, weightedMedian(x2, d)) ``` Calibrate using: + totals: `x1` and `x2`, + median for: `x2`. ```{r} res <- joint_calib(formula_totals = ~ x1 + x2, formula_quantiles = ~ x2, data = sample_df, dweights = as.numeric(sample_df$d), N = 100, pop_totals = c(x1=sum(x1), x2=sum(x2)), pop_quantiles = list(x2=quantile(x2,0.5)), method = "linear", control = control_calib(interpolation = "linear")) for_example <- res$Xs[, c(3,4,2)] |> as.data.frame() for_example$w <- sample_df$d * res$g for_example ``` Median of `x2` after calibration. ```{r} weightedQuantile(sample_df$x2, res$g, 0.5) ``` ## Example 1 -- census case ### Setup Based on Haziza, D., and Lesage, É. (2016). A discussion of weighting procedures for unit nonresponse. Journal of Official Statistics, 32(1), 129-145. ```{r example1-generate-data} set.seed(20230817) N <- 1000 x <- runif(N, 0, 80) #y <- 1000+10*x+rnorm(N, 0, 300) #y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300) #y <- rbinom(N, 1, prob = 1/(exp(-0.5*(x-55))+1)) y <- 1300-(x-40)^2 + rnorm(N, 0, 300) #p <- rbinom(N, 1, prob = 0.2+0.6*(1 + exp(-5 + x/8))^-1) p <- rbinom(N, 1, prob = 0.07+0.45*(x/40-1)^2+0.0025*x) #p <- rbinom(N, 1, prob = (1.2+0.024*x)^-1) #p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x)) probs <- seq(0.1, 0.9, 0.1) quants_known <- list(x=quantile(x, probs)) totals_known <- c(x=sum(x)) df <- data.frame(x, y, p) df_resp <- df[df$p == 1, ] df_resp$d <- N/nrow(df_resp) y_quant_true <- quantile(y, probs) head(df_resp) ``` ### Using `jointCalib` package example 1a: calibrate only quantiles (deciles) ```{r example1-quantiles} result1 <- joint_calib(formula_quantiles = ~x, data=df_resp, dweights=df_resp$d, N = N, pop_quantiles = quants_known, method = "linear", backend = "sampling") result1 ``` ```{r} data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result1$g*df_resp$d, probs)) ``` example 1b: calibrate only quantiles (deciles) ```{r example1-joint} result2 <- joint_calib(formula_totals = ~x, formula_quantiles = ~x, data = df_resp, dweights = df_resp$d, N = N, pop_quantiles = quants_known, pop_totals = totals_known, method = "linear", backend = "sampling") summary(result2$g) ``` ```{r} data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs)) ``` We can restrict weights to specific range using `logit`. ```{r example1-joint-logit} result3 <- joint_calib(formula_totals = ~x, formula_quantiles = ~x, data = df_resp, dweights = df_resp$d, N = N, pop_quantiles = quants_known, pop_totals = totals_known, method = "logit", backend = "sampling", maxit = 500, bounds = c(0, 3)) summary(result3$g) ``` ```{r} data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result3$g*df_resp$d, probs)) ``` Empirical likelihood method can be applied using the following code ```{r example1-joint-el} result4a <- joint_calib(formula_quantiles = ~ x, data = df_resp, dweights = df_resp$d, N = N, pop_quantiles = quants_known, method = "el", backend = "base") summary(result4a$g) result4b <- joint_calib(formula_totals = ~ x, formula_quantiles = ~ x, data = df_resp, dweights = df_resp$d, N = N, pop_quantiles = quants_known, pop_totals = totals_known, method = "el", backend = "base") summary(result4b$g) result4c <- calib_el(X = result4b$Xs[, c(1, 11)], d = df_resp$d, totals = c(N,totals_known), maxit = 50, tol = 1e-8) summary(result4c) ``` ```{r} data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs), est_el0 = weightedQuantile(df_resp$x, result4c*df_resp$d, probs), est_el1 = weightedQuantile(df_resp$x, result4a$g*df_resp$d, probs), est_el2 = weightedQuantile(df_resp$x, result4b$g*df_resp$d, probs)) ``` Entropy balancing method can be applied using the following code ```{r example1-joint-eb} result5 <- joint_calib(formula_quantiles = ~ x, data = df_resp, dweights = df_resp$d, N = N, pop_quantiles = quants_known, method = "eb") summary(result5$g) ``` Finally, compare all method with true Y distribution where - `est_y1` -- weights calibrated to quantiles only, - `est_y2` -- weights calibrated to quantiles and totals, - `est_y3` -- weights calibrated to quantiles and totals using `logit` distance and range limitations, - `est_y4` -- weights calibrated to means only using `el`. - `est_y5` -- weights calibrated to quantiles only using `el`. - `est_y6` -- weights calibrated to quantiles and totals using `el`. - `est_y7` -- weights calibrated to quantiles and totals using `eb`. ```{r results-combined} results_y <- data.frame(true_y = y_quant_true, est_y1 = weightedQuantile(df_resp$y, result1$g * df_resp$d, probs), est_y2 = weightedQuantile(df_resp$y, result2$g * df_resp$d, probs), est_y3 = weightedQuantile(df_resp$y, result3$g * df_resp$d, probs), est_y4 = weightedQuantile(df_resp$y, result4c * df_resp$d, probs), est_y5 = weightedQuantile(df_resp$y, result4a$g * df_resp$d, probs), est_y6 = weightedQuantile(df_resp$y, result4b$g * df_resp$d, probs), est_y7 = weightedQuantile(df_resp$y, result5$g * df_resp$d, probs)) results_y ``` Here is the sum of squares and calibration of totals and means seems to be the best. ```{r} apply(results_y[, -1], 2, FUN = function(x) sum((x-results_y[,1])^2)) ``` ### Using `survey` package example 1a: calibrate only quantiles (deciles) ```{r example1-survey} A <- joint_calib_create_matrix(X_q = model.matrix(~x+0, df_resp), N = N, pop_quantiles = quants_known) colnames(A) <- paste0("quant_", gsub("\\D", "", names(quants_known$x))) A <- as.data.frame(A) df_resp <- cbind(df_resp, A) head(df_resp) m1 <- svydesign(ids = ~1, data = df_resp, weights = ~d) quants_formula <- as.formula(paste("~", paste(colnames(A), collapse = "+"))) quants_formula svytotal(quants_formula, m1) ``` Calibration using `calibrate` ```{r} pop_totals <- c(N, probs) names(pop_totals) <- c("(Intercept)", colnames(A)) m1_cal <- calibrate(m1, quants_formula, pop_totals) svytotal(quants_formula, m1_cal) ``` Calibration using `grake` (low level) ```{r} g1 <- grake(mm = as.matrix(cbind(1, A)), ww = df_resp$d, calfun = cal.linear, population = pop_totals, bounds = list(lower = -Inf, upper = Inf), epsilon = 1e-7, verbose = FALSE, maxit = 50, variance = NULL) summary(as.numeric(g1)) ```