Calibration of quantiles and a joint calibration of totals and quanitles for surveys

Setup

library(jointCalib)
library(survey)
#> Loading required package: grid
#> Loading required package: Matrix
#> Loading required package: survival
#> 
#> Attaching package: 'survey'
#> The following object is masked from 'package:graphics':
#> 
#>     dotchart
library(laeken)
library(ebal)
#> ##
#> ## ebal Package: Implements Entropy Balancing.
#> ## See http://www.stanford.edu/~jhain/ for additional information.

Examples

Basic example

Setup

Generate small dataset

set.seed(123)
N <- 100
x1 <- rbinom(n = 100, prob = 0.7, size = 1)
x2 <- rlnorm(n = 100)*1000
pop<-data.frame(x1, x2)
sample_df <- pop[sample(1:N, 20), ]
sample_df$d <- as.integer(N/nrow(sample_df))
colSums(sample_df[, c("x1", "x2")]*sample_df$d)
#>       x1       x2 
#>     70.0 181454.5
with(sample_df, weightedMedian(x2, d))
#> [1] 1005.781

Calibrate using: + totals: x1 and x2, + median for: x2.

res <- joint_calib(formula_totals = ~ x1 + x2,
                   formula_quantiles = ~ x2,
                   data = sample_df,
                   dweights = as.numeric(sample_df$d),
                   N = 100,
                   pop_totals = c(x1=sum(x1), x2=sum(x2)),
                   pop_quantiles = list(x2=quantile(x2,0.5)),
                   method = "linear",
                   control = control_calib(interpolation = "linear"))

for_example <- res$Xs[, c(3,4,2)] |> as.data.frame()
for_example$w <- sample_df$d * res$g
for_example
#>    x1        x2           V3        w
#> 90  1  236.0072 0.0100000000 5.550384
#> 58  0  188.6349 0.0100000000 5.805215
#> 81  1 4239.9474 0.0000000000 3.892356
#> 29  1 1198.7789 0.0000000000 5.319617
#> 26  0 2788.6884 0.0000000000 4.806049
#> 27  1  752.1850 0.0100000000 5.308135
#> 85  1  128.3176 0.0100000000 5.600924
#> 7   1  212.5129 0.0100000000 5.561410
#> 60  1 2506.7739 0.0000000000 4.705757
#> 96  1  588.0716 0.0100000000 5.385155
#> 41  1 2700.6807 0.0000000000 4.614754
#> 84  0  655.4083 0.0100000000 5.586152
#> 6   1 4556.1164 0.0000000000 3.743974
#> 31  0 1005.7808 0.0008372057 5.624283
#> 17  1 1565.5071 0.0000000000 5.147506
#> 64  1  945.9534 0.0000000000 5.438271
#> 37  0 2994.6849 0.0000000000 4.709372
#> 57  1  456.1633 0.0100000000 5.447062
#> 20  0 7768.5590 0.0000000000 2.468930
#> 35  1  802.1284 0.0100000000 5.284696

Median of x2 after calibration.

weightedQuantile(sample_df$x2, res$g, 0.5)
#> [1] 945.9534

Example 1 – census case

Setup

Based on Haziza, D., and Lesage, É. (2016). A discussion of weighting procedures for unit nonresponse. Journal of Official Statistics, 32(1), 129-145.

set.seed(20230817)
N <- 1000
x <- runif(N, 0, 80)
#y <- 1000+10*x+rnorm(N, 0, 300)
#y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300)
#y <- rbinom(N, 1, prob = 1/(exp(-0.5*(x-55))+1))
y <- 1300-(x-40)^2 + rnorm(N, 0, 300)
#p <- rbinom(N, 1, prob = 0.2+0.6*(1 + exp(-5 + x/8))^-1)
p <- rbinom(N, 1, prob = 0.07+0.45*(x/40-1)^2+0.0025*x)
#p <- rbinom(N, 1, prob = (1.2+0.024*x)^-1)
#p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x))
probs <- seq(0.1, 0.9, 0.1)
quants_known <- list(x=quantile(x, probs))
totals_known <- c(x=sum(x))
df <- data.frame(x, y, p)
df_resp <- df[df$p == 1, ]
df_resp$d <- N/nrow(df_resp)
y_quant_true <- quantile(y, probs)
head(df_resp)
#>           x        y p        d
#> 6  12.35687 444.9053 1 3.134796
#> 7  61.90319 403.9473 1 3.134796
#> 13 60.96079 923.4975 1 3.134796
#> 14 76.85300 124.4110 1 3.134796
#> 18 71.52828 422.0934 1 3.134796
#> 19 65.32808 740.4801 1 3.134796

Using jointCalib package

example 1a: calibrate only quantiles (deciles)

result1 <- joint_calib(formula_quantiles = ~x,
                      data=df_resp,
                      dweights=df_resp$d,
                      N = N,
                      pop_quantiles = quants_known,
                      method = "linear",
                      backend = "sampling")
result1
#> Weights calibrated using: linear with sampling backend.
#> Summary statistics for g-weights:
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4562  0.6773  0.8180  1.0000  1.4500  2.4538 
#> Totals and precision (abs diff: 4.589559e-10)
#>        totals    precision
#> N       1e+03 4.572485e-10
#> x 0.10  1e-01 5.050127e-14
#> x 0.20  2e-01 1.062483e-13
#> x 0.30  3e-01 1.333933e-13
#> x 0.40  4e-01 1.593170e-13
#> x 0.50  5e-01 1.778577e-13
#> x 0.60  6e-01 2.007283e-13
#> x 0.70  7e-01 2.322587e-13
#> x 0.80  8e-01 2.901013e-13
#> x 0.90  9e-01 3.570477e-13
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result1$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274

example 1b: calibrate only quantiles (deciles)

result2 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "linear",
                       backend = "sampling")
summary(result2$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.3563  0.6208  0.8199  1.0000  1.4368  2.5384
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274

We can restrict weights to specific range using logit.

result3 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "logit",
                       backend = "sampling", 
                       maxit = 500,
                       bounds = c(0, 3))

summary(result3$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.3911  0.6254  0.8140  1.0000  1.4325  2.5186
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result3$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 22.872078
#> 40% 31.641911 31.297922
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.529683
#> 90% 71.565441 71.567274

Empirical likelihood method can be applied using the following code

result4a <- joint_calib(formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        method = "el",
                        backend = "base")
summary(result4a$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4563  0.6773  0.8180  1.0000  1.4500  2.4538

result4b <- joint_calib(formula_totals = ~ x,
                        formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        pop_totals = totals_known,
                        method = "el",
                        backend = "base")

summary(result4b$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4414  0.6584  0.8109  1.0000  1.4256  2.8513

result4c <- calib_el(X = result4b$Xs[, c(1, 11)],
                     d = df_resp$d, 
                     totals = c(N,totals_known),
                     maxit = 50,
                     tol = 1e-8)

summary(result4c)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.7438  0.7910  0.8848  1.0000  1.2455  1.4923
data.frame(true = quants_known$x, 
           est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs),
           est_el0 = weightedQuantile(df_resp$x, result4c*df_resp$d, probs),
           est_el1 = weightedQuantile(df_resp$x, result4a$g*df_resp$d, probs),
           est_el2 = weightedQuantile(df_resp$x, result4b$g*df_resp$d, probs))
#>          true       est   est_el0   est_el1   est_el2
#> 10%  7.078067  7.085003  3.640246  7.085003  7.085003
#> 20% 14.831221 14.824424  8.024948 14.824424 14.824424
#> 30% 23.146180 23.287657 14.499018 23.287657 23.287657
#> 40% 31.641911 31.802986 24.010501 31.802986 31.802986
#> 50% 39.033812 39.154276 39.154276 38.892342 39.154276
#> 60% 47.527168 48.252065 54.685438 48.252065 48.252065
#> 70% 54.984229 55.311953 63.084405 54.954054 54.954054
#> 80% 64.073167 64.062629 70.050593 64.062629 64.062629
#> 90% 71.565441 71.567274 75.663078 71.567274 71.567274

Entropy balancing method can be applied using the following code

result5 <- joint_calib(formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        method = "eb")
summary(result5$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4576  0.6775  0.8180  1.0003  1.4500  2.4538

Finally, compare all method with true Y distribution where

  • est_y1 – weights calibrated to quantiles only,
  • est_y2 – weights calibrated to quantiles and totals,
  • est_y3 – weights calibrated to quantiles and totals using logit distance and range limitations,
  • est_y4 – weights calibrated to means only using el.
  • est_y5 – weights calibrated to quantiles only using el.
  • est_y6 – weights calibrated to quantiles and totals using el.
  • est_y7 – weights calibrated to quantiles and totals using eb.
results_y <- data.frame(true_y = y_quant_true,
                        est_y1 = weightedQuantile(df_resp$y, result1$g * df_resp$d, probs),
                        est_y2 = weightedQuantile(df_resp$y, result2$g * df_resp$d, probs),
                        est_y3 = weightedQuantile(df_resp$y, result3$g * df_resp$d, probs),
                        est_y4 = weightedQuantile(df_resp$y, result4c * df_resp$d, probs),
                        est_y5 = weightedQuantile(df_resp$y, result4a$g * df_resp$d, probs),
                        est_y6 = weightedQuantile(df_resp$y, result4b$g * df_resp$d, probs),
                        est_y7 = weightedQuantile(df_resp$y, result5$g * df_resp$d, probs))

results_y
#>         true_y     est_y1     est_y2     est_y3     est_y4     est_y5
#> 10%  -56.97982  -36.18473  -36.18473  -36.18473 -149.00996  -36.18473
#> 20%  210.58301  228.75429  228.75429  228.75429   17.27107  228.75429
#> 30%  444.81652  413.22517  417.60155  413.22517  203.58302  413.22517
#> 40%  646.06099  622.82192  622.82192  622.82192  381.99560  622.82192
#> 50%  803.50842  789.89936  789.89936  789.89936  503.35849  789.89936
#> 60%  975.31803  925.84730  925.84730  925.84730  679.04530  925.84730
#> 70% 1123.44643 1023.75230 1023.75230 1023.75230  811.84389 1023.75230
#> 80% 1245.62645 1162.06504 1162.06504 1162.06504 1009.46703 1162.06504
#> 90% 1449.23819 1471.33301 1471.33301 1471.33301 1242.60357 1471.33301
#>         est_y6     est_y7
#> 10%  -36.18473  -36.18473
#> 20%  228.75429  228.75429
#> 30%  411.47088  413.22517
#> 40%  622.82192  622.82192
#> 50%  789.89936  789.89936
#> 60%  925.84730  925.84730
#> 70% 1023.75230 1023.75230
#> 80% 1162.06504 1162.06504
#> 90% 1471.33301 1471.33301

Here is the sum of squares and calibration of totals and means seems to be the best.

apply(results_y[, -1], 2, FUN = function(x) sum((x-results_y[,1])^2))
#>    est_y1    est_y2    est_y3    est_y4    est_y5    est_y6    est_y7 
#>  22342.87  22085.51  22342.87 547195.99  22342.87  22456.79  22342.87

Using survey package

example 1a: calibrate only quantiles (deciles)

A <- joint_calib_create_matrix(X_q = model.matrix(~x+0, df_resp), N = N, pop_quantiles = quants_known)
colnames(A) <- paste0("quant_", gsub("\\D", "", names(quants_known$x)))
A <- as.data.frame(A)
df_resp <- cbind(df_resp, A)
head(df_resp)
#>           x        y p        d quant_10 quant_20 quant_30 quant_40 quant_50
#> 6  12.35687 444.9053 1 3.134796        0    0.001    0.001    0.001    0.001
#> 7  61.90319 403.9473 1 3.134796        0    0.000    0.000    0.000    0.000
#> 13 60.96079 923.4975 1 3.134796        0    0.000    0.000    0.000    0.000
#> 14 76.85300 124.4110 1 3.134796        0    0.000    0.000    0.000    0.000
#> 18 71.52828 422.0934 1 3.134796        0    0.000    0.000    0.000    0.000
#> 19 65.32808 740.4801 1 3.134796        0    0.000    0.000    0.000    0.000
#>    quant_60 quant_70 quant_80 quant_90
#> 6     0.001    0.001    0.001    0.001
#> 7     0.000    0.000    0.001    0.001
#> 13    0.000    0.000    0.001    0.001
#> 14    0.000    0.000    0.000    0.000
#> 18    0.000    0.000    0.000    0.001
#> 19    0.000    0.000    0.000    0.001
m1 <- svydesign(ids = ~1, data = df_resp, weights = ~d)
quants_formula <- as.formula(paste("~", paste(colnames(A), collapse = "+")))
quants_formula
#> ~quant_10 + quant_20 + quant_30 + quant_40 + quant_50 + quant_60 + 
#>     quant_70 + quant_80 + quant_90
svytotal(quants_formula, m1)
#>            total     SE
#> quant_10 0.10972 0.0175
#> quant_20 0.23197 0.0237
#> quant_30 0.29154 0.0255
#> quant_40 0.34796 0.0267
#> quant_50 0.38871 0.0273
#> quant_60 0.43887 0.0278
#> quant_70 0.50784 0.0280
#> quant_80 0.63323 0.0270
#> quant_90 0.78100 0.0232

Calibration using calibrate

pop_totals <- c(N, probs)
names(pop_totals) <- c("(Intercept)", colnames(A))
m1_cal <- calibrate(m1, quants_formula, pop_totals)
svytotal(quants_formula, m1_cal)
#>          total SE
#> quant_10   0.1  0
#> quant_20   0.2  0
#> quant_30   0.3  0
#> quant_40   0.4  0
#> quant_50   0.5  0
#> quant_60   0.6  0
#> quant_70   0.7  0
#> quant_80   0.8  0
#> quant_90   0.9  0

Calibration using grake (low level)

g1 <- grake(mm = as.matrix(cbind(1, A)),
            ww = df_resp$d,
            calfun = cal.linear,
            population = pop_totals,
            bounds = list(lower = -Inf, upper = Inf),
            epsilon = 1e-7,
            verbose = FALSE,
            maxit = 50,
            variance = NULL)
summary(as.numeric(g1))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4562  0.6773  0.8180  1.0000  1.4500  2.4538