Skip to content

Commit

Permalink
Merge branch 'next' for next cran-release
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhard-da committed Jan 26, 2018
2 parents f35cc1c + 6c376eb commit 9286ce6
Show file tree
Hide file tree
Showing 20 changed files with 700 additions and 2,275 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Package: sdcMicro
Type: Package
Title: Statistical Disclosure Control Methods for Anonymization of Microdata and
Risk Estimation
Version: 5.0.6
Date: 2017-12-10
Version: 5.1.0
Date: 2018-26-01
Author: Matthias Templ, Alexander Kowarik, Bernhard Meindl
Maintainer: Matthias Templ <[email protected]>
Description: Data from statistical agencies and other institutions are mostly
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export(groupAndRename)
export(importProblem)
export(indivRisk)
export(kAnon)
export(kAnon_violations)
export(ldiversity)
export(localSupp)
export(localSuppression)
Expand Down Expand Up @@ -69,6 +70,7 @@ export(varToNumeric)
export(writeSafeFile)
exportClasses(sdcMicroObj)
exportMethods("strataVar<-")
exportMethods(kAnon_violations)
exportMethods(print)
exportMethods(show)
import(MASS)
Expand Down
12 changes: 12 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# 5.0.5
- bugfix in sdcApp() when using R-objects as data input
- bugfix in sdcApp() when button to perform kAnon() was "lost"
- bugfix in sdcApp() when >= 10 keyvars were used in localSuppression
- bugfix in sdcApp(): sort table of risky observations correctly
- support shiny server for the GUI
- new method kAnon_violations() returning the number of records violating k-anonymity in the sample or the population
- fixes and improvements in parametrisation and error-handling in riskyCells()
- minor fixes in sdc_guidelines vignette including a comment, that the guidelines have not yet been revised for sdcMicro version >= 5.0.0
- pass (...) in writeSafeFile(..., format="csv")
- fixes and improvements in localSuppression()

# 5.0.4
- new default theme "IHSN" for sdcApp()
- fixing an issue in report() where disclosure risk for original data was wrongly displayed if alpha-parameter was set
Expand Down
3 changes: 1 addition & 2 deletions R/GUIfunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -670,8 +670,7 @@ writeSafeFile <- function(obj, format, randomizeRecords, fileOut, ...) {
write_dta(data=dat, path=fileOut)
}
if (format=="csv") {
inp <- list(...)
write.table(dat, file=fileOut, col.names=as.logical(inp$col.names), sep=inp$sep, dec=inp$dec)
write.table(dat, file=fileOut, ...)
}
return(invisible(NULL))
}
19 changes: 16 additions & 3 deletions R/freqCalc.r
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
#' data.frame(fka=f3a$fk, fkb=f3b$fk, fkc=f3c$fk)
#' data.frame(Fka=f3a$Fk, Fkb=f3b$Fk, Fkc=f3c$Fk)
freqCalc <- function(x, keyVars, w=NULL, alpha=1) {
addAllNAtoKeys <- FALSE

. <- Fk <- fk <- id <- keyid <- na_ids <- sortidforfreqcalc <- sortvar <- weight <- NULL
xxxxx_tmpweight_xxxxx <- NULL
if (alpha <0 | alpha>1) {
Expand Down Expand Up @@ -141,8 +143,8 @@ freqCalc <- function(x, keyVars, w=NULL, alpha=1) {
n1=sum(new$fk==1, na.rm=TRUE), n2=sum(new$fk==2, na.rm=TRUE), alpha=alpha)
class(z) <- "freqCalc"
return(z)
}
}

nr_kv <- length(keyVars_n)
naind <- dat_with_na[,lapply(.SD, function(x) {
!is.na(x)
Expand Down Expand Up @@ -206,7 +208,7 @@ freqCalc <- function(x, keyVars, w=NULL, alpha=1) {
ids_complete <- dat_without_na[as.list(dat_with_na[i, cur_sortVars, with=F]),id]
if (is.na(ids_complete)[1]) {
ids_complete <- NULL
}
}
}

# update dataset containing NA's
Expand All @@ -218,6 +220,17 @@ freqCalc <- function(x, keyVars, w=NULL, alpha=1) {
add_fks_nona[ids_complete] <- add_fks_nona[ids_complete] + fks_na[i]*alpha
add_Fks_nona[ids_complete] <- add_Fks_nona[ids_complete] + Fks_na[i]*alpha
}

## TODO: CHECK IF THIS IS NEEDED
# in case we have NA-only keys, we add their (weighted) fks/Fks to all keys with no missing values
if (addAllNAtoKeys) {
ii <- which(rowSums(is.na(dat_with_na[,keyVars_n, with=FALSE]))==nr_kv)
if (length(ii)==1) {
add_fks_nona <- add_fks_nona + dat_with_na[ii,fk]*alpha
add_Fks_nona <- add_Fks_nona + dat_with_na[ii,Fk]*alpha
}
}

setkeyv(dat_without_na, keyVars_n)
dat_without_na[,fk:=fk+add_fks_nona]
dat_without_na[,Fk:=Fk+add_Fks_nona]
Expand Down
164 changes: 94 additions & 70 deletions R/localSuppression.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@
#' Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R.
#' \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4.
#' \doi{10.1007/978-3-319-50272-4}
#'
#' Templ, M. and Kowarik, A. and Meindl, B.
#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro.
#'
#' Templ, M. and Kowarik, A. and Meindl, B.
#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro.
#' \emph{Journal of Statistical Software}, \strong{67} (4), 1--36, 2015. \doi{10.18637/jss.v067.i04}
#'
#' @keywords manip
Expand Down Expand Up @@ -255,30 +255,28 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
for (i in seq_along(ind.problem)) {
params <- list(alpha=alpha, id=as.integer(ind.problem[i]))
res <- cpp_calcSuppInds(mat, mat[ind.problem[i],], params=params)
if (res$fk < k) {
ind <- res$ids
if (length(ind) > 0) {
colInd <- NULL
colIndsSorted <- keyVars[order(importanceI)]
while (is.null(colInd)) {
for (cc in colIndsSorted) {
# special case where we have to suppress values in the problematic instance itself because no other candidates are available
if (length(ind)==1 && ind==ind.problem[i]) {
z <- which(!is.na(mat[ind, cc]))
} else {
z <- which(mat[ind.problem[i],cc]!=mat[ind,cc] & !is.na(mat[ind,cc]))
}
if (length(z) > 0) {
colInd <- cc
break;
}
ind <- res$ids
if (length(ind) > 0) {
colInd <- NULL
colIndsSorted <- keyVars[order(importanceI)]
while (is.null(colInd)) {
for (cc in colIndsSorted) {
# special case where we have to suppress values in the problematic instance itself because no other candidates are available
if (length(ind)==1 && ind==ind.problem[i]) {
z <- which(!is.na(mat[ind, cc]))
} else {
z <- which(mat[ind.problem[i],cc]!=mat[ind,cc] & !is.na(mat[ind,cc]))
}
if (length(z) > 0) {
colInd <- cc
break;
}
}
x[[colInd]][ind.problem[i]] <- NA
mat[ind.problem[i], colInd] <- NA # required for cpp_calcSuppInds()
} else {
stop("Error\n")
}
x[[colInd]][ind.problem[i]] <- NA
mat[ind.problem[i], colInd] <- NA # required for cpp_calcSuppInds()
} else {
stop("Error\n")
}
}
ff <- freqCalc(x, keyVars=keyVars, alpha=alpha)
Expand All @@ -302,34 +300,34 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
x <- rbind(x,xKeys)
setkey(x,"idvarextraforsls")
x[,idvarextraforsls:=NULL]
x <- as.data.frame(x)
if(any(NABefore)) {
x[NABefore] <- NA
}
} else {
setkey(x,"idvarextraforsls")
x[,idvarextraforsls:=NULL]
}
## preparing the output:
supps <- as.data.table(t(apply(x, 2, function(x) {
sum(is.na(x))
}))) - NAinKey

totalSupps <- sum(supps)
out <- list(xAnon=x, supps=supps, totalSupps=totalSupps)
out <- list(xAnon=x)#, supps=supps, totalSupps=totalSupps)
return(out)
}

# compute number of missings
sum_na <- function(x) {
sum(is.na(x))
}

strata <- NULL
if (!"data.table" %in% class(x)) {
x <- as.data.table(x)
}
if (is.numeric(keyVars)) {
keyVarsNum <- keyVars
keyVars <- names(x)[keyVars]
} else {
keyVarsNum <- match(keyVars, colnames(x))
}
if (is.numeric(strataVars)) {
strataVarsNum <- strataVars
strataVars <- names(x)[strataVars]
} else {
strataVarsNum <- match(strataVars, colnames(x))
}

# checks and preparations if we apply localSuppression on
Expand Down Expand Up @@ -367,17 +365,24 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
# calculate number of suppressions for each keyVar
# before trying to achieve k-anonymity
NABefore <- is.na(x)
NAinKey <- x[,lapply(.SD, function(x) sum(is.na(x))), .SDcols=keyVars, by=strataVars]
totalNABefore <- sum(x[,lapply(.SD, function(x) sum(is.na(x))), .SDcols=keyVars])
if (is.null(strataVars)) {
NAinKey <- x[,lapply(.SD, sum_na), .SDcols=keyVars, by=strataVars]
} else {
x[,strata:=apply(x[,strataVars,with=F],1,paste,collapse="-")]
NAinKey <- x[,lapply(.SD, sum_na), .SDcols=keyVars, by="strata"]
NAinKey_tot <- x[,lapply(.SD, sum_na), .SDcols=keyVars]
NAinKey_tot[,strata:="Total"]
NAinKey_tot <- NAinKey_tot[,c("strata", keyVars), with=F]
NAinKey <- rbind(NAinKey, NAinKey_tot)
x[,strata:=NULL]
}

# performing the k-Anon algorithm
# no stratification required
if (is.null(strataVars)) {
if (is.null(combs)) {
inpDat <- x[,keyVars,with=F]
res <- suppSubset(x=inpDat, k=k, importance=importance, alpha=alpha)
supps <- res$supps
totalSupps <- res$totalSupps
xAnon <- res$xAnon
} else {
# no strata but subsets of key variables (combs)
Expand All @@ -389,6 +394,7 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
#log <- paste0(log, "of ",combs[gr]," key variables.\n")
#cat(log)
for (comb in 1:ncol(tree[[gr]])) {
#cat("combination",comb,"|",ncol(tree[[gr]]),"\n")
counter <- counter+1
kV <- tree[[gr]][,comb]
cur_importance <- rank(importance[kV], ties.method="min")
Expand All @@ -402,10 +408,6 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
}
# prepare output
xAnon <- tmpDat
supps <- as.data.frame(t(apply(tmpDat, 2, function(x) {
sum(is.na(x))
})))
totalSupps <- sum(supps)
}
} else {
## we want k-anonymity in each strata!
Expand All @@ -425,9 +427,7 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
# todo: using parallel/mclapply?
for (i in seq_along(spl)) {
res <- suppSubset(spl[[i]][,keyVars, with=F], k=k, importance=importance, alpha=alpha)
supps[[i]] <- res$supps
xAnon[[i]] <- res$xAnon
totalSupps[i] <- res$totalSupps
}
} else {
# local Suppression by strata and combination of subsets!
Expand All @@ -452,37 +452,45 @@ localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=
}
}
# prepare output
tmpDat[,strata:=NULL]
tmpDat[,sortid:=NULL]
xAnon[[i]] <- tmpDat
supps[[i]] <- as.data.frame(t(apply(tmpDat, 2, function(x) {
sum(is.na(x))
})))
totalSupps[i] <- res$totalSupps
}
}
supps <- as.data.frame(rbindlist(supps))
supps <- rbind(supps, colSums(supps))
rownames(supps) <- c(names(spl),"Total")
xAnon <- rbindlist(xAnon)
xAnon[,sortid:=sortid]
setkey(xAnon, sortid)
xAnon[,sortid:=NULL]
totalSupps <- sum(supps[nrow(supps),])
}

# totalSupps after kAnon
if (!is.null(strataVars)) {
ss <- NAinKey[,lapply(.SD, function(x) sum(x)), .SDcols=setdiff(names(NAinKey), strataVars)]
NAinKey <- rbind(NAinKey,ss,fill=TRUE)
NAinKey[,paste(strataVars):=NULL]
NAinKey <- as.data.frame(NAinKey)
rownames(NAinKey) <- rownames(NAinKey)
## compute number of suppressions
if (is.null(strataVars)) {
totalSupps <- xAnon[,lapply(.SD, sum_na), .SDcols=keyVars]
supps <- totalSupps - NAinKey
totalSupps <- as.data.frame(totalSupps)
supps <- as.data.frame(supps)
} else {
if (is.null(combs)) {
xAnon[,strata:=inpDat[,strata]]
}
totalSupps <- xAnon[,lapply(.SD, sum_na), .SDcols=keyVars, by="strata"]
totalSupps_tot <- xAnon[,lapply(.SD, sum_na), .SDcols=keyVars]
totalSupps_tot[,strata:="Total"]
totalSupps_tot <- totalSupps_tot[,c("strata", keyVars), with=F]
totalSupps <- rbind(totalSupps, totalSupps_tot)
xAnon[,strata:=NULL]

supps <- copy(totalSupps)
supps[, c(keyVars):=totalSupps[,keyVars, with=FALSE] - NAinKey[,keyVars, with=FALSE]]

supps <- as.data.frame(supps)
rownames(supps) <- supps$strata; supps$strata <- NULL
totalSupps <- as.data.frame(totalSupps)
rownames(totalSupps) <- totalSupps$strata; totalSupps$strata <- NULL
}
total_supps <- as.data.frame(as.data.frame(supps)+as.data.frame(NAinKey))
totalSupps <- tail(rowSums(total_supps),1)

newSupps <- tail(rowSums(supps), 1)
res <- list(xAnon=as.data.frame(xAnon), supps=supps,
totalSupps=total_supps, newSupps=totalSupps-totalNABefore, anonymity=TRUE, keyVars=keyVars,
totalSupps=totalSupps, newSupps=newSupps, anonymity=TRUE, keyVars=keyVars,
strataVars=strataVars, importance=importance, k=k, threshold=NA, combs=combs)
class(res) <- "localSuppression"
invisible(res)
Expand Down Expand Up @@ -539,15 +547,31 @@ print.localSuppression <- function(x, ...) {
print(dt)

if (byStrata==TRUE) {
if (all(x$anonymity)) {
pp <- paste0("\n", x$k, "-anonymity == TRUE in all strata!\n")
if (is.null(x$combs)) {
if (all(x$anonymity)) {
pp <- paste0("\n", x$k, "-anonymity == TRUE in all strata!\n")
} else {
prob <- rownames(x$supps)[which(!x$anonymity)]
pp <- paste0("\n", x$k, "-anonymity == FALSE in the following strata:\n")
pp <- paste0(pp, paste0(rownames(x$supps)[which(!x$anonymity)], collapse=", "))
}
} else {
prob <- rownames(x$supps)[which(!x$anonymity)]
pp <- paste0("\n", x$k, "-anonymity == FALSE in the following strata:\n")
pp <- paste0(pp, paste0(rownames(x$supps)[which(!x$anonymity)], collapse=", "))
pp <- "\nk-anonymity has been achieved within strata and for combinations of key-variables!"
for (i in 1:length(x$combs)) {
pp <- paste(pp, paste0("\n", x$k[i], "-anonymity == ", all(x$anonymity)," for all ",x$combs[i],"-dimensional subsets of key variables within stratas"))
}
pp <- paste0(pp,"\n")
}
} else {
pp <- paste0("\n", x$k, "-anonymity == ", all(x$anonymity),"\n")
if (!is.null(x$combs)) {
pp <- NULL
for (i in 1:length(x$combs)) {
pp <- paste(pp, paste0("\n", x$k[i], "-anonymity == ", all(x$anonymity)," for all ",x$combs[i],"-dimensional subsets of key variables"))
}
pp <- paste(pp, "\n")
} else {
pp <- paste0("\n", x$k, "-anonymity == ", all(x$anonymity),"\n")
}
}
pp <- paste0(pp, "-----------------------\n")
cat(pp)
Expand Down
Loading

0 comments on commit 9286ce6

Please sign in to comment.