Commit 28c04ee5 authored by alexkowa's avatar alexkowa
Browse files

Initial commit sdcMicroGUI

parent 487a8997
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>sdcMicroGUI</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
</buildSpec>
<natures>
</natures>
</projectDescription>
Package: sdcMicroGUI
Type: Package
Title: Graphical user interface for package sdcMicro
Version: 1.0.3
Date: 2013-08-30
Author: Alexander Kowarik, Matthias Templ, Bernhard Meindl, Francois Fonteneau
Maintainer: Matthias Templ <matthias.templ@gmail.com>
Description: A point and click graphical user interface based on top of the sdcMicro package to create anonymized data set. The graphical user interface provides full reproducibility of any result via the script menu in the GUI.
Depends: sdcMicro (>= 4.0.4), gWidgetsRGtk2, tcltk, cairoDevice, vcd, foreign, Hmisc
License: GPL-2
export(plot.indivRisk,sdcGUI,sdcGUIoutput)
S3method(plot, indivRisk)
\ No newline at end of file
# 1.0.3
- Seed in Script
\ No newline at end of file
sdcGUIenv <- new.env()
## utility functions
# envionment with get and set functions
# not used, cause it ignores new env ... to use, remove ...x
#sdcGUIenvx <- function() {
# pos <- match("sdcGUIenv", search())
# if(is.na(pos)) {
# sdcGUIenv <- list()
# attach(sdcGUIenv, pos=length(search())-1)
# rm(sdcGUIenv)
# pos <- match("sdcGUIenv", search())
# }
# return(pos.to.env(pos))
#}
sdcGUIoutput <- function(){
if(existd("sdcObject")){
sdc <- ActiveSdcObject()
return(extractManipData(sdc))
}else
stop("There is no object from the sdcGUI to retrieve.")
}
putd <- function(x, value) {
assign(x, value, envir=sdcGUIenv) # add () to sdcGUIenv
}
rmd <- function(x) {
rm(list=x, envir=sdcGUIenv) # rm () from sdcGUIenv
}
getd <- function(x, mode="any") {
get(x, envir=sdcGUIenv, mode=mode, inherits=FALSE) # add () to sdcGUIenv
}
existd <- function(x, mode="any") {
exists(x, envir=sdcGUIenv, mode=mode, inherits=FALSE) # add () to sdcGUIenv
}
listd <- function(x){
ls(envir=sdcGUIenv)
}
ActiveDataSet <- function(name) {
if( missing(name) ) {
getd("activeDataSet")
} else {
if( is.matrix(get(name)) ) {
putd("activeDataSet", data.frame(get(name), stringsAsFactors=FALSE))
} else {
putd("activeDataSet", get(name))
}
putd("dataSetName", name)
}
}
ActiveSdcObject <- function(name) {
if( missing(name) ) {
getd("sdcObject")
} else {
if( class(name)=="sdcMicroObj" ) {
putd("sdcObject", name)
} else {
stop("Input is not an object of class 'sdcMicroObj'")
}
}
}
ActiveSdcVars <- function(name="keyVars"){
get.sdcMicroObj(getd("sdcObject"),name)
}
ActiveSdcVarsStr <- function(name="keyVars"){
sdcObject <- getd("sdcObject")
colnames(sdcObject@origData)[get.sdcMicroObj(sdcObject,name)]
}
parseVar <- function(x, ...) {
if(length(x)==0)return("NULL")
s <- "c("
for ( i in 1:length(x) ) {
s <- paste(s, x[i])
if (i < length(x)) {
s <- paste(s, ",")
}
}
s <- paste(s, ")")
return(s)
}
parseVarStr <- function(x, ...) {
if(length(x)==0)return("NULL")
s <- "c("
for ( i in 1:length(x) ) {
s <- paste(s, "'", x[i], "'", sep="")
if (i < length(x)) {
s <- paste(s, ",", sep="")
}
}
s <- paste(s, ")", sep="")
return(s)
}
# getIndex to get the col index of categorical, numerical and weight vars
getIndex <- function(x, ...) {
ads <- names(ActiveDataSet())
ord <- c()
for( i in 1:length(x) ) {
for( j in 1:length(ads) ) {
if( x[i]==ads[j] ) {
ord <- c(ord, j)
}
}
}
return(ord)
}
printFrequencies <- function(obj){
cat("\n --------------------------\n")
cat(paste(sum(obj@risk$individual[,2]<2), "obs. violate 2-anonymity \n"))
cat(paste(sum(obj@risk$individual[,2]<3), "obs. violate 3-anonymity \n"))
cat(" --------------------------\n")
}
printFrequenciesComp <- function(obj){
# cat("\n --------------------------\n")
cat("Number of observations violating\n")
cat("\n - 2-anonymity: ")
cat(paste(sum(obj@risk$individual[,2]<2),
"(orig: ",sum(obj@originalRisk$individual[,2]<2),")\n"))
cat(" - 3-anonymity: ")
cat(paste(sum(obj@risk$individual[,2]<3),
"(orig: ",sum(obj@originalRisk$individual[,2]<3),")"))
# cat(paste(sum(obj@originalRisk$individual[,2]<2), "obs. violate 2-anonymity \n"))
# cat(paste(sum(obj@originalRisk$individual[,2]<3), "obs. violate 3-anonymity \n"))
cat("\n--------------------------\n")
n <- nrow(obj@origData)
cat("\nPercentage of observations violating\n")
cat(" - 2-anonymity: ")
cat(paste(round(sum(obj@risk$individual[,2]<2)/n*100,2),"% ",
"(orig: ",round(sum(obj@originalRisk$individual[,2]<2)/n*100,2),"%",")\n"))
cat(" - 3-anonymity: ")
cat(paste(round(sum(obj@risk$individual[,2]<3)/n*100,2),"% ",
"(orig: ",round(sum(obj@originalRisk$individual[,2]<3)/n*100,2),"%",")"))
}
printMeasure_risk <- function(obj){
risk <- obj@risk
cat("\n")
cat("--------------------------\n")
s <- sum((risk$individual[,1] > median(risk$individual[,1])+2*mad(risk$individual[,1])) & (risk$indiviual[,1] > 0.1))
cat(paste(s,"obs. with higher risk than the main part\n"))
cat("Expected no. of re-identifications:\n",round(risk$global$risk_ER,2),"")
cat("(",round(risk$global$risk_pct,2),"%)\n")
if(is.na(risk$global$threshold))
risk$global$threshold <- Inf
#cat("Threshold:",round(risk$global$threshold,2),"\n (for maximal global risk",round(risk$global$max_risk,2),")\n")
cat("--------------------------\n")
if("hier_risk_ER"%in%names(risk$global)){
if(!is.na(risk$global$hier_risk_ER)){
cat("--------------------------\n")
cat("Hierarchical risk \n")
cat("--------------------------\n")
cat("Expected no. of re-identifications:\n",round(risk$global$hier_risk_ER,2),"")
cat("(",round(risk$global$hier_risk_pct,2),"% )\n")
}else{
cat("--------------------------\n")
cat("Hierarchical risk not available\n")
cat("--------------------------\n")
}
}
}
printRecode <- function(obj){
cat("Reported is the")
cat("\n")
cat(" number | mean size and | size of smallest category")
cat("\n")
k <- length(obj@keyVars)
tab <- tab2 <- ssize <- ssize2 <- msize <- msize2 <- numeric(k)
names(tab) <- colnames(obj@origData[,obj@keyVars])
cat("\n")
for(i in 1:k){
tab2[i] <- length(unique(obj@origData[,obj@keyVars[i]]))
tab[i] <- length(unique(obj@manipKeyVars[,i]))
t2 <- table(obj@origData[,obj@keyVars[i]])
t1 <- table(obj@manipKeyVars[,i])
msize[i] <- round(mean(t1),0)
msize2[i] <- round(mean(t2),0)
ssize[i] <- min(t1)
ssize2[i] <- min(t2)
}
nc <- sapply(names(tab), nchar)
maxnam <- max(nc)
for(i in 1:k){
# cat(names(tab)[i],":",tab[i]," (orig:", tab2[i],"), ms:", msize[i], "(orig:",msize2[i],") \n")
nam <- names(tab)[i]
cat("-------------\n")
cat(nam, paste(rep(".",2+maxnam-nchar(nam)), collapse=""),tab[i],"|",msize[i],"|",ssize[i],
"\n (orig:", tab2[i],"|",msize2[i],"|",ssize2[i],") \n")
}
}
printMeasure_riskComp <- function(obj){
# cat("NOW:")
risk <- obj@risk
originalRisk <- obj@originalRisk
cat("\n")
cat("--------------------------\n")
s <- sum((risk$individual[,1] > median(risk$individual[,1])+2*mad(risk$individual[,1])) & (risk$indiviual[,1] > 0.1))
sorig <- sum((originalRisk$individual[,1] > median(originalRisk$individual[,1])+2*mad(originalRisk$individual[,1])) & (originalRisk$indiviual[,1] > 0.1))
cat(paste(s," (orig:", sorig, ")","obs. with higher risk than the main part\n"))
cat("Expected no. of re-identifications:\n",round(risk$global$risk_ER,2),"")
cat("[",round(risk$global$risk_pct,2),"%] (orig:", round(originalRisk$global$risk_ER,2),
"[",round(originalRisk$global$risk_pct,2),"%])\n")
# if(is.na(risk$global$threshold))
# risk$global$threshold <- Inf
#cat("Threshold:",round(risk$global$threshold,2),"\n (for maximal global risk",round(risk$global$max_risk,2),")\n")
cat("--------------------------\n")
if("hier_risk_ER"%in%names(risk$global)){
if(!is.na(risk$global$hier_risk_ER)){
cat("--------------------------\n")
cat("Hierarchical risk \n")
cat("--------------------------\n")
cat("Expected no. of re-identifications:\n",
round(risk$global$hier_risk_ER,2),"")
cat("[",round(risk$global$hier_risk_pct,2),"%] (orig:",
round(originalRisk$global$hier_risk_ER,2),
"[",round(originalRisk$global$hier_risk_pct,2),"%])\n")
}else{
cat("--------------------------\n")
cat("Hierarchical risk not available\n")
cat("--------------------------\n")
}
}
# cat("ORIGINAL:")
# risk <- obj@originalRisk
# cat("\n")
# cat("--------------------------\n")
# s <- sum((risk$individual[,1] > median(risk$individual[,1])+2*mad(risk$individual[,1])) & (risk$indiviual[,1] > 0.1))
# cat(paste(s,"obs. with higher risk than the main part\n"))
# cat("Expected no. of re-identifications:\n",round(risk$global$risk_ER,2),"")
# cat("(",round(risk$global$risk_pct,2),"%)\n")
# if(is.na(risk$global$threshold))
# risk$global$threshold <- Inf
# #cat("Threshold:",round(risk$global$threshold,2),"\n (for maximal global risk",round(risk$global$max_risk,2),")\n")
# cat("--------------------------\n")
# if("hier_risk_ER"%in%names(risk$global)){
# if(!is.na(risk$global$hier_risk_ER)){
# cat("--------------------------\n")
# cat("Hierarchical risk \n")
# cat("--------------------------\n")
# cat("Expected no. of re-identifications:\n",round(risk$global$hier_risk_ER,2),"")
# cat("(",round(risk$global$hier_risk_pct,2),"% )\n")
# }else{
# cat("--------------------------\n")
# cat("Hierarchical risk not available\n")
# cat("--------------------------\n")
# }
# }
}
printLocalSuppression <- function(obj){
keyVars <- colnames(obj@manipKeyVars)
maxnam <- max(sapply(keyVars, nchar))
if(is.null(obj@localSuppression))
lsup <- list(rep(0,length(keyVars)))
else
lsup <- obj@localSuppression
for(i in 1:length(keyVars)){
nam <- keyVars[i]
n <- nrow(obj@origData)
cat("\n")
cat(keyVars[i],paste(rep(".",2+maxnam-nchar(nam)), collapse=""),lsup[[1]][i])
cat(" [", round(100*lsup[[1]][i]/n,3), "%]")
}
}
updates2 <- function(restart=FALSE){
options(timeout=5)
xt <- try(download.file(url="http://cran.r-project.org/",destfile=tempfile(),quiet=TRUE))
INET <- TRUE
if(class(xt)=="try-error"){
INET <- FALSE
}
if(INET){
oldP <- old.packages()
if(!is.null(oldP)){
oldP <- oldP[oldP[,1]%in%c("sdcMicro","sdcMicroGUI"),,drop=FALSE]
}else{
oldP <- data.frame()
}
if(nrow(oldP)!=0){
text <- paste("Updates found for the following packages: ",paste(oldP[,1],collapse="\n ",sep=""),"\n Click OK for updating (GUI will be restarted).",sep="")
}else{
text <- "No updates available."
}
if(text=="No updates available."&&!restart){
return(0)
}
ns_do <- gconfirm(text, title="Package Updates",icon="warning")
if( ns_do &&substr(text,1,1)!="N") {
loaded <- oldP[oldP[,1]%in%loadedNamespaces(),1]
for(ll in loaded){
if(length(which(search()==paste("package:",ll,sep="")))>0)
detach(pos=which(search()==paste("package:",ll,sep="")),unload=TRUE,force=TRUE)
}
update.packages(oldPkgs=oldP,ask=FALSE)
for(ll in loaded)
require(ll,character.only=TRUE)
if(restart)
sdcGUI()
}
}else
gmessage("It is not possible to check for possible updates at the moment.", title="No internet connection",icon="warning")
}
plot.indivRisk <- function (x, ...)
{
## x ... object from class indivRisk
## y ... object from class freqCalc
if (!exists("slider.env"))
#slider.env <<- new.env()
slider.env <- new.env()
#library(tcltk)
method = "histogram"
mu <- 0.0025
assign("mu", tclVar(mu), envir = slider.env)
sd <- 0.5
assign("sd", tclVar(sd), envir = slider.env)
s2 <- 0.5
assign("s2", tclVar(s2), envir = slider.env)
#xmin <- 1
#assign("xmin", tclVar(xmin), envir = slider.env)
#xmax <- 5
#assign("xmax", tclVar(xmax), envir = slider.env)
#ymin <- 0
#assign("ymin", tclVar(ymin), envir = slider.env)
#ymax <- round(dnorm(0, 0, 0.5), 2)
#assign("ymax", tclVar(ymax), envir = slider.env)
mu.old <- mu
sd.old <- sd
s2.old <- s2
maxsd <- 1/length(x$rk) * (sum(x$fk * x$rk)) *100
n1 <- x$knames[1] ## next, the plot of column names of keys
if( length(x$knames) > 1 ){
for(i in 2:length(x$knames)){
n1 <- paste(n1, "x", x$knames[i])
}
}
p1 <- function(method){
if( method == "histogram" ){
hist(x$rk, main=n1,freq=TRUE, xlab="individual risk", col="yellow")
abline(v=mu, col="blue", lwd=2)
}
if( method == "ecdf" ){
plot(ecdf(x$rk), main="ecdf of individual risk", xlab="individual risk")
abline(v=as.numeric(evalq(tclvalue(mu), envir = slider.env)), col="blue", lwd=2)
}
}
norm.refresh <- function(...) {
method = method
mu <- as.numeric(evalq(tclvalue(mu), envir = slider.env))
sd <- as.numeric(evalq(tclvalue(sd), envir = slider.env))
s2 <- as.numeric(evalq(tclvalue(s2), envir = slider.env))
if (mu != mu.old) {
s2 <- round(length(which(x$rk > mu)))
sd <- 1/length(x$rk) * (sum(x$fk[x$rk < mu] * x$rk[x$rk < mu]) + mu*sum(x$fk[x$rk>mu])) * 100
try(eval(parse(text = paste("tclvalue(s2)<-", s2,
sep = "")), envir = slider.env))
try(eval(parse(text = paste("tclvalue(sd)<-", sd,
sep = "")), envir = slider.env))
sd.old <<- sd
s2.old <<- s2
#print(sd)
#print(paste("s2:", s2))
}
if (sd != sd.old) {
sd <- as.numeric(evalq(tclvalue(s2), envir = slider.env))
#mu <- sort(x$rk)[
s2 <- length(which(x$rk > mu))
try(eval(parse(text = paste("tclvalue(s2)<-", s2,
sep = "")), envir = slider.env))
try(eval(parse(text = paste("tclvalue(sd)<-", sd,
sep = "")), envir = slider.env))
sd.old <<- sd
s2.old <<- s2
}
if (s2 != s2.old) {
s2 <- as.numeric(evalq(tclvalue(s2), envir = slider.env))
sd <- 1/length(x$rk) * (sum(x$fk * x$rk) + 0.02*sum(x$fk))
try(eval(parse(text = paste("tclvalue(sd)<-", sd,
sep = "")), envir = slider.env))
sd.old <<- sd
s2.old <<- length(which(x$rk > mu))
}
#xmin <- as.numeric(evalq(tclvalue(xmin), envir = slider.env))
#xmax <- as.numeric(evalq(tclvalue(xmax), envir = slider.env))
#ymin <- as.numeric(evalq(tclvalue(ymin), envir = slider.env))
#ymax <- as.numeric(evalq(tclvalue(ymax), envir = slider.env))
p1 <- function(method){
if( method == "histogram" ){
hist(x$rk, main=n1,freq=TRUE, xlab="individual risk", col="yellow")
abline(v=mu, col="blue", lwd=2)
}
if( method == "ecdf" ){
plot(ecdf(x$rk), main="ecdf of individual risk", xlab="individual risk")
abline(v=mu, col="blue", lwd=2)
}
}
p1(method=method)
}
m <- tktoplevel()
tkwm.title(m, "Individual risk adjustments")
fontHeading <- tkfont.create(family="tahoma",size=10)
tkpack(tklabel(m, text="Please, see at the plot the active graphik device in R", font=fontHeading))
tkwm.geometry(m, "+0+0")
tkpack(fr <- tkframe(m), side = "top")
tkpack(tklabel(fr, text = "Individual risk threshold =", width = "35"), side = "left")
tkpack(sc <- tkscale(fr, command = norm.refresh, from = 0,
to = max(x$rk), orient = "horiz", resolution = 0.001, showvalue = TRUE),
side = "left")
assign("sc", sc, envir = slider.env)
evalq(tkconfigure(sc, variable = mu), envir = slider.env)
tkpack(fr <- tkframe(m), side = "top")
tkpack(tklabel(fr, text = "Re-identification rate =", width = "35"),
side = "left")
tkpack(sc <- tkscale(fr, command = norm.refresh, from = 0,
to = maxsd, orient = "horiz", resolution = 0.01, showvalue = TRUE),
side = "left")
assign("sc", sc, envir = slider.env)
evalq(tkconfigure(sc, variable = sd), envir = slider.env)
tkpack(fr <- tkframe(m), side = "top")
tkpack(tklabel(fr, text = "Unsafe recods =", width = "35"), side = "left")
tkpack(sc <- tkscale(fr, command = norm.refresh, from = 0,
to = length(x$rk), orient = "horiz", resolution = 1, showvalue = TRUE),
side = "left")
assign("sc", sc, envir = slider.env)
evalq(tkconfigure(sc, variable = s2), envir = slider.env)
tkpack(fr <- tkframe(m), side = "top")
#tkpack(tklabel(fr, text = "Xmin = ", width = 6), side = "left")
#tkpack(e <- tkentry(fr, width = 8), side = "left")
#assign("e", e, envir = slider.env)
#evalq(tkconfigure(e, textvariable = xmin), envir = slider.env)
#tkpack(tklabel(fr, text = "Xmax =", width = 6), side = "left")
#tkpack(e <- tkentry(fr, width = 8), side = "left")
#assign("e", e, envir = slider.env)
#evalq(tkconfigure(e, textvariable = xmax), envir = slider.env)
#tkpack(fr <- tkframe(m), side = "top")
#tkpack(tklabel(fr, text = "Ymin = ", width = 6), side = "left")
#tkpack(e <- tkentry(fr, width = 8), side = "left")
#assign("e", e, envir = slider.env)
#evalq(tkconfigure(e, textvariable = ymin), envir = slider.env)
#tkpack(tklabel(fr, text = "Ymax =", width = 6), side = "left")
#tkpack(e <- tkentry(fr, width = 8), side = "left")
#assign("e", e, envir = slider.env)
#evalq(tkconfigure(e, textvariable = ymax), envir = slider.env)
#tkpack(tkbutton(m, text = "histogram", command = function(method){p1(method)}),
# side = "left")
tkpack(tkbutton(m, text = "ecdf", command = function(){method="ecdf"; p1(method); abline(v=as.numeric(evalq(tclvalue(mu), envir = slider.env)), col="blue")}), side="left")
tkpack(tkbutton(m, text = "Exit", command = function() tkdestroy(m)),
side = "right")
}
This diff is collapsed.
.onAttach <- function(lib,pkg)
{
packageStartupMessage("\n--------\n")
packageStartupMessage("you may start the graphical user interface by running 'sdcGUI()'\n")
}
citHeader("To cite package 'sdcMicroGUI' in publications use:")
desc <- packageDescription("sdcMicroGUI")
year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", desc$Date)
vers <- paste("R package version", desc$Version)
citEntry(entry="Article",
title = "A Graphical User Interface for Microdata Protection Which Provides Reproducibility and Interactions: the sdcMicro GUI",
author = personList(as.person("Matthias Templ, Thomas Petelin")),
journal = "Transactions on Data Privacy",
year = "2009",
volume = "2",
number = "3",
pages = "207--224",
textVersion =
paste("Matthias Templ",
" (2009). A Graphical User Interface for Microdata Protection Which Provides Reproducibility and Interactions: the sdcMicro GUI. Transactions on Data Privacy, 2(3),207-224 ",
".", sep=""))
\name{plot.indivRisk}
\alias{plot.indivRisk}
\title{ plot method for indivRisk objects }
\description{
Plots an interactive histogramm or ecdf plot with various interactive sliders.
}
\usage{
\method{plot}{indivRisk}(x, ...)
}
\arguments{
\item{x}{ object of class \sQuote{indivRisk} }
\item{\dots}{ Additional arguments passed through. }
}
\details{
With the sliders one can move the individual risk threshold. By this movement the threshold
will be moved on the plot and the slider with a re-idendification rate and the slider of the number of unsafe records (based on your
chosen threshold) are also moved based on the individual risk threshold. This plot is very similar to the individual risk plot of
the software mu-Argus.
}
\references{ look e.g. on the mu-Argus manuals available at \url{http://neon.vb.cbs.nl/casc/Software/MuManual4.1.pdf}
Templ, M.
\emph{Statistical Disclosure Control for Microdata Using the R-Package sdcMicro},
Transactions on Data Privacy,
vol. 1, number 2, pp. 67-85, 2008.
\url{http://www.tdp.cat/issues/abs.a004a08.php}
}
\author{ Matthias Templ }
\seealso{ \code{\link{indivRisk}} }
\examples{
## example from Capobianchi, Polettini and Lucarelli:
data(francdat)
ff <- freqCalc(francdat, keyVars=c(2,4,5,6),w=8)
irisk <- indivRisk( ff )
## and now apply:
## plot(irisk)
data(free1)
ff <- freqCalc(free1, keyVars=1:3, w=30)
irisk2 <- indivRisk(ff)
## and now apply:
## plot(irisk2)
}
\keyword{ aplot }
\name{sdcGUI}
\alias{sdcGUI}
\docType{data}
\title{ GUI for the sdcMicro package}
\description{
This graphical user interface supports the main functions of sdcMicro.
}
\usage{sdcGUI()}
\details{
This GUI provides an extension to the package sdcMicro. The developed GUI makes sdcMicro
accessible to a wider range of users including ones not used to the R command line interface.
The user can access all basic functions for microdata protection by using this
GUI.
The graphical user interface of sdcMicro allows an interactive interaction between objects.
Flexibility is provided by automatic displaying of the main results (e.g.,
the summary of the frequency counts and the estimated disclosure risk) which
are updated after a user interaction automatically. Additional flexibility is
provided by storing all the users operations with all parameters in a script
which can then be saved, modified and/or reloaded. Thus, full reproducibility
is provided also when using this GUI instead of the CLI version.
It is programmed based on the gWidgetsRGtk2 and RGtk2 package.