####################################################
## A few quick and dirty ways to examine a dataset.
## and a handy recoding function
####################################################
## read in a not-quite-familiar dataset
##library(foreign)
##acs113<-read.dta(file="/90days/carlmIPUMS/usa_00113.dta")
##save(file="/90days/carlmIPUMS/acs113.Rsave", acs113)
## load() restores the objects stored in the .Rsave file into the workspace
## and returns their names; print() shows which objects arrived.
print(load(file="/90days/carlmIPUMS/acs113.Rsave"))
## list the variable names of the loaded data
names(acs113)
# A very nice overview of your data: describe() comes from the Hmisc package
library(Hmisc)
describe(acs113) # takes a few seconds
## describe() also takes weights; here they come from the perwt column
describe(acs113,weights=acs113$perwt) # even more seconds
## Here's the handy recoding trick: there is a recode() function in the "car"
## library -- the "a-r" in car stands for 'applied regression', so someday you
## may find other bits in that library to be useful too.
library(car)
## recode() accepts a few syntax variants; this is the most general one.
## The "recodes=" argument takes a single character string (enclosed in ' ')
## made up of ";"-separated expressions of the form "old value" = new value.
## Only the three text labels below are rewritten; every other value passes
## through unchanged. The same specification is used for both age variables.
age_recodes <- '"Less than 1 year old"=0;
"90 (90+ in 1980 and 1990)" =90;
"100 (100+ in 1970)" =100'

## NOTE(review): newer car releases appear to call this argument `as.factor`
## rather than `as.factor.result` -- confirm against the installed version.
acs113$ageNN <- recode(acs113$age, as.factor.result = TRUE,
                       recodes = age_recodes)
## With as.factor.result = TRUE the result is still a factor ...
is.factor(acs113$ageNN)
## ... so convert via as.character() first. as.numeric() applied directly to a
## factor returns the internal level codes (1, 2, 3, ...), not the ages that
## the level labels spell out. Any label that is still not a number becomes NA
## (with a coercion warning).
acs113$ageNN <- as.numeric(as.character(acs113$ageNN))

## same for father's age
acs113$AGE_POPNN <- recode(acs113$AGE_POP, as.factor.result = TRUE,
                           recodes = age_recodes)
## again a factor ...
is.factor(acs113$AGE_POPNN)
## ... and again converted label-by-label rather than by level code
acs113$AGE_POPNN <- as.numeric(as.character(acs113$AGE_POPNN))
## Kernel density estimates: respondent's age, then father's age
## (father's age needs na.rm = TRUE)
age_density <- density(acs113$ageNN)
plot(age_density)
father_age_density <- density(acs113$AGE_POPNN, na.rm = TRUE)
plot(father_age_density)

## Empirical cumulative distribution functions of the same two variables
age_ecdf <- ecdf(acs113$ageNN)
plot(age_ecdf)
father_age_ecdf <- ecdf(acs113$AGE_POPNN)
plot(father_age_ecdf)

## Compare a numerical variable to the normal distribution with a Q-Q
## (quantile-to-quantile) plot. First a reference picture: this is what the
## qqnorm of a normal random variable looks like.
normal_draws <- rnorm(1000)
qqnorm(normal_draws)

## ageNN is truncated -- only women 15 to 50 are included
qqnorm(acs113$ageNN)
## examine a categorical variable with just a few levels:
## plot the frequency table of citizen
plot(table(acs113$citizen))
## examine two-way frequencies: region by citizen
table(acs113$region,acs113$citizen)
## using as.character() to convert a factor to character strings
## can have the effect of deleting those levels which have zero
## representation in the data
table(as.character(acs113$region),as.character(acs113$citizen))
## and of course one can plot this sort of thing
## (las=2 asks for axis labels drawn perpendicular to the axes)
plot(table(as.character(acs113$region),as.character(acs113$citizen)),las=2)
## see what happens without las=2
## examine a categorical variable with many levels: a dot chart of the
## stateicp counts, sorted in increasing order, drawn at 80% size (cex=.8)
dotchart(sort(table(as.character(acs113$stateicp))),cex=.8)
## experiment with cex
##