NSS
rounds 64, 66, and 68 come as .Nesstar binaries inside
.rar archives. Download them with the sister package mospiR, extract with
unar, and read with nesstarR. All examples below use NSS
68th Round Type 2 (Consumer Expenditure Survey, July 2011 - June 2012);
the same steps apply to NSS 64 and 66.
Prerequisites
remotes::install_github("saketlab/mospiR")
remotes::install_github("saketlab/nesstarR")Download from the portal
mospiR::download_dataset() authenticates with the MoSPI
NADA API and downloads the full archive for one survey idno. The API key
is read from MOSPI_KEY (set it in
~/.Renviron).
api_key <- Sys.getenv("MOSPI_KEY")
data_dir <- file.path(
"data", "hces",
"DDI-IND-MOSPI-NSSO-68Rnd-Sch2.0-July2011-June2012"
)
download_dataset(
"DDI-IND-MOSPI-NSSO-68Rnd-Sch2.0-July2011-June2012",
data_dir,
api_key
)The download is a .rar archive, around 270 MB for NSS 68
T2.
Extract the archive
unar (macOS/Linux) and unrar (Windows) both
work. Either must be on your PATH.
rar_file <- file.path(data_dir, "Nss68_1.0_Type2_new format.rar")
system2("unar", c("-o", dirname(rar_file), shQuote(rar_file)))Extraction produces a folder containing:
-
survey0/data/nss68_consumer_expenditure_type2.Nesstar(the binary data) -
ddi.xml(variable metadata in DDI format, not required by nesstarR) -
Readme.pdf/ supporting documents
Parse the Nesstar file
nesstar_parse() reads the entire file into memory and
decodes the binary header, resource index, dataset descriptors, and
variable directories. No row data is loaded at this stage.
nb <- nesstar_parse(nesstar_path)
nb
#> <nesstar_binary>
#> File : nss68_consumer_expenditure_type2.Nesstar
#> Datasets : 11Dataset structure
A single .Nesstar container holds multiple datasets, one
per schedule block (household, person, food, non-food, …).
ds_tbl <- nesstar_datasets(nb)
ds_tbl
#> dataset_number row_count variable_count
#> 1 47 101651 37
#> 2 48 101651 42
#> 3 49 101651 35
#> 4 50 464730 39
#> 5 51 5277850 32
#> 6 52 1493313 29
#> 7 53 370071 28
#> 8 54 2343672 28
#> 9 55 3436786 36
#> 10 56 3317537 28
#> 11 57 101647 41The dataset with the largest row count is the food block: one row per household × item combination (~4-5 million rows for NSS 68 T2).
Variable listing
vars <- nesstar_variables(nb, dataset_number = food_ds)
vars[, c("name", "mode_code", "value_format_code", "width_value")]
#> name mode_code value_format_code width_value
#> 1 Round_Centre_Code 1 2 3
#> 2 FSU_Serial_No 1 2 5
#> 3 Round 1 2 2
#> 4 Sch_no 1 2 3
#> 5 Sample 1 2 1
#> 6 Sector 1 2 1
#> 7 State_region 1 2 3
#> 8 District 1 2 2
#> 9 Stratum 1 2 2
#> 10 Sub_Stratum_No 1 2 2
#> 11 Schedule_type 1 2 1
#> 12 Sub_Round 1 2 1
#> 13 Sub_Sample 1 2 1
#> 14 FOD_Sub_region 1 2 4
#> 15 Hamlet_Group_Sub_Stratum_no 1 2 1
#> 16 Second_Stage_Stratum_No 1 2 1
#> 17 Sample_hhld_no 1 2 2
#> 18 Level 1 2 2
#> 19 Item_Code 1 2 3
#> 20 Home_Produce_Quantity 5 5 7
#> 21 Home_Produce_Value 5 4 5
#> 22 Total_Consumption_Quantity 5 5 7
#> 23 Total_Consumption_Value 5 5 5
#> 24 Source_Code 1 2 1
#> 25 NSS 5 3 2
#> 26 NSC 5 3 2
#> 27 MLT 5 6 8
#> 28 HHID 1 2 9
#> 29 State_code 1 2 2
#> 30 District_Code 1 2 4
#> 31 Combined_Multiplier 5 10 9
#> 32 Subsample_multiplier 0 10 9Embedded labels and category codes
NESSTAR files from NSS 64 onwards embed Huffman-compressed XML blocks with variable labels and category codes.
meta <- nesstar_metadata(nb)
#> Warning in nesstar_metadata(nb): Embedded metadata record ids 1 and 2 not
#> found; this file uses an older format. Labels are unavailable.
if (!is.null(meta)) {
ds_meta <- meta$datasets[[which(
vapply(meta$datasets, `[[`, integer(1), "dataset_number") == food_ds
)]]
cat("File name stored in container:", ds_meta$file_name, "\n\n")
for (v in head(ds_meta$variables, 6)) {
cat(sprintf("%-20s label: %s\n", v$name, v$label))
if (length(v$categories) > 0) {
for (cat_entry in head(v$categories, 4)) {
cat(sprintf(" %s = %s\n", cat_entry$value, cat_entry$label))
}
}
}
} else {
message("Metadata not available for this file (older NSS format).")
}
#> Metadata not available for this file (older NSS format).Read a dataset
nesstar_read_dataset() decodes all columns for the
requested dataset. For the food block this is the slow step (~10-30 s on
a laptop): 4–5 million rows × ~20 columns, decoded from the packed
binary format.
Pass columns to load only what you need:
food <- nesstar_read_dataset(
nb,
dataset_number = food_ds,
columns = c(
"HHID", "State_code", "District_Code", "Sector",
"Combined_Multiplier", "Item_Code", "Total_Consumption_Value"
)
)
cat("Rows:", nrow(food), "| Columns:", ncol(food), "\n")
#> Rows: 5277850 | Columns: 7
head(food)
#> Sector Item_Code Total_Consumption_Value HHID State_code District_Code
#> 1 1 101 420 715581201 17 1701
#> 2 1 102 255 715581201 17 1701
#> 3 1 105 15 715581201 17 1701
#> 4 1 113 60 715581201 17 1701
#> 5 1 129 750 715581201 17 1701
#> 6 1 139 50 715581201 17 1701
#> Combined_Multiplier
#> 1 324.08
#> 2 324.08
#> 3 324.08
#> 4 324.08
#> 5 324.08
#> 6 324.08Quick check: cereal spending by sector
Weighted mean monthly household expenditure on cereals (item codes
101–140), rural vs. urban. The Multiplier column is the
survey weight.
cereals <- food[as.integer(food$Item_Code) >= 101 &
as.integer(food$Item_Code) <= 140, ]
hh_cereals <- aggregate(Total_Consumption_Value ~ HHID + Sector + Combined_Multiplier,
data = cereals, FUN = sum
)
rural <- hh_cereals[hh_cereals$Sector == "1", ]
urban <- hh_cereals[hh_cereals$Sector == "2", ]
cat(sprintf(
"Weighted mean cereal expenditure (Rs/month):\n Rural: %.0f\n Urban: %.0f\n",
weighted.mean(rural$Total_Consumption_Value, rural$Combined_Multiplier, na.rm = TRUE),
weighted.mean(urban$Total_Consumption_Value, urban$Combined_Multiplier, na.rm = TRUE)
))
#> Weighted mean cereal expenditure (Rs/month):
#> Rural: 1486
#> Urban: 1592Export to csv
nesstar_export() writes one CSV (or
.csv.gz) per dataset to a directory. It processes 50,000
rows per chunk by default, so even the 4.65 GB T1 file does not require
loading all rows at once.
# Write CSVs alongside the .Nesstar file so they survive across R sessions
# and avoid filling /tmp on machines with limited temporary storage.
output_dir <- dirname(nesstar_path)
nesstar_export(nb, output_dir = output_dir, compress = FALSE)
#> Wrote: nss68_consumer_expenditure_type2_ds47.csv (101651 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds48.csv (101651 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds49.csv (101651 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds50.csv (464730 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds51.csv (5277850 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds52.csv (1493313 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds53.csv (370071 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds54.csv (2343672 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds55.csv (3436786 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds56.csv (3317537 rows)
#> Wrote: nss68_consumer_expenditure_type2_ds57.csv (101647 rows)
list.files(output_dir, pattern = "\\.csv$")
#> [1] "nss68_consumer_expenditure_type2_ds47.csv"
#> [2] "nss68_consumer_expenditure_type2_ds48.csv"
#> [3] "nss68_consumer_expenditure_type2_ds49.csv"
#> [4] "nss68_consumer_expenditure_type2_ds50.csv"
#> [5] "nss68_consumer_expenditure_type2_ds51.csv"
#> [6] "nss68_consumer_expenditure_type2_ds52.csv"
#> [7] "nss68_consumer_expenditure_type2_ds53.csv"
#> [8] "nss68_consumer_expenditure_type2_ds54.csv"
#> [9] "nss68_consumer_expenditure_type2_ds55.csv"
#> [10] "nss68_consumer_expenditure_type2_ds56.csv"
#> [11] "nss68_consumer_expenditure_type2_ds57.csv"Pass compress = TRUE (the default) for gzip-compressed
output, which typically halves file size with negligible extra time.
nesstar_export(nb,
output_dir = output_dir,
datasets = food_ds,
compress = TRUE,
chunk_size = 100000L
)Verify against Stata reference files
The original NESSTAR Publisher software was used to export each
dataset to Stata .dta format. The code below loads those
exports and checks that nesstarR’s CSV output matches them
row-for-row.
Unzip the Stata archive
# Extract alongside the .Nesstar output to avoid exhausting /tmp on
# space-constrained machines (the full STATA archive expands to ~1.4 GB).
stata_dir <- file.path(output_dir, "stata_68t2")
unzip(stata_zip, exdir = stata_dir)
dta_files <- sort(list.files(stata_dir,
pattern = "\\.dta$",
full.names = TRUE, recursive = TRUE
))
cat(length(dta_files), "Stata files found\n")Build the dataset correspondence table
The NESSTAR container stores a file name for every dataset in its
embedded metadata (<FileName> XML field decoded by
nesstar_metadata()). For NSS files this is something like
Level_1.sav. We extract the level number from that string
and join to the STATA export filenames — a structural match that is
independent of any data property. If the metadata names do not carry
level numbers we fall back to matching on unique
(nrow, ncol) pairs.
library(haven)
level_num <- function(s) {
if (is.na(s) || !nzchar(s)) {
return(NA_integer_)
}
m <- regexpr("[Ll]evel[[:space:]_-]*(\\d+)", s, perl = TRUE)
if (m[[1L]] == -1L) {
return(NA_integer_)
}
cs <- attr(m, "capture.start")[[1L]]
cl <- attr(m, "capture.length")[[1L]]
as.integer(substr(s, cs, cs + cl - 1L))
}
# Reuse metadata from earlier chunk; older containers return NULL, in which
# case every ds$level stays NA and we fall back to (nrow, ncol) matching.
if (!exists("meta")) meta <- nesstar_metadata(nb)
nesstar_ds <- if (!is.null(meta)) {
do.call(rbind, lapply(meta$datasets, function(dm) {
data.frame(
ds_num = dm$dataset_number,
file_name = dm$file_name,
level = level_num(dm$file_name),
stringsAsFactors = FALSE
)
}))
} else {
cat("No embedded XML metadata — falling back to (nrow, ncol) matching.\n")
data.frame(
ds_num = vapply(nb$datasets, `[[`, integer(1), "dataset_number"),
file_name = NA_character_,
level = NA_integer_,
stringsAsFactors = FALSE
)
}
if (!is.null(meta)) {
print(nesstar_ds[, c("ds_num", "file_name", "level")])
}
dta_nobs <- function(path) {
raw <- readBin(path, "raw", n = 512L)
if (length(raw) < 10L) {
return(NA_integer_)
}
ver <- as.integer(raw[1L])
if (ver %in% c(102L, 108L, 110L, 113L, 114L, 115L)) {
bo <- if (as.integer(raw[2L]) == 1L) "big" else "little"
return(readBin(raw[7L:10L], "integer", n = 1L, size = 4L, endian = bo))
}
lt <- as.raw(0x3c)
Nb <- as.raw(0x4e)
for (i in seq_len(length(raw) - 4L)) {
if (raw[i] == lt && raw[i + 1L] == Nb && raw[i + 2L] == as.raw(0x3e)) {
return(readBin(raw[(i + 3L):(i + 6L)], "integer",
n = 1L, size = 4L,
endian = "little"
))
}
}
NA_integer_
}
stata_ds <- do.call(rbind, lapply(dta_files, function(f) {
df <- read_dta(f, n_max = 0L)
data.frame(
stata_path = f,
level = level_num(basename(f)),
nrow = dta_nobs(f),
ncol = ncol(df),
stringsAsFactors = FALSE
)
}))
csv_files <- sort(list.files(output_dir,
pattern = "_ds\\d+\\.csv$",
full.names = TRUE
))
csv_meta <- lapply(csv_files, function(f) {
hdr <- names(read.csv(f, nrows = 0L, check.names = FALSE))
wc_out <- system(paste("wc -l", shQuote(f)), intern = TRUE)
nrows <- as.integer(sub("^\\s*(\\d+).*", "\\1", wc_out)) - 1L
ds_num <- as.integer(sub(".*_ds(\\d+)\\.csv$", "\\1", basename(f)))
list(ds_num = ds_num, path = f, nrow = nrows, ncol = length(hdr))
})
match_tbl <- data.frame(
level = integer(), ds_num = integer(),
csv_path = character(), stata_path = character(), method = character(),
stringsAsFactors = FALSE
)
for (i in seq_len(nrow(stata_ds))) {
sd <- stata_ds[i, ]
nm <- nesstar_ds[!is.na(nesstar_ds$level) & nesstar_ds$level == sd$level, ]
if (nrow(nm) == 1L) {
csv_f <- Filter(function(cm) cm$ds_num == nm$ds_num, csv_meta)
if (length(csv_f) == 1L) {
match_tbl <- rbind(match_tbl, data.frame(
level = sd$level, ds_num = nm$ds_num,
csv_path = csv_f[[1L]]$path, stata_path = sd$stata_path,
method = "metadata", stringsAsFactors = FALSE
))
next
}
}
taken <- match_tbl$ds_num
cands <- Filter(function(cm) {
!cm$ds_num %in% taken &&
cm$nrow == sd$nrow
}, csv_meta)
if (length(cands) > 1L) {
cands <- Filter(function(cm) cm$ncol == sd$ncol, cands)
}
if (length(cands) == 1L) {
match_tbl <- rbind(match_tbl, data.frame(
level = sd$level, ds_num = cands[[1L]]$ds_num,
csv_path = cands[[1L]]$path, stata_path = sd$stata_path,
method = "row+col", stringsAsFactors = FALSE
))
} else {
cat(sprintf(
"Level %2d: COULD NOT MATCH (nrow=%d, ncol=%d)\n",
sd$level, sd$nrow, sd$ncol
))
}
}
match_tbl <- match_tbl[order(match_tbl$level), ]
cat(sprintf("\n%d / %d STATA levels matched:\n", nrow(match_tbl), nrow(stata_ds)))
for (i in seq_len(nrow(match_tbl))) {
r <- match_tbl[i, ]
cat(sprintf(
" Level %2d -> ds%02d [%-8s] %s\n",
r$level, r$ds_num, r$method, basename(r$csv_path)
))
}Full dataset verification
Load every matched pair completely and compare all rows and all
columns. STATA converts . to _ in variable
names; norm_col() normalises both sides.
haven::zap_labels() strips value-label attributes so
numeric columns compare as plain doubles. Each dataset is freed from
memory after comparison to limit peak usage.
norm_col <- function(x) toupper(gsub("[. ]+", "_", trimws(x)))
strip_stata <- function(x) {
if (inherits(x, c("haven_labelled", "haven_labelled_spss"))) {
return(as.numeric(unclass(x)))
}
if (is.factor(x)) {
return(as.character(x))
}
x
}
pass_all <- TRUE
for (i in seq_len(nrow(match_tbl))) {
r <- match_tbl[i, ]
s_df <- tryCatch(
as.data.frame(zap_labels(read_dta(r$stata_path))),
error = function(e) {
cat(sprintf(
"Level %2d / ds%02d: SKIP (STATA read error: %s)\n",
r$level, r$ds_num, conditionMessage(e)
))
NULL
}
)
if (is.null(s_df)) {
pass_all <- FALSE
next
}
c_df <- read.csv(r$csv_path, check.names = FALSE)
names(s_df) <- norm_col(names(s_df))
names(c_df) <- norm_col(names(c_df))
if (nrow(s_df) != nrow(c_df)) {
cat(sprintf(
"Level %2d / ds%02d: FAIL row count STATA=%d CSV=%d\n",
r$level, r$ds_num, nrow(s_df), nrow(c_df)
))
pass_all <- FALSE
rm(s_df, c_df)
gc(verbose = FALSE)
next
}
only_s <- setdiff(names(s_df), names(c_df))
only_c <- setdiff(names(c_df), names(s_df))
common <- intersect(names(s_df), names(c_df))
# Zero-padded survey codes (e.g. "010") lose leading zeros through
# read.csv(); compare numerically when both sides are numeric-convertible.
col_fail <- character(0)
for (col in common) {
sv <- strip_stata(s_df[[col]])
cv <- c_df[[col]]
sv_c <- trimws(as.character(sv))
cv_c <- trimws(as.character(cv))
sv_n <- suppressWarnings(as.numeric(sv_c))
cv_n <- suppressWarnings(as.numeric(cv_c))
sv_was_na <- is.na(sv_c) | sv_c == ""
cv_was_na <- is.na(cv_c) | cv_c == ""
if (sum(is.na(sv_n) & !sv_was_na) == 0L &&
sum(is.na(cv_n) & !cv_was_na) == 0L) {
n_diff <- sum(abs(sv_n - cv_n) > 1e-9, na.rm = TRUE) +
sum(xor(is.na(sv_n), is.na(cv_n)))
} else {
n_diff <- sum(sv_c != cv_c, na.rm = TRUE) +
sum(xor(is.na(sv_c), is.na(cv_c)))
}
if (n_diff > 0L) col_fail <- c(col_fail, sprintf("%s(%d)", col, n_diff))
}
if (length(col_fail) == 0L) {
cat(sprintf(
"Level %2d / ds%02d: PASS %7d rows x %d cols\n",
r$level, r$ds_num, nrow(s_df), length(common)
))
} else {
pass_all <- FALSE
cat(sprintf(
"Level %2d / ds%02d: FAIL %7d rows x %d cols | diffs: %s\n",
r$level, r$ds_num, nrow(s_df), length(common),
paste(col_fail, collapse = ", ")
))
}
if (length(only_s) > 0L) {
cat(sprintf(" only in STATA: %s\n", paste(only_s, collapse = ", ")))
}
if (length(only_c) > 0L) {
cat(sprintf(" only in CSV: %s\n", paste(only_c, collapse = ", ")))
}
rm(s_df, c_df)
gc(verbose = FALSE)
}
if (pass_all) {
cat("\n==> ALL LEVELS PASS: nesstarR output is identical to the STATA reference export.\n")
} else {
cat("\n==> Failures detected — see details above.\n")
}