Pre-processing intertidal data

All data for the intertidal pages is to be pulled directly from metacat and then pre-processed to a suitable form. This is in contrast to the subtidal pages where much of the data was based on "depth populated" tables and other sets that required extensive SAS processing.

This page documents the process I am using to pre-process these data so that others can check my methods and so that I can more easily create a Kepler workflow in the future. I will be using R as there is already an actor built into Kepler to work with it.

UPC

    1 # complete documentation for this script can be found at :
    2 # http://ucsb.piscoweb.org/~cburt/intertidal/workflow.html
    3 classes = c(
    4   'factor',   #site_code
    5   'factor',   #survey_rep
    6   'integer',  #year
    7   'integer',  #month
    8   'integer',  #day
    9   'factor',   #section
   10   'numeric',  #transect must be numeric
   11   'factor',   #location
   12   'factor',   #class_code
   13   'factor',   #sampler_code
   14   'numeric',  #point
   15   'factor',   #pool_code
   16   'factor',   #substrate_code
   17   'factor',   #canopy_code
   18   'factor',   #epi_code
   19   'factor'    #host_code
   20 )
   21 upc_data  = read.csv(
   22   file="upc_numeric.csv", 
   23   na.strings='.', 
   24   head=TRUE, sep=",", 
   25   colClasses = classes
   26 )
   27 data = data.frame()
   28 sites = unique(upc_data$site_code);
   29 class_codes = levels(upc_data$class_code)
   30 num_of_sites = length(sites)
   31 site_count = 0
   32 for(site in sites){
   33   site_count = site_count + 1
   34   message = paste('site', site_count, '/', num_of_sites)
   35   print(message)
   36   data_for_site = subset(upc_data, site_code == sites[site]);
   37   years = unique(data_for_site$year)
   38   num_of_years = length(years)
   39   year_counter = 0
   40   for(y in years){
   41     year_counter = year_counter + 1
   42     message = paste('year', year_counter, '/', num_of_years)
   43     print(message)
   44     data_for_year = subset(data_for_site, year == y)
   45     t = table(data_for_year[c('transect', 'class_code')])
   46     points_counted = table(data_for_year$transect)
   47     #We have to stack this to allow matrix division
   48     points_counted = stack(points_counted)[,1]
   49     percent_cover = (t / points_counted) * 100
   50     mean = colMeans(percent_cover)
   51     standard_deviation = sd(percent_cover)
   52     n = length(percent_cover[, 1])
   53     standard_error = standard_deviation / sqrt(n)
   54     df = data.frame(
   55       year = y,
   56       site_code = sites[site],
   57       class_code=as.array(labels(mean)),
   58       mean=mean,
   59       standard_deviation=standard_deviation,
   60       standard_error=standard_error,
   61       n = n
   62     )
   63     data = rbind(data, df)
   64   }
   65 }
   66 write.table(
   67   na.omit(data),
   68   file="./upc_output.csv",
   69   row.names=FALSE,
   70   col.names = FALSE,
   71   sep="\t",
   72   quote=FALSE
   73 )

lines 1 - 20This is where the script specifies typing for the data coming in from the data catalog's csv file. It is good to not have any surprises here as sometimes having a factor instead of an integer can mess things up. In the case of building a frequence table on line 45, if transect were a factor, the table() function would populate zeros for every transect label found in the whole dataset. As this data has not only transect 0, 3, 6, 9..., but can also have 0, 3.9, 6.9, ..., that could be a problem.

lines 21 - 26Data from the catalog csv file is loaded with our options, switching "." to 'NA'

Quad

    3 classes = c(
    4   'factor',   #site_code
    5   'factor',   #survey_rep
    6   'integer',  #year
    7   'integer',  #month
    8   'integer',  #day
    9   'factor',   #section
   10   'factor',   #transect
   11   'factor',   #location
   12   'factor',   #classification_code
   13   'factor',   #sampler_code
   14   'factor',   #zone_code
   15   'integer'   #count
   16 )
   17 quad_data = read.csv(
   18   file="quad_numeric.csv", 
   19   na.strings='.', 
   20   head=TRUE, sep=",", 
   21   colClasses = classes
   22 )

First, a vector of data types are defined so that we can define the quad_data data.frame more finely. Otherwise, we might end up with factors where we may not want them. With these types, the quad_numeric data as it comes off the data catalog is read. "."'s are replaced with "NA" and the header is ignored.

   23 # this script will only handle the number of rows in nrow for the output.
   24 # as of 2006 this is 30,000 rows
   25 data = matrix(nrow=100000, ncol=7)
   26 #pointer for adding rows to the output matrix
   27 pointer = 0

data is set up as a matrix to hold all of the output. The number of rows is set to 100,000. Right now with 6 years of data the output is at 30,000 rows so this should be ok for a while. A pointer variable is set up so that data can be appended to the matrix without overwriting anything.

I have used a matrix rather than a data.frame and the rbind() function because it is much faster. A very quick test indicated improvement on the order of 40%

   28 sites = unique(quad_data$site_code);
   29 class_codes = levels(quad_data$class_code)
   30 num_of_sites = length(sites)
   31 site_count = 0

The script then gets a list of sites and class_codes to iterate over.

num_of_sites and site_count are for printing out progress messages to the screen.

   32 for(site in sites){
   33   site_count = site_count + 1
   34   message = paste('site', site_count, '/', num_of_sites)
   35   print(message)
   36   data_for_site = subset(quad_data, site_code == sites[site]);
   37   years = unique(data_for_site$year)

Now starting to iterate over each site. Lines 33-35 print a status message to the screen. quad_data is subset for just the current site. The years when the site was sampled are put into years.

   38   num_of_years = length(years)
   39   year_counter = 0
   40   for(y in years){
   41     year_counter = year_counter + 1
   42     message = paste('year', year_counter, '/', num_of_years)
   43     print(message)
   44     data_for_year = subset(data_for_site, year == y)
   45     #taking the taxon list from the year data. 
   46     #So assuming that if a species does not have a zero datapoint, it has not been sampled
   47     cc = unique(data_for_year$class_code)

Before iterating over the years, num_of_years and year_counter are setup for printing status messages. After entering the for-loop the message is output. data_for_site is then subset for the selected year. Unique class_codes within that site/year are then pulled out and put into cc.

Notice that cc is derived from the unique class_codes appearing in data_for_year. Any class_codes not recorded are left out. I assumed that zeros are already populated for each class_code on the list. If this is a problem we would need to create another classes table specifically for quad.

   48     for(c in cc){
   49       data_for_taxon = subset(data_for_year, class_code == class_codes[c])
   50       counts = data_for_taxon$count * 4
   51       mean = mean(counts)

Start iterating over each class_code. The data is then subset for the current class_code. The count is pulled out of that subset and multiplied by 4. The quadrates are 50cm^2 so this will give us a count per meter^2. A mean is then taken from this.

   52       if(!is.na(mean)){
   53         pointer = pointer + 1
   54         standard_deviation = sd(counts)
   55         n = length(counts)
   56         standard_error = standard_deviation / sqrt(n)
   57         data[pointer, ] = c(
   58           data_for_taxon$year[1],   #year 
   59           sites[site],              #site
   60           class_codes[c],           #class_code
   61           mean,                     #mean
   62           standard_deviation,       #standard_deviation
   63           standard_error,           #standard_error
   64           n                         #replicates, number of quads
   65         )
   66       }else{
   67         print('warning: mean is NA')
   68       }

If there is only one value in the counts vector, mean() will return 'NA'. I'm throwing those values out because it is impossible to get standard error. If there is a mean, pointer is advanced in anticipation of storing the preprocessed data. Standard deviation and standard error are calculated. The row of preprocessed data is then added to the data matrix.

   69     }
   70   }
   71 }
   72 write.table(
   73   na.omit(data),
   74   file="./quad_output.csv",
   75   row.names=FALSE,
   76   col.names = FALSE,
   77   sep="\t",
   78   quote=FALSE
   79 )

After all the sites, years, and class_codes have been iterated through, data needs to be written out to a csv file for entry into the database. row/col names are not to be written and the delimiter is set to tabs(\t). Quotes are also set to false as the matrix adds these and postgres has trouble parsing them.

The resulting file ends up looking like so...

2001    10      360     16.8484848484848        37.7062007600551        6.56380704330534        33
2001    10      364     0			0			0			33
2001    10      366     0			0			0			33
2001    10      368     0			0			0			33
2001    10      369     2.66666666666667        7.39369100427295        1.28707639888456        33
2001    10      371     0			0			0			33
2001    10      372     5.21212121212121        12.2263996534077        2.12834299244277        33
2001    10      373     0			0			0			33
2001    10      374     3.75757575757576        7.80636880626286        1.35891438331992        33
2001    10      375     0			0			0			33
2001    10      376     11.8787878787879        18.4522315313040        3.21212121212121        33
2001    10      378     4.96969696969697        8.36841102183103        1.45675337475417        33
2001    10      379     0			0			0			33
2001    10      380     43.2727272727273        73.1262917523824        12.7296534569872        33

Here is the complete script (quad_preprocessing.r):

    1 # complete documentation for this script can be found at :
    2 # http://ucsb.piscoweb.org/~cburt/intertidal/workflow.html
    3 classes = c(
    4   'factor',   #site_code
    5   'factor',   #survey_rep
    6   'integer',  #year
    7   'integer',  #month
    8   'integer',  #day
    9   'factor',   #section
   10   'factor',   #transect
   11   'factor',   #location
   12   'factor',   #classification_code
   13   'factor',   #sampler_code
   14   'factor',   #zone_code
   15   'integer'   #count
   16 )
   17 quad_data = read.csv(
   18   file="quad_numeric.csv", 
   19   na.strings='.', 
   20   head=TRUE, sep=",", 
   21   colClasses = classes
   22 )
   23 # this script will only handle the number of rows in nrow for the output.
   24 # as of 2006 this is 30,000 rows
   25 data = matrix(nrow=100000, ncol=7)
   26 #pointer for adding rows to the output matrix
   27 pointer = 0
   28 sites = unique(quad_data$site_code);
   29 class_codes = levels(quad_data$class_code)
   30 num_of_sites = length(sites)
   31 site_count = 0
   32 for(site in sites){
   33   site_count = site_count + 1
   34   message = paste('site', site_count, '/', num_of_sites)
   35   print(message)
   36   data_for_site = subset(quad_data, site_code == sites[site]);
   37   years = unique(data_for_site$year)
   38   num_of_years = length(years)
   39   year_counter = 0
   40   for(y in years){
   41     year_counter = year_counter + 1
   42     message = paste('year', year_counter, '/', num_of_years)
   43     print(message)
   44     data_for_year = subset(data_for_site, year == y)
   45     #taking the taxon list from the year data. 
   46     #So assuming that if a species does not have a zero datapoint, it has not been sampled
   47     cc = unique(data_for_year$class_code)
   48     for(c in cc){
   49       data_for_taxon = subset(data_for_year, class_code == class_codes[c])
   50       counts = data_for_taxon$count * 4
   51       mean = mean(counts)
   52       if(!is.na(mean)){
   53         pointer = pointer + 1
   54         standard_deviation = sd(counts)
   55         n = length(counts)
   56         standard_error = standard_deviation / sqrt(n)
   57         data[pointer, ] = c(
   58           data_for_taxon$year[1],   #year 
   59           sites[site],              #site
   60           class_codes[c],           #class_code
   61           mean,                     #mean
   62           standard_deviation,       #standard_deviation
   63           standard_error,           #standard_error
   64           n                         #replicates, number of quads
   65         )
   66       }else{
   67         print('warning: mean is NA')
   68       }
   69     }
   70   }
   71 }
   72 write.table(
   73   na.omit(data),
   74   file="./quad_output.csv",
   75   row.names=FALSE,
   76   col.names = FALSE,
   77   sep="\t",
   78   quote=FALSE
   79 )

Swath

    1 swath   = read.csv(file="swath_numeric.csv", head=TRUE, sep=",")
    2 swath$class_code = factor(swath$class_code)

Here we are simply pulling in the swath data as it is stored on the data catalog. In the future, this could be brought into the working directory using the metacat API or Kepler.

I have also turned the class_code column into a factor. This specifies that every class_code must be represented when generating frequence tables.

    2 sites = unique(swath$site_code);
    3 class_codes = unique(levels(swath$class_code))

Grab all the unique class and site codes and throw them into vectors to iterate over.

    4 for(site in sites){
    5   data_for_site = subset(swath, site_code == site);
    6   years = unique(data_for_site$year)
    7   for(year in years){ ... }
   26 }

Now we have a looping structure that for each site, then goes into a loop for each year that the site was sampled, where the data is further subset.

    8   for(year in years){
    9     data_for_year = subset(data_for_site, year == year)
   10     t = table(data_for_year[c('transect', 'class_code')])

Inside that inner loop, a frequency table is generated that counts the occurrence of each count of a particular organism for each transect (line 10). R automagically does this because the class_codes are specified as a factor. So even if a particular class_code is not present for a transect, it's absence will be noted. Peering into variable t would look like this ...

	        class_code
	transect 54 146 167 239 283 284 297 526 527 575
	      0   0   0   2   0   0   0   0   0  10   1
	      3   0   0   0   0   0   0   0   0  28   1
	      6   0   0  12   0   0   0   4   0  17   1
	      9   0   0   2   1   0   0   1   0  13   5
	      12  0   0   2   0   0   0   0   0  17   1
	      15  0   0   1   0   0   0   0   0  10   0
	      18  0   0   0   0   0   0   0   0  31   0
	      21  0   0   1   0   0   0   0   0   9   1
	      24  0   0   2   0   0   0   0   0   6   1
	      27  0   0   3   0   0   0   3   0  22   0
	      30  0   0   0   0   0   0   0   0  40   1

   11     mean = colMeans(t)
   12     standard_deviation = sd(t)
   13     n = length(t[, 1])
   14     standard_error = standard_deviation / sqrt(n)

Using this table, the script generates mean, standard_deviation, and standard_error for each transect and class_code...

> mean
       54        146        167        239        283        284        297        526        527        575 
0.0000000  0.0000000  2.2727273  0.0909091  0.0000000  0.0000000  0.7272727  0.0000000 18.4545455  1.0909091

>standard_deviation
       54        146        167        239        283        284        297        526        527        575 
0.0000000  0.0000000  3.3790800  0.3015113  0.0000000  0.0000000  1.4206273  0.0000000 10.7085353  1.3751033

> standard_error
       54       146       167       239       283       284       297       526       527       575 
0.0000000 0.0000000 1.0188310 0.0909091 0.0000000 0.0000000 0.4283352 0.0000000 3.2287449 0.4146092
   15     df = data.frame(
   16       year = year,
   17       site_code = site,
   18       class_code=as.array(labels(mean)),
   19       mean=mean,
   20       standard_deviation=standard_deviation,
   21       standard_error=standard_error,
   22       n = n
   23     )
   24     print(df)

Those numbers are then put into a data.frame and printed to the screen for manual checking. Afterwards these rows from each data.frame are loaded into a SQL database for use on the web application.

This is what the data.frame, df, looks like...

	    year site_code class_code       mean standard_deviation standard_error  n
	54  2005      3006         54  0.0000000          0.0000000      0.0000000 11
	146 2005      3006        146  0.0000000          0.0000000      0.0000000 11
	167 2005      3006        167  2.2727273          3.3790800      1.0188310 11
	239 2005      3006        239  0.0909091          0.3015113      0.0909091 11
	283 2005      3006        283  0.0000000          0.0000000      0.0000000 11
	284 2005      3006        284  0.0000000          0.0000000      0.0000000 11
	297 2005      3006        297  0.7272727          1.4206273      0.4283352 11
	526 2005      3006        526  0.0000000          0.0000000      0.0000000 11
	527 2005      3006        527 18.4545455         10.7085353      3.2287449 11
	575 2005      3006        575  1.0909091          1.3751033      0.4146092 11

Here is the complete script (swath_preprocessing.r):

    1 swath   = read.csv(file="swath_numeric.csv", head=TRUE, sep=",")
    2 swath$class_code = factor(swath$class_code)
    3 sites = unique(swath$site_code);
    4 class_codes = unique(levels(swath$class_code))
    5 for(site in sites){
    6   data_for_site = subset(swath, site_code == site);
    7   years = unique(data_for_site$year)
    8   for(year in years){
    9     data_for_year = subset(data_for_site, year == year)
   10     t = table(data_for_year[c('transect', 'class_code')])
   11     mean = colMeans(t)
   12     standard_deviation = sd(t)
   13     n = length(t[, 1])
   14     standard_error = standard_deviation / sqrt(n)
   15     df = data.frame(
   16       year = year,
   17       site_code = site,
   18       class_code=as.array(labels(mean)),
   19       mean=mean,
   20       standard_deviation=standard_deviation,
   21       standard_error=standard_error,
   22       n = n
   23     )
   24     print(df)
   25   }
   26 }