Python Dataset Subsetting Script Using w10n

Webification (W10n; https://podaac.jpl.nasa.gov/podaac_labs) for science is ReSTful web service technology providing simplified access to PODAAC data and metadata via HTTP/HTTPS protocols. W10n-sci supports major Earth science data formats like NetCDF and HDF 4/5, and abstracts an arbitrary data store as a hierarchical tree of nodes for each associated attributes which can be individually interrogated. Direct access to the inner components of the tree is made via HTTP requests from either a web browser, script or similar client. Results of w10n-sci calls return specified measurement arrays (or metadata elements) via subset by array value (v.s. subset by array index), and output in formats such as JSON, HTML, or netCDF as specified in the URL request.
In this recipe, the multiple array selector w10n API is used to subset level 2, level 3 and level 4 data granules:
http://host:port/path/store/.../{array0,array1,array2,...}/[selector]?output=format
For example, to extract AMSR-E Level 2 Sea Surface Temperature (SST) data from 2011-01-01 to 2011-01-31 for a region bounded by -30 to -20 degrees longitude and -40 to 0 degrees latitude, this request can be made on the UNIX command line:
% ./subset_w10n.py -s 20110101 -f 20110131 -b -30 -20 -40 0 -x AMSRE-REMSS-L2P-v7a
where 'AMSRE-REMSS-L2P-v7a' is the shortname for the dataset. The shortname is a required field for the script to run. Without it the script will print out a help menu.
The shortname can be found at the PO.DAAC dataset information page for each product. For example for the AMSR-E SST dataset it is here:
http://podaac.jpl.nasa.gov/dataset/AMSRE-REMSS-L2P-v7a
The script can also be downloaded from the link: ftp://podaac.jpl.nasa.gov/allData/common/sw/subsetting/subset_w10n.py
For more information on the w10n-sci API please visit this link: http://mipldevwebify1.jpl.nasa.gov:10080/data/oco2.html
In this recipe, the multiple array selector w10n API is used to subset level 2, level 3 and level 4 data granules:
http://host:port/path/store/.../{array0,array1,array2,...}/[selector]?output=format
For example, to extract AMSR-E Level 2 Sea Surface Temperature (SST) data from 2011-01-01 to 2011-01-31 for a region bounded by -30 to -20 degrees longitude and -40 to 0 degrees latitude, this request can be made on the UNIX command line:
% ./subset_w10n.py -s 20110101 -f 20110131 -b -30 -20 -40 0 -x AMSRE-REMSS-L2P-v7a
where 'AMSRE-REMSS-L2P-v7a' is the shortname for the dataset. The shortname is a required field for the script to run. Without it the script will print out a help menu.
The shortname can be found at the PO.DAAC dataset information page for each product. For example for the AMSR-E SST dataset it is here:
http://podaac.jpl.nasa.gov/dataset/AMSRE-REMSS-L2P-v7a
- Code: Select all
#! /usr/bin/env python
#
# a skeleton script to download a set of L2 files using w10n.
#
#
# 2016.08.20 Yibo Jiang, version 0
##################################
# user parameters to be editted: #
##################################
# Caution: This is a Python script, and Python takes indentation seriously.
# DO NOT CHANGE INDENTATION OF ANY LINE BELOW!
# Example:
# % ./subset_w10n.py -s 20100101 -f 20100205 -b -30 -20 -40 0 -x AMSRE-REMSS-L2P-v7a
# Subset the data from 1 Jan 2010 to 2 Jan 2010 in a box from -140 to -110 degrees longitude and 20 to 30 degrees latitude
# for shortName AMSRE-REMSS-L2P-v7a.
import sys,os
import commands
import time
from datetime import date, timedelta
from optparse import OptionParser
import urllib
from xml.dom import minidom
#####################
# Global Parameters #
#####################
itemsPerPage = 10
PODAAC_WEB = 'http://podaac.jpl.nasa.gov'
w10n_root = "http://podaac-w10n.jpl.nasa.gov/w10n/"
###############
# subroutines #
###############
def today():
import datetime
todays=datetime.date.today()
return str(todays.year)+str(todays.month).zfill(2)+str(todays.day).zfill(2)
def yesterday():
import datetime
yesterdays=datetime.date.today() - timedelta(days=1)
return str(yesterdays.year)+str(yesterdays.month).zfill(2)+str(yesterdays.day).zfill(2)
def yearday(day,month,year):
months=[0,31,28,31,30,31,30,31,31,30,31,30,31]
if isLeap(year):
months[2]=29
for m in range(month):
day=day+months[m]
return(day)
def parseoptions():
usage = "Usage: %prog [options]"
parser = OptionParser(usage)
parser.add_option("-x", "--shortname", help="product short name", dest="shortname")
parser.add_option("-s", "--start", help="start date: Format yyyymmdd (eg. -s 20140502 for May 2, 2014) [default: yesterday %default]", dest="date0", default=yesterday())
parser.add_option("-f", "--finish", help="finish date: Format yyyymmdd (eg. -f 20140502 for May 2, 2014) [default: today %default]", dest="date1", default=today())
parser.add_option("-b", "--subset", help="limit the domain to a box given by lon_min,lon_max,lat_min,lat_max (eg. -r -140 -110 20 30) [default: %default]",
type="float", nargs=4, dest="box", default=(-180.,180.,-90.,90.))
# Parse command line arguments
(options, args) = parser.parse_args()
# print help if no arguments are given
if(len(sys.argv) == 1):
parser.print_help()
exit(-1)
if options.shortname == None:
print('\nShortname is required !\nProgram will exit now !\n')
parser.print_help()
exit(-1)
return( options )
def standalone_main():
# get command line options:
options=parseoptions()
shortname = options.shortname
date0 = options.date0
if options.date1==-1:
date1 = date0
else:
date1 = options.date1
if len(date0) != 8:
sys.exit('\nStart date should be in format yyyymmdd !\nProgram will exit now !\n')
if len(date1) != 8:
sys.exit('\nEnd date should be in format yyyymmdd !\nProgram will exit now !\n')
year0=date0[0:4]; month0=date0[4:6]; day0=date0[6:8];
year1=date1[0:4]; month1=date1[4:6]; day1=date1[6:8];
timeStr = '&startTime='+year0+'-'+month0+'-'+day0+'&endTime='+year1+'-'+month1+'-'+day1
box = list( options.box )
print '\nPlease wait while program searching for the granules ...\n'
wsurl = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=1&sortBy=timeAsc&format=atom'
response = urllib.urlopen(wsurl)
data = response.read()
if (len(data.splitlines()) == 1):
sys.exit('No granules found for dataset: '+shortname+'\nProgram will exit now !\n')
numGranules = 0
doc = minidom.parseString(data)
for arrays in doc.getElementsByTagName('link'):
names = arrays.getAttribute("title")
if names == 'OPeNDAP URL':
numGranules = numGranules + 1
href = arrays.getAttribute("href")
if numGranules == 0 and len(data.splitlines()) < 30:
sys.exit('No granules found for dataset: '+shortname+'\nProgram will exit now !\n')
elif numGranules == 0 and len(data.splitlines()) > 30:
sys.exit('No OpenDap access for dataset: '+shortname+'\nProgram will exit now !\n')
samplefile = href.rsplit( ".", 1 )[ 0 ] + '.ddx'
variable_list = []
lon_order = 'X'
lat_order = 'Y'
nt = 1 #time dimension
nd = 1 #depth dimension
doc = minidom.parse(urllib.urlopen(samplefile))
for arrays in doc.getElementsByTagName('Array'):
names = arrays.getAttribute("name")
if names == 'lat' or names == 'latitude':
for dimensions in arrays.getElementsByTagName("dimension"):
size = dimensions.getAttribute("size")
name = dimensions.getAttribute("name")
ni = int(size)
for attrs in arrays.getElementsByTagName("Attribute"):
aname = attrs.getAttribute("name")
if aname == 'axis':
for nodes in attrs.getElementsByTagName("value"):
for cn in nodes.childNodes:
lat_order = cn.nodeValue
elif names == 'lon' or names == 'longitude':
for dimensions in arrays.getElementsByTagName("dimension"):
size = dimensions.getAttribute("size")
name = dimensions.getAttribute("name")
nj = int(size)
for attrs in arrays.getElementsByTagName("Attribute"):
aname = attrs.getAttribute("name")
if aname == 'axis':
for nodes in attrs.getElementsByTagName("value"):
for cn in nodes.childNodes:
lon_order = cn.nodeValue
else:
variable_list.append(names)
try:
ni # does a exist in the current namespace
except NameError:
sys.exit('Granule file format may not be in netcdf or no latitude or longitude info for dataset: '+shortname+'\n')
#************************************************************************************
# download size information:
print ' '
print 'Longitude range: %f to %f'%(box[0],box[1])
print 'Latitude range: %f to %f'%(box[2],box[3])
print ' '
r=raw_input('OK to download? [yes or no]: ')
if len(r)==0 or (r[0]!='y' and r[0]!='Y'):
print '... no download'
sys.exit(0)
# main loop:
start = time.time()
bmore = 1
while (bmore > 0):
if (bmore == 1):
urllink = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=%d&sortBy=timeAsc'%itemsPerPage
else:
urllink = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=%d&sortBy=timeAsc&startIndex=%d'%(itemsPerPage, (bmore-1)*itemsPerPage)
bmore = bmore + 1
response = urllib.urlopen(urllink)
data = response.read()
doc = minidom.parseString(data)
numGranules = 0
for arrays in doc.getElementsByTagName('link'):
names = arrays.getAttribute("title")
if names == 'FTP URL':
numGranules = numGranules + 1
href = arrays.getAttribute("href")
aindex = href.find("allData");
str_temp = w10n_root + href[aindex:]
ncfile = str_temp.rsplit( ".", 1 )[ 0 ]
head, tail = os.path.split(str_temp)
ncout = tail
if ncout.endswith('.bz2') or ncout.endswith('.gz'):
ncout = ncout.rsplit( ".", 1 )[ 0 ]
ncout = ncout.rsplit( ".", 1 )[ 0 ]+'_subset.'+ncout.rsplit( ".", 1 )[ 1 ]
cmd=ncfile+'.nc/{'
cmd = cmd + 'sea_surface_temperature,lat,lon}'
cmd=cmd+"/[%s<lat<%s,%s<lon<%s]?output=nc.4" % (box[2],box[3],box[0],box[1])
status_curl, result = commands.getstatusoutput("which curl")
status_wget, result = commands.getstatusoutput("which wget")
if status_curl == 0:
cmd='curl -g "'+cmd+'" -o '+ ncout
elif status_wget == 0:
cmd='wget "'+cmd+'" -O '+ ncout
else:
sys.exit('\nThe script will need curl or wget on the system, please install them first before running the script !\nProgram will exit now !\n')
os.system( cmd )
print ncout + ' download finished !'
if numGranules < itemsPerPage:
bmore = 0
end = time.time()
print 'Time spend = ' + str(end - start) + ' seconds'
if __name__ == "__main__":
standalone_main()
The script can also be downloaded from the link: ftp://podaac.jpl.nasa.gov/allData/common/sw/subsetting/subset_w10n.py
For more information on the w10n-sci API please visit this link: http://mipldevwebify1.jpl.nasa.gov:10080/data/oco2.html