Python Dataset Subsetting Script Using w10n

Python Dataset Subsetting Script Using w10n

Postby yiboj » Mon Aug 22, 2016 12:52 pm

Webification (W10n; https://podaac.jpl.nasa.gov/podaac_labs) for science is ReSTful web service technology providing simplified access to PODAAC data and metadata via HTTP/HTTPS protocols. W10n-sci supports major Earth science data formats like NetCDF and HDF 4/5, and abstracts an arbitrary data store as a hierarchical tree of nodes, each with associated attributes, which can be individually interrogated. Direct access to the inner components of the tree is made via HTTP requests from a web browser, script, or similar client. Results of w10n-sci calls return specified measurement arrays (or metadata elements) via subset by array value (vs. subset by array index), and output in formats such as JSON, HTML, or netCDF as specified in the URL request.

In this recipe, the multiple array selector w10n API is used to subset level 2, level 3 and level 4 data granules:

http://host:port/path/store/.../{array0,array1,array2,...}/[selector]?output=format

For example, to extract AMSR-E Level 2 Sea Surface Temperature (SST) data from 2011-01-01 to 2011-01-31 for a region bounded by -30 to -20 degrees longitude and -40 to 0 degrees latitude, this request can be made on the UNIX command line:

% ./subset_w10n.py -s 20110101 -f 20110131 -b -30 -20 -40 0 -x AMSRE-REMSS-L2P-v7a

where 'AMSRE-REMSS-L2P-v7a' is the shortname for the dataset. The shortname is a required field for the script to run. Without it the script will print out a help menu.

The shortname can be found at the PO.DAAC dataset information page for each product. For example for the AMSR-E SST dataset it is here:

http://podaac.jpl.nasa.gov/dataset/AMSRE-REMSS-L2P-v7a


Code: Select all
#! /usr/bin/env python
#
# a skeleton script to download a set of L2 files using w10n.
#
#
#   2016.08.20  Yibo Jiang, version 0

##################################
# user parameters to be edited:  #
##################################

# Caution: This is a Python script, and Python takes indentation seriously.
# DO NOT CHANGE INDENTATION OF ANY LINE BELOW!

# Example:
# % ./subset_w10n.py -s 20100101 -f 20100205 -b -30 -20 -40 0 -x AMSRE-REMSS-L2P-v7a
# Subset the data from 1 Jan 2010 to 5 Feb 2010 in a box from -30 to -20 degrees longitude and -40 to 0 degrees latitude
# for shortName AMSRE-REMSS-L2P-v7a.

import sys,os
import commands
import time
from datetime import date, timedelta
from optparse import OptionParser
import urllib

from xml.dom import minidom

#####################
# Global Parameters #
#####################

itemsPerPage = 10
PODAAC_WEB = 'http://podaac.jpl.nasa.gov'
w10n_root = "http://podaac-w10n.jpl.nasa.gov/w10n/"

###############
# subroutines #
###############

def today():
  import datetime
  todays=datetime.date.today()
  return str(todays.year)+str(todays.month).zfill(2)+str(todays.day).zfill(2)

def yesterday():
  import datetime
  yesterdays=datetime.date.today() - timedelta(days=1)
  return str(yesterdays.year)+str(yesterdays.month).zfill(2)+str(yesterdays.day).zfill(2)

def yearday(day,month,year):
  months=[0,31,28,31,30,31,30,31,31,30,31,30,31]
  if isLeap(year):
    months[2]=29
  for m in range(month):
    day=day+months[m]
  return(day)

def parseoptions():
  usage = "Usage: %prog [options]"
  parser = OptionParser(usage)
  parser.add_option("-x", "--shortname", help="product short name", dest="shortname")

  parser.add_option("-s", "--start", help="start date: Format yyyymmdd (eg. -s 20140502 for May 2, 2014) [default: yesterday %default]", dest="date0", default=yesterday())
  parser.add_option("-f", "--finish", help="finish date: Format yyyymmdd (eg. -f 20140502 for May 2, 2014) [default: today %default]", dest="date1", default=today())
  parser.add_option("-b", "--subset", help="limit the domain to a box given by lon_min,lon_max,lat_min,lat_max (eg. -r -140 -110 20 30) [default: %default]",
                        type="float", nargs=4, dest="box", default=(-180.,180.,-90.,90.))

  # Parse command line arguments
  (options, args) = parser.parse_args()

  # print help if no arguments are given
  if(len(sys.argv) == 1):
    parser.print_help()
    exit(-1)

  if options.shortname == None:
    print('\nShortname is required !\nProgram will exit now !\n')
    parser.print_help()
    exit(-1)

  return( options )

def standalone_main():
  # get command line options:

  options=parseoptions()

  shortname = options.shortname

  date0 = options.date0
  if options.date1==-1:
    date1 = date0
  else:
    date1 = options.date1

  if len(date0) != 8:
    sys.exit('\nStart date should be in format yyyymmdd !\nProgram will exit now !\n')

  if len(date1) != 8:
    sys.exit('\nEnd date should be in format yyyymmdd !\nProgram will exit now !\n')

  year0=date0[0:4]; month0=date0[4:6]; day0=date0[6:8];
  year1=date1[0:4]; month1=date1[4:6]; day1=date1[6:8];

  timeStr = '&startTime='+year0+'-'+month0+'-'+day0+'&endTime='+year1+'-'+month1+'-'+day1

  box = list( options.box )

  print '\nPlease wait while program searching for the granules ...\n'

  wsurl = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=1&sortBy=timeAsc&format=atom'
  response = urllib.urlopen(wsurl)
  data = response.read()

  if (len(data.splitlines()) == 1):
    sys.exit('No granules found for dataset: '+shortname+'\nProgram will exit now !\n')

  numGranules = 0
  doc = minidom.parseString(data)
  for arrays in doc.getElementsByTagName('link'):
   names = arrays.getAttribute("title")
   if names == 'OPeNDAP URL':
      numGranules = numGranules + 1
      href = arrays.getAttribute("href")

  if numGranules == 0 and len(data.splitlines()) < 30:
    sys.exit('No granules found for dataset: '+shortname+'\nProgram will exit now !\n')
  elif numGranules == 0 and len(data.splitlines()) > 30:
    sys.exit('No OpenDap access for dataset: '+shortname+'\nProgram will exit now !\n')

  samplefile = href.rsplit( ".", 1 )[ 0 ] + '.ddx'

  variable_list = []
  lon_order = 'X'
  lat_order = 'Y'
  nt = 1 #time dimension
  nd = 1 #depth dimension

  doc = minidom.parse(urllib.urlopen(samplefile))

  for arrays in doc.getElementsByTagName('Array'):
   names = arrays.getAttribute("name")
   if names == 'lat' or names == 'latitude':
     for dimensions in arrays.getElementsByTagName("dimension"):
       size = dimensions.getAttribute("size")
       name = dimensions.getAttribute("name")
       ni = int(size)
     for attrs in arrays.getElementsByTagName("Attribute"):
       aname = attrs.getAttribute("name")
       if aname == 'axis':
         for nodes in attrs.getElementsByTagName("value"):
           for cn in nodes.childNodes:
              lat_order = cn.nodeValue
   elif names == 'lon' or names == 'longitude':
     for dimensions in arrays.getElementsByTagName("dimension"):
       size = dimensions.getAttribute("size")
       name = dimensions.getAttribute("name")
       nj = int(size)
     for attrs in arrays.getElementsByTagName("Attribute"):
       aname = attrs.getAttribute("name")
       if aname == 'axis':
         for nodes in attrs.getElementsByTagName("value"):
           for cn in nodes.childNodes:
              lon_order = cn.nodeValue
   else:
     variable_list.append(names)

  try:
     ni # does a exist in the current namespace
  except NameError:
    sys.exit('Granule file format may not be in netcdf or no latitude or longitude info for dataset: '+shortname+'\n')

  #************************************************************************************
  # download size information:
  print ' '
  print 'Longitude range: %f to %f'%(box[0],box[1])
  print 'Latitude range: %f to %f'%(box[2],box[3])
  print ' '

  r=raw_input('OK to download?  [yes or no]: ')
  if len(r)==0 or (r[0]!='y' and r[0]!='Y'):
    print '... no download'
    sys.exit(0)

  # main loop:
  start = time.time()
  bmore = 1
  while (bmore > 0):
   if (bmore == 1):
       urllink = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=%d&sortBy=timeAsc'%itemsPerPage
   else:
       urllink = PODAAC_WEB+'/ws/search/granule/?shortName='+shortname+timeStr+'&itemsPerPage=%d&sortBy=timeAsc&startIndex=%d'%(itemsPerPage, (bmore-1)*itemsPerPage)
   bmore = bmore + 1
   response = urllib.urlopen(urllink)
   data = response.read()
   doc = minidom.parseString(data)

   numGranules = 0
   for arrays in doc.getElementsByTagName('link'):
    names = arrays.getAttribute("title")
    if names == 'FTP URL':
      numGranules = numGranules + 1
      href = arrays.getAttribute("href")
      aindex = href.find("allData");
      str_temp = w10n_root + href[aindex:]
      ncfile = str_temp.rsplit( ".", 1 )[ 0 ]
      head, tail = os.path.split(str_temp)
      ncout = tail
      if ncout.endswith('.bz2') or ncout.endswith('.gz'):
        ncout = ncout.rsplit( ".", 1 )[ 0 ]
      ncout = ncout.rsplit( ".", 1 )[ 0 ]+'_subset.'+ncout.rsplit( ".", 1 )[ 1 ]
      cmd=ncfile+'.nc/{'
      cmd = cmd + 'sea_surface_temperature,lat,lon}'

      cmd=cmd+"/[%s<lat<%s,%s<lon<%s]?output=nc.4" % (box[2],box[3],box[0],box[1])

      status_curl, result = commands.getstatusoutput("which curl")
      status_wget, result = commands.getstatusoutput("which wget")

      if status_curl == 0:
        cmd='curl -g "'+cmd+'" -o '+ ncout
      elif status_wget == 0:
        cmd='wget "'+cmd+'" -O '+ ncout
      else:
        sys.exit('\nThe script will need curl or wget on the system, please install them first before running the script !\nProgram will exit now !\n')

      os.system( cmd )
      print ncout + ' download finished !'

   if numGranules < itemsPerPage:
     bmore = 0

  end = time.time()
  print 'Time spend = ' + str(end - start) + ' seconds'

if __name__ == "__main__":
        standalone_main()



The script can also be downloaded from the link: ftp://podaac.jpl.nasa.gov/allData/common/sw/subsetting/subset_w10n.py

For more information on the w10n-sci API please visit this link: http://mipldevwebify1.jpl.nasa.gov:10080/data/oco2.html
yiboj
 
Posts: 130
Joined: Mon Mar 30, 2015 11:22 am

Return to Data Access and Services