Open EHdr raster with hdr header file from memory


fkluibenschaedl@...
 

Hi All,

I am processing SNODAS data archives that store each variable in a single-band ESRI BIL format raster. rasterio.open() works like a charm when a) fp references a file on disk and b) when a hdr (header) file exists in that same location. (I am attaching a working set of files for reference).

I am curious if there is a way to load the raster from memory though. Since each eraster within the archive aforementioned archive is compressed using gzip, The goal is to download the dataset, unpack the .bil file, open it with the matching header information, and store it as e.g. GeoTIFF for further processing. Below is some code that (obviously) fails when it passes a tuple to the rasterio.open fp argument.

Looking forward to your thoughts on this. Thanks in advance,

Florian

import gzip
import tarfile
import rasterio

from ftplib import FTP
from io import BytesIO
from io import StringIO

header_values = {'byteorder': 'M',
                 'layout': 'bil',
                 'nbands': 1,
                 'nbits': 16,  # i.e. 2 bytes indicated in meta info text
                 'ncols': 6935,
                 'nodata_value': -9999,
                 'nrows': 3351,
                 'pixeltype': 'signedint',
                 'ulxmap': -124.729583333333,
                 'ulymap': 52.8704166666666,
                 'xdim': 0.00833333333333333,
                 'ydim': 0.00833333333333333
                 }


HOST = "sidads.colorado.edu"
ftp_path = 'DATASETS/NOAA/G02158/masked/2021/01_Jan/SNODAS_20210101.tar'
data_file_name = 'us_ssmv11034tS__T0001TTNATS2021010105HP001.dat.gz'

dl_file = BytesIO()
ftp = FTP(HOST)
ftp.login()
# ftp.login('anonymous',password)
ftp.retrbinary('RETR ' + ftp_path, dl_file.write)
dl_file.seek(0)


with tarfile.open(fileobj=dl_file, mode='r:') as tar_handle:
    gz_data_file = tar_handle.extractfile(member=data_file_name)
    with gzip.open(gz_data_file) as data_file:
        with StringIO() as hdr_file:
            for field, value in header_values.items():
                hdr_file.write(f"{field} {value}\n")
            with rasterio.open((data_file, hdr_file), driver="EHdr") as src_ds:
                out_profile = src_ds.profile
                out_profile.update(driver="GTiff")
                out_profile.update(compress="lzw")
                with rasterio.open('out.tiff', "w", **out_profile) as out_ds:
                    out_ds.write(src_ds.read(1), 1)


Luke
 

You could use a raw raster VRT combined with the /vsitar and /vsigzip virtual filesystems (you could probably use /vsicurl as well to replace your FTP code, but I couldn't get that to work immediately so gave up).


import rasterio

vrt = '''
<VRTDataset rasterXSize="6935" rasterYSize="3351">
    <SRS>EPSG:4326</SRS>
    <GeoTransform>-124.729583333333,0.00833333333333333,0,52.8704166666666,0,-0.00833333333333333</GeoTransform>
    <VRTRasterBand dataType="Int16" band="1" subClass="VRTRawRasterBand">
        <SourceFilename relativetoVRT="0">/vsigzip//vsitar/{}</SourceFilename>
        <NoDataValue>-9999</NoDataValue>
        <ImageOffset>0</ImageOffset>
        <PixelOffset>2</PixelOffset>
        <LineOffset>13870</LineOffset>
        <ByteOrder>MSB</ByteOrder>
    </VRTRasterBand>
</VRTDataset>
'''
file_path = '/path/to/SNODAS_20210101.tar/us_ssmv11050lL00T0024TTNATS2021010105DP000.dat.gz'
with rasterio.open(vrt.format(file_path)) as src:
    print(src.profile)

On Tue., 29 Jun. 2021, 08:27 , <fkluibenschaedl@...> wrote:

Hi All,

I am processing SNODAS data archives that store each variable in a single-band ESRI BIL format raster. rasterio.open() works like a charm when a) fp references a file on disk and b) when a hdr (header) file exists in that same location. (I am attaching a working set of files for reference).

I am curious if there is a way to load the raster from memory though. Since each eraster within the archive aforementioned archive is compressed using gzip, The goal is to download the dataset, unpack the .bil file, open it with the matching header information, and store it as e.g. GeoTIFF for further processing. Below is some code that (obviously) fails when it passes a tuple to the rasterio.open fp argument.

Looking forward to your thoughts on this. Thanks in advance,

Florian

import gzip
import tarfile
import rasterio

from ftplib import FTP
from io import BytesIO
from io import StringIO

header_values = {'byteorder': 'M',
                 'layout': 'bil',
                 'nbands': 1,
                 'nbits': 16,  # i.e. 2 bytes indicated in meta info text
                 'ncols': 6935,
                 'nodata_value': -9999,
                 'nrows': 3351,
                 'pixeltype': 'signedint',
                 'ulxmap': -124.729583333333,
                 'ulymap': 52.8704166666666,
                 'xdim': 0.00833333333333333,
                 'ydim': 0.00833333333333333
                 }


HOST = "sidads.colorado.edu"
ftp_path = 'DATASETS/NOAA/G02158/masked/2021/01_Jan/SNODAS_20210101.tar'
data_file_name = 'us_ssmv11034tS__T0001TTNATS2021010105HP001.dat.gz'

dl_file = BytesIO()
ftp = FTP(HOST)
ftp.login()
# ftp.login('anonymous',password)
ftp.retrbinary('RETR ' + ftp_path, dl_file.write)
dl_file.seek(0)


with tarfile.open(fileobj=dl_file, mode='r:') as tar_handle:
    gz_data_file = tar_handle.extractfile(member=data_file_name)
    with gzip.open(gz_data_file) as data_file:
        with StringIO() as hdr_file:
            for field, value in header_values.items():
                hdr_file.write(f"{field} {value}\n")
            with rasterio.open((data_file, hdr_file), driver="EHdr") as src_ds:
                out_profile = src_ds.profile
                out_profile.update(driver="GTiff")
                out_profile.update(compress="lzw")
                with rasterio.open('out.tiff', "w", **out_profile) as out_ds:
                    out_ds.write(src_ds.read(1), 1)


Luke
 

Replacing your FTP code with direct access using the /vsicurl/ virtual filesystem:

import rasterio

vrt = '''
<VRTDataset rasterXSize="6935" rasterYSize="3351">
<SRS>EPSG:4326</SRS>
<GeoTransform>-124.729583333333,0.00833333333333333,0,52.8704166666666,0,-0.00833333333333333</GeoTransform>
<VRTRasterBand dataType="Int16" band="1" subClass="VRTRawRasterBand">
<SourceFilename relativetoVRT="0">{}/{}/{}</SourceFilename>
<NoDataValue>-9999</NoDataValue>
<ImageOffset>0</ImageOffset>
<PixelOffset>2</PixelOffset>
<LineOffset>13870</LineOffset>
<ByteOrder>MSB</ByteOrder>
</VRTRasterBand>
</VRTDataset>
'''
""
vsi_path = '/vsigzip//vsitar//vsicurl'
ftp_path = 'ftp://sidads.colorado.edu/DATASETS/NOAA/G02158/masked/2021/01_Jan/SNODAS_20210101.tar'
file_path = 'us_ssmv11050lL00T0024TTNATS2021010105DP000.dat.gz'
with rasterio.open(vrt.format(vsi_path, ftp_path, file_path)) as src:
print(src.profile)


fkluibenschaedl@...
 

On Fri, Jul 2, 2021 at 01:12 AM, Luke wrote:
Replacing your FTP code with direct access using the /vsicurl/ virtual filesystem:

This is a very elegant solution, thanks for pointing it out and thanks for the sample code! The only drawback I see is that it is not possible to access multiple products (files within the archive) while at the same time downloading the archive only once.

Thanks again,
F