Data ingestion and processing

import rasterio as rs 
from rasterio.windows import Window
from rasterio.transform import xy
from rasterio.vrt import WarpedVRT
from rasterio.crs import CRS
from rasterio.windows import Window

from pyproj import Transformer

from pandas import DataFrame
import geopandas as gpd
import numpy as np
from numpy import array,meshgrid,arange,log

import ibis as ib
ib.options.interactive = True
# With dask

import dask as dk
import dask.array as da
import dask.dataframe as dd
# import dask_image as dki
import xarray as xr
import rioxarray as rx

from pyarrow import float32,schema,field,uint16,table,Table
from pyarrow.parquet import ParquetWriter
# big file
# src_file="/Users/cenv1069/Documents/data/datasets/JRC/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"

src_file = "/Users/cenv1069/Documents/data/datasets/JRC/S_10m/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"

# smaller size file
# src_file="/Users/cenv1069/Documents/data/datasets/JRC/S_1000m/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19.tif"

Working workflows below

# inspired by : https://rasterio.readthedocs.io/en/stable/topics/virtual-warping.html

dst_crs = CRS.from_epsg(4326)
print(dst_crs)
EPSG:4326
vrt_options = {
    # 'resampling': Resampling.cubic,
    'crs': dst_crs,
    # 'transform': dst_transform,
    # 'height': dst_height,
    # 'width': dst_width,
}

out_path = "S_10m_NRES_R8_C19.parquet"

rast_schema = schema([('lon',float32())
                    ,('lat',float32())
                    ,('band_var',float32())
                    ])

rast_schema.with_metadata({
        "lon" : "Longitude coordinate",
        "lat" : "Latitude coordinate",
        "band_var" : "Value associated",
                            })
lon: float
lat: float
band_var: float
-- schema metadata --
lon: 'Longitude coordinate'
lat: 'Latitude coordinate'
band_var: 'Value associated'

Using a virtual Warper and windows

%%time

# with ParquetWriter(out_path, rast_schema) as writer:
#     with rs.open(src_file) as src:
#         src_crs = src.crs
#         if len(src.nodatavals)>1:
#             nodata = src.nodatavals[0]
#         else :
#             nodata = src.nodatavals

#         with WarpedVRT(src, **vrt_options) as vrt:
#             # At this point 'vrt' is a full dataset with dimensions,
#             # CRS, and spatial extent matching 'vrt_options'.
#             # Read all data into memory.
#             # data = vrt.read()
#             # Process the dataset in chunks.  Likely not very efficient.
            
#             win_transfrom = vrt.window_transform

#             for _, window in vrt.block_windows():
#                 # print(src.crs)
#                 band1 = vrt.read(window=window)
                
#                 height = band1.shape[1]
#                 width = band1.shape[2]
#                 cols, rows = meshgrid(arange(width), arange(height))

#                 xs, ys = xy(
#                     transform = win_transfrom(window),
#                     rows=rows,
#                     cols=cols)

#                 lons = array(xs)
#                 lats = array(ys)
                
#                 out = DataFrame({"band_var" : array(band1).flatten()
#                                         ,'lon': lons.flatten()
#                                         ,'lat': lats.flatten()})
                
#                 out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)
#                 out.drop(index=out.loc[out.band_var<=0].index,inplace=True)
#                 # print(out.shape)
#                 # print(out.head())

#                 if out.shape[0]!=0:
#                     writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))

            # # # Dump the aligned data into a new file.  A VRT representing
            # # # this transformation can also be produced by switching
            # # # to the VRT driver.
            # # directory, name = os.path.split(path)
            # # outfile = os.path.join(directory, 'aligned-{}'.format(name))
            # # rio_shutil.copy(vrt, outfile, driver='GTiff')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:1

NameError: name 'ParquetWriter' is not defined

Using a classic window approach

# %%time

with ParquetWriter(out_path, rast_schema) as writer:
        with rs.open(src_file) as src:
            
            src_crs = src.crs
            win_transfrom = src.window_transform
            
            transformer = Transformer.from_crs(str(src_crs), 'EPSG:4326', always_xy=True)
            
            if len(src.nodatavals)>1:
                nodata = src.nodatavals[0]
            else :
                nodata = src.nodatavals
            # Process the dataset in chunks.  Likely not very efficient.
            for ij, window in src.block_windows():
                # print(window)
                # print(src.crs)
                band1 = src.read(window=window)
                # print(band1[0])
                height = band1.shape[1]
                width = band1.shape[2]
                cols, rows = meshgrid(arange(width), arange(height))
                # print(win_transfrom(window))
                xs, ys = xy(
                    transform = win_transfrom(window),
                    rows=rows,
                    cols=cols)
                
                # print(xs,ys)
                
                lons,lats = transformer.transform(array(xs),array(ys))
                # print(lons.shape)
                # print(lats.shape)
                # print(len(array(band1).flatten()))
                # print(len(lons.flatten()))
                
                out = DataFrame({'lon': lons.flatten(),
                                    'lat': lats.flatten(),
                                    "band_var" : array(band1[0,:,:]).flatten(),
                                    })
                
                out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)
                out.drop(index=out.loc[out.band_var<=0].index,inplace=True)
                
                # print(out.shape)
                # print(out.head())
                
                if out.shape[0]!=0:
                        writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))
test_data = ib.read_parquet(out_path)
test_data.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon        lat        band_var ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32float32  │
├───────────┼───────────┼──────────┤
│ -0.00347816.2561931.0 │
│ -0.00338616.2561931.0 │
│ -0.00329516.2561931.0 │
│ -0.00320316.2561931.0 │
│ -0.00356916.2561021.0 │
└───────────┴───────────┴──────────┘
test_data.count()

┌────────┐
│ 243059 │
└────────┘
test_data
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon        lat        band_var ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32float32  │
├───────────┼───────────┼──────────┤
│ -0.00347816.2561931.0 │
│ -0.00338616.2561931.0 │
│ -0.00329516.2561931.0 │
│ -0.00320316.2561931.0 │
│ -0.00356916.2561021.0 │
│ -0.00347816.2561021.0 │
│ -0.00338616.2561021.0 │
│ -0.00329516.2561021.0 │
│ -0.00320316.2561021.0 │
│ -0.00311216.2561021.0 │
│          │
└───────────┴───────────┴──────────┘
# test_data.select("lon","lat")
# test_xy = test_data.select("lon","lat").to_pandas()
# gpd.GeoSeries(gpd.points_from_xy(test_xy["lon"],test_xy["lat"],crs="epsg:4326")).explore()