Data ingestion and processing

import rasterio as rs 
from rasterio.windows import Window
from rasterio.transform import xy
from rasterio.vrt import WarpedVRT
from rasterio.crs import CRS
from rasterio.windows import Window

from pyproj import Transformer
import pandas as pd
from pandas import DataFrame
import geopandas as gpd
import numpy as np
from numpy import array,meshgrid,arange,log
from shapely import box


import ibis as ib
ib.options.interactive = True
# With dask

import dask as dk
import dask.array as da
import dask.dataframe as dd
# import dask_image as dki
import xarray as xr
import rioxarray as rx

from pyarrow import float16,float32,schema,field,uint16,table,Table
from pyarrow.parquet import ParquetWriter
# Input raster: a GHSL built-settlement tile in World Mollweide (ESRI:54009).
# The commented alternatives below are other resolutions/products of the same
# dataset family; uncomment exactly one `src_file` at a time.
# big file
src_file="/Users/cenv1069/Documents/data/datasets/JRC/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"

# src_file = "/Users/cenv1069/Documents/data/datasets/JRC/S_10m/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"

# src_file = "/Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19.tif"

# src_file = "Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R9_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R9_C19.tif"

# smaller size file
# src_file="/Users/cenv1069/Documents/data/datasets/JRC/S_1000m/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19.tif"

The working conversion workflows are shown below.

# inspired by : https://rasterio.readthedocs.io/en/stable/topics/virtual-warping.html

# Destination CRS for the warped output: WGS84 lon/lat.
dst_crs = CRS.from_epsg(4326)
print(dst_crs)

# Options for WarpedVRT: only the CRS is overridden; transform, dimensions
# and resampling are derived from the source dataset.
vrt_options = {
    # 'resampling': Resampling.cubic,
    'crs': dst_crs,
    # 'transform': dst_transform,
    # 'height': dst_height,
    # 'width': dst_width,
}

out_path = "test_med.parquet"

# Parquet schema: one row per retained raster cell (lon, lat, value).
rast_schema = schema([('lon', float32()),
                      ('lat', float32()),
                      ('band_var', float32()),
                      ])

# BUG FIX: pyarrow's Schema.with_metadata returns a NEW schema object; the
# original code discarded the return value, so the column descriptions were
# never attached. Rebind the result to rast_schema.
rast_schema = rast_schema.with_metadata({
    "lon": "Longitude coordinate",
    "lat": "Latitude coordinate",
    "band_var": "Value associated",
})

Using a virtual warper (rasterio `WarpedVRT`) and block windows

%%time

with ParquetWriter(out_path, rast_schema) as writer:
    with rs.open(src_file) as src:
        
        src_crs = src.crs
        if len(src.nodatavals)>1:
            nodata = src.nodatavals[0]
        else :
            nodata = src.nodatavals

        print("No data value : ", nodata)
        print("Detected source crs : ", src_crs)

        with WarpedVRT(src, **vrt_options) as vrt:
            # At this point 'vrt' is a full dataset with dimensions,
            # CRS, and spatial extent matching 'vrt_options'.
            # Read all data into memory.
            # data = vrt.read()
            # Process the dataset in chunks.  Likely not very efficient.
            
            win_transfrom = vrt.window_transform

            for _, window in vrt.block_windows():
                # print(src.crs)
                band1 = vrt.read(window=window)
                
                height = band1.shape[1]
                width = band1.shape[2]
                cols, rows = meshgrid(arange(width), arange(height))

                xs, ys = xy(
                    transform = win_transfrom(window),
                    rows=rows,
                    cols=cols)

                lons = array(xs)
                lats = array(ys)
                
                out = DataFrame({"band_var" : array(band1).flatten()
                                        ,'lon': lons.flatten()
                                        ,'lat': lats.flatten()})
                
                out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)
                out.drop(index=out.loc[out.band_var<=0].index,inplace=True)
                # print(out.shape)
                # print(out.head())

                if out.shape[0]!=0:
                    writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))

            # # # Dump the aligned data into a new file.  A VRT representing
            # # # this transformation can also be produced by switching
            # # # to the VRT driver.
            # # directory, name = os.path.split(path)
            # # outfile = os.path.join(directory, 'aligned-{}'.format(name))
            # # rio_shutil.copy(vrt, outfile, driver='GTiff')

Using a classic window approach

# %%time

# Classic windowed read: iterate the source raster's native block windows,
# reproject each window's cell coordinates to WGS84 with pyproj, and append
# the (lon, lat, band_var) rows to a parquet file.
with ParquetWriter(out_path, rast_schema) as writer:
    with rs.open(src_file) as src:

        src_crs = src.crs
        win_transform = src.window_transform

        # Transformer from the source CRS to WGS84; always_xy=True keeps the
        # (x=lon, y=lat) axis order regardless of CRS conventions.
        transformer = Transformer.from_crs(str(src_crs), 'EPSG:4326', always_xy=True)

        # src.nodatavals is a tuple with one entry per band.
        # BUG FIX: the original only indexed the tuple when len > 1, so for a
        # single-band raster `nodata` stayed a tuple (e.g. (65535.0,)) and the
        # `band_var == nodata` filter below could never match a scalar column.
        nodata = src.nodatavals[0] if len(src.nodatavals) else None

        print("No data value : ", nodata)
        print("Detected source crs : ", src_crs)

        # Process the dataset in chunks (native block windows).
        for ij, window in src.block_windows():
            band1 = src.read(window=window)

            height = band1.shape[1]
            width = band1.shape[2]
            cols, rows = meshgrid(arange(width), arange(height))

            # Cell coordinates in the source CRS for this window...
            xs, ys = xy(
                transform=win_transform(window),
                rows=rows,
                cols=cols)

            # ...reprojected to lon/lat arrays of the same shape.
            lons, lats = transformer.transform(array(xs), array(ys))

            out = DataFrame({'lon': lons.flatten(),
                             'lat': lats.flatten(),
                             "band_var": band1[0, :, :].flatten(),
                             })

            # Keep only valid, strictly positive cells (single pass instead
            # of two in-place drops).
            out = out.loc[(out.band_var != nodata) & (out.band_var > 0)]

            if out.shape[0] != 0:
                writer.write_table(Table.from_pandas(df=out, schema=rast_schema, preserve_index=False, safe=True))
# Sanity-check the freshly written parquet with ibis (interactive mode
# prints each expression's result inline).
test_data = ib.read_parquet(out_path)
test_data.head()
test_data.count()
test_data.band_var.max()
# test_data.select("lon","lat")
# test_xy = test_data.select("lon","lat").to_pandas()
# gpd.GeoSeries(gpd.points_from_xy(test_xy["lon"],test_xy["lat"],crs="epsg:4326")).explore()

Processing a huge file

# Scratch check: rasterio reports nodata values as a tuple (e.g. (255,)),
# which motivated the nodata handling in the converter script.
type((255,))==tuple
# using the module from the script
! python ../../src/scalenav/rast_converter.py /Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19.tif ../../test_big.parquet
Reading the following file(s):  /Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19.tif
No data value :  (65535.0,)
Detected source crs :  ESRI:54009
# using the module from the build package 
! python -m scalenav.rast_converter /Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19.tif ../../test_big.parquet
Reading the following file(s):  /Users/cenv1069/Documents/data/datasets/JRC/S_100m/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19/GHS_BUILT_S_NRES_E2015_GLOBE_R2023A_54009_100_V1_0_R8_C19.tif
No data value :  (65535.0,)
Detected source crs :  ESRI:54009

Reading a huge processed file

# Query a large processed parquet lazily through DuckDB via ibis
# (avoids loading ~500M rows into memory).
conn = ib.connect("duckdb://")
test_file = conn.read_parquet('../../H_AGBH_100.parquet')
test_file.band_var.min()

┌──────────┐
│ 0.000072 │
└──────────┘
test_file.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon        lat        band_var ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32float32  │
├───────────┼───────────┼──────────┤
│ 14.30357577.9915240.015588 │
│ 14.30457077.9915240.015588 │
│ 14.19399577.9905240.029040 │
│ 14.19499277.9905240.029040 │
│ 14.19598877.9905240.029040 │
└───────────┴───────────┴──────────┘
test_file.count()

┌───────────┐
│ 526511828 │
└───────────┘
# with rs.open("/Users/cenv1069/Documents/data/datasets/JRC/S_1000m/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R7_C22/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R7_C22.tif") as src:
#     # Process the dataset in chunks.  Likely not very efficient.
#     print(src.height)
#     print(src.width)
    
#     for ij, window in src.block_windows():
#         print((ij, window))
1000
1000
((0, 0), Window(col_off=0, row_off=0, width=256, height=256))
((0, 1), Window(col_off=256, row_off=0, width=256, height=256))
((0, 2), Window(col_off=512, row_off=0, width=256, height=256))
((0, 3), Window(col_off=768, row_off=0, width=232, height=256))
((1, 0), Window(col_off=0, row_off=256, width=256, height=256))
((1, 1), Window(col_off=256, row_off=256, width=256, height=256))
((1, 2), Window(col_off=512, row_off=256, width=256, height=256))
((1, 3), Window(col_off=768, row_off=256, width=232, height=256))
((2, 0), Window(col_off=0, row_off=512, width=256, height=256))
((2, 1), Window(col_off=256, row_off=512, width=256, height=256))
((2, 2), Window(col_off=512, row_off=512, width=256, height=256))
((2, 3), Window(col_off=768, row_off=512, width=232, height=256))
((3, 0), Window(col_off=0, row_off=768, width=256, height=232))
((3, 1), Window(col_off=256, row_off=768, width=256, height=232))
((3, 2), Window(col_off=512, row_off=768, width=256, height=232))
((3, 3), Window(col_off=768, row_off=768, width=232, height=232))
# import time
# from tqdm import tqdm
# from datetime import datetime

# def process_data():
#     # Simulate data processing loop
#     for _ in tqdm(range(100), desc="Processing data..."):
#         time.sleep(0.1)  # Simulating some processing time for each iteration

# def show_clock():
#     with tqdm(total=0, bar_format="{desc}", dynamic_ncols=True) as pbar:
#         while True:
#             # Get current time
#             now = datetime.now().strftime("%H:%M:%S")
#             # Update the tqdm bar with the current time
#             pbar.set_description(f"Clock: {now}")
#             time.sleep(1)  # Update every second


# # Run the clock in the background
# import threading
# clock_thread = threading.Thread(target=show_clock, daemon=True)
# clock_thread.start()

# # Run the data processing function
# process_data()

# # Optionally, wait for the clock thread to finish if needed
# clock_thread.join()

Generating sample data sets

Let’s select a window from which we want to sample. We can rely on the boundaries of the GHSL tiles for consistency and ease:

# Lon/lat bounding box of GHSL tile R8_C19 as [xmin, ymin, xmax, ymax].
ghsl_tile_bounds = {"R8_C19" : [-0.419440,8.098056,9.778591,16.259129]}
# ccw=False builds the polygon ring clockwise; the bare expression renders
# the geometry inline in the notebook.
box_window = box(*ghsl_tile_bounds["R8_C19"],ccw=False)
box_window
../_images/7db98beec7c26cb1168e7222111d4f5d31cb421879a77535c9b297fdf52d448f.svg
# 
# Source NetCDF (Kummu et al. HDI pedigree data) and the destination GeoTIFF
# for a small clipped test raster (created by the commented-out mask step below).
src_nc = "/Users/cenv1069/Documents/data/datasets/kummu_etal/pedigree_HDI_1990_2015_v2.nc"
out_file = "../../data/test_data_1.tif"
# with rs.open(src_nc) as src:
#     print(src.bounds)
#     print(src.shape)

#     out_image, out_transform = rs.mask.mask(src, [box_window], crop=True)
#     out_meta = src.meta
#     # print(out_meta)
#     if len(out_image)>2:
#         out_image = out_image[out_image.shape[0]-1]

#     out_meta.update({"driver": "GTiff",
#                     "height": out_image.shape[0],
#                     "width": out_image.shape[1],
#                     'crs': "epsg:4326",
#                     "transform": out_transform})

#     with rs.open(out_file, "w", **out_meta) as dest:
#         dest.write(out_image)
# with rs.open(out_file) as src:
#     print(src.shape)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 with rs.open(out_file) as src:
      2     print(src.shape)

NameError: name 'rs' is not defined
# Run the converter as a library call on the small test raster.
# NOTE(review): presumably writes "rast_convert.parquet" (read back in the
# next cell) — confirm the default output path in scalenav.rast_converter.
from scalenav.rast_converter import rast_converter
rast_converter(in_path="../../data/test_data_1.tif")
Reading the following file(s):  ../../data/test_data_1.tif
No data value :  -9.0
Detected source crs :  EPSG:4326
# Read the converted output back with pandas and inspect shape and head.
rast_convert = pd.read_parquet("rast_convert.parquet")
print(rast_convert.shape)
rast_convert.head()
(122, 3)
lon lat band_var
0 -0.375000 8.125 4.0
1 -0.291667 8.125 4.0
2 -0.208333 8.125 4.0
3 -0.125000 8.125 4.0
4 -0.041667 8.125 4.0
gpd.GeoSeries.from_xy(rast_convert["lon"],rast_convert["lat"],crs="epsg:4326").explore()
Make this Notebook Trusted to load map: File -> Trust Notebook