Data layer

import json
import io
import os
import re
import itertools as iter
import numpy as np

# data
import pandas as pd
import geopandas as gpd
import shapely
import duckdb
import h3

import scalenav.data as snd
import scalenav.scale_nav as sn
from scalenav.plotting import cmap
# from scipy import io as io
# import nctoolkit as nc
# import xarray as xr
# import rioxarray as rx
import glob
import ibis as ib
from ibis import _
import ibis.selectors as s
ib.options.interactive = True

# plots

# from datashader import transfer_functions as tf, reductions as rd
import pypalettes as pypal
import pydeck as pdk
from seaborn import color_palette
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
conn = ib.connect("duckdb://")
conn.raw_sql("install spatial; load spatial")
<duckdb.duckdb.DuckDBPyConnection at 0x17eb026b0>
%pwd
'/Users/cenv1069/Documents/scale_nav/source/notebook'
processed_data_folder = "../../outputs/"
# trying an interpolation method for the right resolution from a given grid size
dist = 0

alpha = 5/np.log(1_000)

A = 13 + alpha*np.log(10)

A 
alpha
# 13+alpha*np.log(10)
0.7238241365054198

Setup

extra_res = 0
grid_params = []
res_params = []

if dist > 0 : 
    # extra_res=round(A-np.log(alpha*dist))

    grid_params = [10,100,1000,5000,10000,dist]
    grid_params.sort()
    
    # res_params = [13,12,11,10,8,extra_res]
    res_params = [round(A-alpha*np.log(size)) for size in grid_params]
    # res_params.sort(reverse=True)

else:
    grid_params = [10,100,1000,5000,10000]
    res_params = [14,12,11,10,8]

res_params

rast_to_h3 = { str(size): {"h3_res" : res, "nn" : [] } for (size,res) in zip(grid_params,res_params)}
rast_to_h3
{'10': {'h3_res': 14, 'nn': []},
 '100': {'h3_res': 12, 'nn': []},
 '1000': {'h3_res': 11, 'nn': []},
 '5000': {'h3_res': 10, 'nn': []},
 '10000': {'h3_res': 8, 'nn': []}}
grid_param = 10

rast_to_h3 = snd.rast_to_h3_map(x=8,y=12,ref="m",dist=dist)

base_res = rast_to_h3[str(grid_param)]["h3_res"]
neighbs = rast_to_h3[str(grid_param)]["nn"]

print("Using base resolution: ",base_res)
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using base resolution:  14
neighbs
[(0, 0),
 (2, 1),
 (1, -1),
 (1, 1),
 (0, 2),
 (-1, 0),
 (0, -2),
 (-2, -1),
 (1, 2),
 (2, 0),
 (0, 1),
 (-1, -1),
 (1, -2),
 (3, 1),
 (1, 0),
 (2, 2),
 (-1, -2),
 (0, -1)]

GHSL MSZ data set

could be moved to another notebook.

# close up london-oxford
# limits = [-1.486,51.138,0.352,51.882]

# wide europe window
# limits = [-14.06,48.34,31.95,59.27]


# Kano Nigeria
limits = [8.46085,11.96741,8.53552,12.01733]

# Lagos
# limits = [3.0624,6.3699,3.5925,6.6592]
# src_file = "outputs/kummu.parquet"
src_file = "new_process.parquet"

layer = conn.read_parquet(f"{processed_data_folder}{src_file}",table_name="layer")
msz_categories = {
    "1" : "Open spaces, low vegetation",
    "2" : "Open spaces, medium vegetation",
    "3" : "Open spaces, high vegetation",
    "4" : "Open spaces, water surfaces",
    "5" : "Open spaces, road surfaces",
    "11" : "Built spaces, residential building height <= 3m",
    "12" : "Built spaces, residential building 3m < height <= 6m",
    "13" : "Built spaces, residential building 6m < height <= 15m",
    "14" : "Built spaces, residential building 15m < height <= 30m",
    "15" : "Built spaces, residential building 30m < height",
    "21" : "Built spaces, non-residential building height <= 3m",
    "22" : "Built spaces, non-residential building 3m < height <= 6m",
    "23" : "Built spaces, non-residential building 6m < height <= 15m",
    "24" : "Built spaces, non-residential building 15m < height <= 30m",
    "25" : "Built spaces, non-residential building 30m < height",
}
print(conn.list_tables())
conn.sql("Select * from layer limit 10")
['layer']
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon        lat        band_var ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32float32  │
├───────────┼───────────┼──────────┤
│ -0.32256316.2438491.0 │
│ -0.32246116.2438491.0 │
│ -0.32235916.2438491.0 │
│ -0.32256316.2437671.0 │
│ -0.32246116.24376711.0 │
│ -0.32235916.24376711.0 │
│ -0.32256316.2436851.0 │
│ -0.32246116.24368511.0 │
│ -0.32235816.24368511.0 │
│ -0.32215316.2427812.0 │
└───────────┴───────────┴──────────┘
# conn.raw_sql("SUMMARIZE layer;")
# layer.describe()

### Fine scale data with CL tool

#  the coarse resolution file
# ! python -m scalenav.rast_converter /Users/cenv1069/Documents/data/datasets/kummu_etal/GDP_PPP_1990_2015_5arcmin_v2.nc outputs/kummu_5arcmin.parquet
# the fine resolution file 
# At the moment fails
# ! python -m scalenav.rast_converter /Users/cenv1069/Documents/data/datasets/kummu_etal/GDP_PPP_30arcsec_v3.nc outputs/kummu_30arcsec.parquet
layer_ = (
    layer
#     filter region
    .filter(_.lon>limits[0]
            ,_.lon<limits[2]
            ,_.lat>limits[1]
            ,_.lat<limits[3])
#     filter something on the band_var
    .filter(_.band_var > 5)
    .sample(.1)
    .execute(
        limit=30_000
        )
)
layer_.head()
lon lat band_var
0 8.462206 12.017195 13.0
1 8.461496 12.017113 13.0
2 8.461597 12.017113 13.0
3 8.463519 12.017113 13.0
4 8.463822 12.017113 13.0
# # sample_data = "data/sample_data_kummu.parquet"

# # sample_data = "data/sample_data_ghsl_msz.parquet"

# if not os.path.exists(sample_data):
#     layer.to_parquet(sample_data)
layer_.shape
(24332, 3)
# layer_.count()

Deck map

cat_palette = "Classic_10"
num_palette = "Burg"

gdp_pc_pal = pypal.load_cmap(cat_palette,reverse=True)
deck_data = layer_#.execute(limit=500_000)
deck_data.head()
lon lat band_var
0 8.462206 12.017195 13.0
1 8.461496 12.017113 13.0
2 8.461597 12.017113 13.0
3 8.463519 12.017113 13.0
4 8.463822 12.017113 13.0
deck_data["col"] = cmap(deck_data.band_var,palette=gdp_pc_pal,log=False)
deck_data["value"] = deck_data["band_var"].apply(lambda x: msz_categories[str(int(x))])
Max input : 24.00, palette colors : 10
print(deck_data.shape)
deck_data.head()
(24332, 5)
lon lat band_var col value
0 8.462206 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
1 8.461496 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
2 8.461597 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
3 8.463519 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
4 8.463822 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
viewstate = pdk.data_utils.compute_view(deck_data[["lon","lat"]])
deck_points = pdk.Layer(type="ScatterplotLayer"
                        ,data=deck_data
                        ,get_position = ["lon","lat"]
                        ,get_radius=10
                        ,get_fill_color= "col"
                        ,pickable=True
                        ,opacity=0.2
                        ,stroked=True
                        ,filled=True
                        ,radius_scale=2
                        ,radius_min_pixels=3
                        ,radius_max_pixels=5
                        ,line_width_min_pixels=1
                        ,get_line_color=[0, 0, 0, 0]
                       )
deck = pdk.Deck(layers=[deck_points],
                      initial_view_state=viewstate,
                      tooltip= {"text": " {band_var} : {value}"},
                      map_style="road")
deck.to_html("../../deck_maps/ghsl_msz_vis.html",iframe_height=700)

Gridding function

# square = snd.square_poly(lat=limits[1],lon=limits[0],distance=grid_param)

# ref_cell = h3.latlng_to_cell(lat=limits[1],lng=limits[0],res=base_res)
# ref_cell_ij = h3.cell_to_local_ij(origin=ref_cell,h=ref_cell)

# cells = h3.geo_to_cells(square,res=base_res)

# cells_ij = [h3.cell_to_local_ij(origin=ref_cell,h=cell) for cell in cells]

# snd.rast_to_h3[str(grid_param)]["nn"] = [(cell_i-ref_cell_ij[0],cell_j-ref_cell_ij[1]) for (cell_i,cell_j) in cells_ij]

# cells_redone = [h3.local_ij_to_cell(origin=ref_cell,i=cells_i+ref_cell_ij[0],j=cells_j+ref_cell_ij[1]) for (cells_i,cells_j) in snd.rast_to_h3[str(grid_param)]["nn"]]

# gpd.GeoSeries([h3.cells_to_h3shape(cells=cells_redone)],crs="epsg:4326").explore()
# limits = [-1.486,51.138,0.352,51.882]

# wide europe window
# -14.06
# 48.34
# 31.95
# 59.27
layer_.rename(columns={"lon":"x","lat":"y"},inplace=True)
layer_.head()
x y band_var col value
0 8.462206 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
1 8.461496 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
2 8.461597 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
3 8.463519 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
4 8.463822 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
# layer_df = layer.execute(limit=100_000)
# base_res = 14

layer_df = snd.df_project_on_grid(layer_,res=base_res) 
layer_df.head()
x y band_var col value h3_id
0 8.462206 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6aa7
1 8.461496 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41805bcaf
2 8.461597 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41805b187
3 8.463519 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183ac0e7
4 8.463822 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183adcdf
layer_df.rename(columns={"band":"band_var"},inplace=True)
print(layer_df.shape)
layer_df.head()
(24332, 6)
x y band_var col value h3_id
0 8.462206 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6aa7
1 8.461496 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41805bcaf
2 8.461597 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41805b187
3 8.463519 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183ac0e7
4 8.463822 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183adcdf
# layer_df["band_var"].astype(copy=)
cat = True

if cat:
    layer_df["band_var"]=layer_df["band_var"].astype("int").astype("str")
neighbs
[(0, 0),
 (2, 1),
 (1, -1),
 (1, 1),
 (0, 2),
 (-1, 0),
 (0, -2),
 (-2, -1),
 (1, 2),
 (2, 0),
 (0, 1),
 (-1, -1),
 (1, -2),
 (3, 1),
 (1, 0),
 (2, 2),
 (-1, -2),
 (0, -1)]
def layer_gridded(layer : pd.DataFrame, neighbs : list[tuple[int]]): 
    """" Take a layer of raster centroids expressed as H3 cells and do the 'gridding'. It means filling in all the gaps respecting the original raster grid size.
    Duplicates are removed, the value from the single cell is uniformly distributed across the set of cells associated to it. 
    
    """
    
    vars_columns = [x for x in layer.columns if re.search(pattern="_var$",string=x)]

    layer["h3_gridded"] = pd.Series(layer.apply(lambda row: snd.centre_cell_to_square(h3_cell = row["h3_id"],neighbs=neighbs),axis=1).tolist())
    layer.dropna(subset=["h3_gridded"],inplace=True)
    gridded_layer = layer.explode("h3_gridded")
    grid_layer_b = gridded_layer.shape[0]
    gridded_layer = gridded_layer[~gridded_layer.duplicated(subset=["h3_gridded"])]
    grid_layer_a = gridded_layer.shape[0]
    
    print("Duplicated ratio : ",1-np.round(grid_layer_a/grid_layer_b,3))
    
    gridded_layer.reset_index(inplace=True,drop=True)
    gridded_layer.set_index("h3_id",inplace=True,drop=True)
    gridded_layer = gridded_layer.join(gridded_layer.value_counts("h3_id"),how="left",sort=False,validate="m:1")

    # since only one layer is being read at the moment, just check if it's string or not. 
    # in the future, separate into categorical and numerical variables
    if len(vars_columns)>1:
        if (str not in gridded_layer[vars_columns].dtypes.values.tolist()):
            gridded_layer[vars_columns] = np.round(gridded_layer[vars_columns].div(gridded_layer["count"],axis=0),3)
    
    # if len(vars_columns)==1:
    #     if gridded_layer[vars_columns].dtype!=str:
    #         gridded_layer[vars_columns] = np.round(gridded_layer[vars_columns].div(gridded_layer["count"],axis=0),3)
    
    gridded_layer.reset_index(inplace=True,drop=False)
    gridded_layer.dropna(axis=0,subset=["h3_gridded"],inplace=True)

    return gridded_layer
layer_grid = layer_gridded(layer=layer_df,neighbs=neighbs)
Duplicated ratio :  0.014000000000000012
print(layer_grid.shape)
layer_grid.head()
(432000, 8)
h3_id x y band_var col value h3_gridded count
0 8e580a4183a6aa7 8.462206 12.017195 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6aa7 18
1 8e580a4183a6aa7 8.462206 12.017195 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6b87 18
2 8e580a4183a6aa7 8.462206 12.017195 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6a37 18
3 8e580a4183a6aa7 8.462206 12.017195 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a6b9f 18
4 8e580a4183a6aa7 8.462206 12.017195 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a4183a614f 18

Mapping gridded

sample_size = np.min([layer_grid.shape[0],30_000])
plotting_layer = (layer_grid
                #   .iloc[0:sample_size]
                  .sample(sample_size))
plotting_layer["color"] = cmap(plotting_layer["band_var"].astype("int",copy=False)
                               ,palette=gdp_pc_pal
                               ,log=False)
Max input : 24.00, palette colors : 10
#####

layer_gridded = pdk.Layer(
    "H3HexagonLayer",
    plotting_layer,
    pickable=True,
    stroked=True,
    filled=True,
    opacity=.5,
    extruded=False,
    # elevation_scale = 1e-4,
    # elevation_range = [0,100],
    get_elevation="band_var",
    get_hexagon= "h3_gridded",
    get_fill_color = "color",
    get_line_color= [0, 0, 0, 100],
    line_width_min_pixels=0,
    line_width_max_pixels=1,
)
# Create mapdeck object
layer_gridded_deck = pdk.Deck(layers=[layer_gridded]
             ,initial_view_state=viewstate
             ,tooltip= {"text": "{band_var} :  {value}"}
             ,map_style="road"
             )

layer_gridded_deck.to_html("../../deck_maps/msz_ghsl_gridded.html",iframe_height=800,open_browser=True)