Data layer

import json
import io
import os
import re
import itertools as iter
import numpy as np

# data
import pandas as pd
import geopandas as gpd
import shapely
import duckdb
import h3

import scalenav.data as snd
import scalenav.scale_nav as sn
from scalenav.plotting import cmap
# from scipy import io as io
# import nctoolkit as nc
# import xarray as xr
# import rioxarray as rx
import glob
import ibis as ib
from ibis import _
import ibis.selectors as s
ib.options.interactive = True

# plots

# from datashader import transfer_functions as tf, reductions as rd
import pypalettes as pypal
import pydeck as pdk
from seaborn import color_palette
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
conn = ib.connect("duckdb://")
conn.raw_sql("install spatial; load spatial")
<duckdb.duckdb.DuckDBPyConnection at 0x30c33fbf0>
%pwd
'/Users/cenv1069/Documents/scale_nav/source/notebook'
processed_data_folder = "../../outputs/"
# trying an interpolation method for the right resolution from a given grid size
dist = 0

alpha = 5/np.log(1_000)

A = 13 + alpha*np.log(10)

A 
alpha
# 13+alpha*np.log(10)
0.7238241365054198

Setup

extra_res = 0
grid_params = []
res_params = []

if dist > 0 : 
    # extra_res=round(A-np.log(alpha*dist))

    grid_params = [10,100,1000,5000,10000,dist]
    grid_params.sort()
    
    # res_params = [13,12,11,10,8,extra_res]
    res_params = [round(A-alpha*np.log(size)) for size in grid_params]
    # res_params.sort(reverse=True)

else:
    grid_params = [10,100,1000,5000,10000]
    res_params = [14,12,11,10,8]

res_params

rast_to_h3 = { str(size): {"h3_res" : res, "nn" : [] } for (size,res) in zip(grid_params,res_params)}
rast_to_h3
{'10': {'h3_res': 14, 'nn': []},
 '100': {'h3_res': 12, 'nn': []},
 '1000': {'h3_res': 11, 'nn': []},
 '5000': {'h3_res': 10, 'nn': []},
 '10000': {'h3_res': 8, 'nn': []}}
grid_param = 10

rast_to_h3 = snd.rast_to_h3_map(x=8,y=12,ref="m",dist=dist)

base_res = rast_to_h3[str(grid_param)]["h3_res"]
neighbs = rast_to_h3[str(grid_param)]["nn"]

print("Using base resolution: ",base_res)
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using angles for meter grid.
Using base resolution:  14
neighbs
[(0, 0),
 (2, 1),
 (1, -1),
 (1, 1),
 (0, 2),
 (-1, 0),
 (0, -2),
 (-2, -1),
 (1, 2),
 (2, 0),
 (0, 1),
 (-1, -1),
 (1, -2),
 (3, 1),
 (1, 0),
 (2, 2),
 (-1, -2),
 (0, -1)]

GHSL MSZ data set

could be moved to another notebook.

# close up london-oxford
# limits = [-1.486,51.138,0.352,51.882]

# wide europe window
# limits = [-14.06,48.34,31.95,59.27]


# Kano Nigeria
limits = [8.46085,11.96741,8.53552,12.01733]

# Lagos
# limits = [3.0624,6.3699,3.5925,6.6592]
# src_file = "outputs/kummu.parquet"
src_file = "new_process.parquet"

layer = conn.read_parquet(f"{processed_data_folder}{src_file}",table_name="layer")
msz_categories = {
    "1" : "Open spaces, low vegetation",
    "2" : "Open spaces, medium vegetation",
    "3" : "Open spaces, high vegetation",
    "4" : "Open spaces, water surfaces",
    "5" : "Open spaces, road surfaces",
    "11" : "Built spaces, residential building height <= 3m",
    "12" : "Built spaces, residential building 3m < height <= 6m",
    "13" : "Built spaces, residential building 6m < height <= 15m",
    "14" : "Built spaces, residential building 15m < height <= 30m",
    "15" : "Built spaces, residential building 30m < height",
    "21" : "Built spaces, non-residential building height <= 3m",
    "22" : "Built spaces, non-residential building 3m < height <= 6m",
    "23" : "Built spaces, non-residential building 6m < height <= 15m",
    "24" : "Built spaces, non-residential building 15m < height <= 30m",
    "25" : "Built spaces, non-residential building 30m < height",
}
print(conn.list_tables())
conn.sql("Select * from layer limit 10")
['layer']
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon        lat        band_var ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32float32  │
├───────────┼───────────┼──────────┤
│ -0.32256316.2438491.0 │
│ -0.32246116.2438491.0 │
│ -0.32235916.2438491.0 │
│ -0.32256316.2437671.0 │
│ -0.32246116.24376711.0 │
│ -0.32235916.24376711.0 │
│ -0.32256316.2436851.0 │
│ -0.32246116.24368511.0 │
│ -0.32235816.24368511.0 │
│ -0.32215316.2427812.0 │
└───────────┴───────────┴──────────┘
# conn.raw_sql("SUMMARIZE layer;")
# layer.describe()

### Fine scale data with CL tool

#  the coarse resolution file
# ! python -m scalenav.rast_converter /Users/cenv1069/Documents/data/datasets/kummu_etal/GDP_PPP_1990_2015_5arcmin_v2.nc outputs/kummu_5arcmin.parquet
# the fine resolution file 
# At the moment fails
# ! python -m scalenav.rast_converter /Users/cenv1069/Documents/data/datasets/kummu_etal/GDP_PPP_30arcsec_v3.nc outputs/kummu_30arcsec.parquet
layer_ = (
    layer
#     filter region
    .filter(_.lon>limits[0]
            ,_.lon<limits[2]
            ,_.lat>limits[1]
            ,_.lat<limits[3])
#     filter something on the band_var
    .filter(_.band_var > 5)
    .sample(.1)
    .execute(
        limit=30_000
        )
)
layer_.head()
lon lat band_var
0 8.464635 12.017276 13.0
1 8.466049 12.017195 13.0
2 8.466150 12.017195 13.0
3 8.466960 12.017195 13.0
4 8.462103 12.017113 13.0
# # sample_data = "data/sample_data_kummu.parquet"

# # sample_data = "data/sample_data_ghsl_msz.parquet"

# if not os.path.exists(sample_data):
#     layer.to_parquet(sample_data)
layer_.shape
(24308, 3)
# layer_.count()

Deck map

cat_palette = "Classic_10"
num_palette = "Burg"

gdp_pc_pal = pypal.load_cmap(cat_palette,reverse=True)
deck_data = layer_#.execute(limit=500_000)
deck_data.head()
lon lat band_var
0 8.464635 12.017276 13.0
1 8.466049 12.017195 13.0
2 8.466150 12.017195 13.0
3 8.466960 12.017195 13.0
4 8.462103 12.017113 13.0
deck_data["col"] = cmap(deck_data.band_var,palette=gdp_pc_pal,log=False)
deck_data["value"] = deck_data["band_var"].apply(lambda x: msz_categories[str(int(x))])
Max input : 24.00, palette colors : 10
print(deck_data.shape)
deck_data.head()
(24308, 5)
lon lat band_var col value
0 8.464635 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
1 8.466049 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
2 8.466150 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
3 8.466960 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
4 8.462103 12.017113 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
viewstate = pdk.data_utils.compute_view(deck_data[["lon","lat"]])
deck_points = pdk.Layer(type="ScatterplotLayer"
                        ,data=deck_data
                        ,get_position = ["lon","lat"]
                        ,get_radius=10
                        ,get_fill_color= "col"
                        ,pickable=True
                        ,opacity=0.2
                        ,stroked=True
                        ,filled=True
                        ,radius_scale=2
                        ,radius_min_pixels=3
                        ,radius_max_pixels=5
                        ,line_width_min_pixels=1
                        ,get_line_color=[0, 0, 0, 0]
                       )
deck = pdk.Deck(layers=[deck_points],
                      initial_view_state=viewstate,
                      tooltip= {"text": " {band_var} : {value}"},
                      map_style="road")
deck.to_html("../../deck_maps/ghsl_msz_vis.html",iframe_height=700)

Gridding function

# square = snd.square_poly(lat=limits[1],lon=limits[0],distance=grid_param)

# ref_cell = h3.latlng_to_cell(lat=limits[1],lng=limits[0],res=base_res)
# ref_cell_ij = h3.cell_to_local_ij(origin=ref_cell,h=ref_cell)

# cells = h3.geo_to_cells(square,res=base_res)

# cells_ij = [h3.cell_to_local_ij(origin=ref_cell,h=cell) for cell in cells]

# snd.rast_to_h3[str(grid_param)]["nn"] = [(cell_i-ref_cell_ij[0],cell_j-ref_cell_ij[1]) for (cell_i,cell_j) in cells_ij]

# cells_redone = [h3.local_ij_to_cell(origin=ref_cell,i=cells_i+ref_cell_ij[0],j=cells_j+ref_cell_ij[1]) for (cells_i,cells_j) in snd.rast_to_h3[str(grid_param)]["nn"]]

# gpd.GeoSeries([h3.cells_to_h3shape(cells=cells_redone)],crs="epsg:4326").explore()
# limits = [-1.486,51.138,0.352,51.882]

# wide europe window
# -14.06
# 48.34
# 31.95
# 59.27
layer_.rename(columns={"lon":"x","lat":"y"},inplace=True)
layer_.head()
x y band_var col value
0 8.464938 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
1 8.466557 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
2 8.466658 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
3 8.467063 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
4 8.466657 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height...
# layer_df = layer.execute(limit=100_000)
# base_res = 14

layer_df = snd.df_project_on_grid(layer_,res=base_res) 
layer_df.head()
x y band_var col value h3_id
0 8.464938 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bbaf
1 8.466557 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822cd5f
2 8.466658 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822c8d7
3 8.467063 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822dc6f
4 8.466657 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822c037
layer_df.rename(columns={"band":"band_var"},inplace=True)
print(layer_df.shape)
layer_df.head()
(24217, 6)
x y band_var col value h3_id
0 8.464938 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bbaf
1 8.466557 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822cd5f
2 8.466658 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822c8d7
3 8.467063 12.017276 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822dc6f
4 8.466657 12.017195 13.0 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41822c037
# layer_df["band_var"].astype(copy=)
cat = True

if cat:
    layer_df["band_var"]=layer_df["band_var"].astype("int").astype("str")
neighbs
[(0, 0),
 (2, 1),
 (1, -1),
 (1, 1),
 (0, 2),
 (-1, 0),
 (0, -2),
 (-2, -1),
 (1, 2),
 (2, 0),
 (0, 1),
 (-1, -1),
 (1, -2),
 (3, 1),
 (1, 0),
 (2, 2),
 (-1, -2),
 (0, -1)]
def layer_gridded(layer : pd.DataFrame, neighbs : list[tuple[int]]): 
    """" Take a layer of raster centroids expressed as H3 cells and do the 'gridding'. It means filling in all the gaps respecting the original raster grid size.
    Duplicates are removed, the value from the single cell is uniformly distributed across the set of cells associated to it. 
    
    """
    
    vars_columns = [x for x in layer.columns if re.search(pattern="_var$",string=x)]

    layer["h3_gridded"] = pd.Series(layer.apply(lambda row: snd.centre_cell_to_square(h3_cell = row["h3_id"],neighbs=neighbs),axis=1).tolist())
    layer.dropna(subset=["h3_gridded"],inplace=True)
    gridded_layer = layer.explode("h3_gridded")
    grid_layer_b = gridded_layer.shape[0]
    gridded_layer = gridded_layer[~gridded_layer.duplicated(subset=["h3_gridded"])]
    grid_layer_a = gridded_layer.shape[0]
    
    print("Duplicated ratio : ",1-np.round(grid_layer_a/grid_layer_b,3))
    
    gridded_layer.reset_index(inplace=True,drop=True)
    gridded_layer.set_index("h3_id",inplace=True,drop=True)
    gridded_layer = gridded_layer.join(gridded_layer.value_counts("h3_id"),how="left",sort=False,validate="m:1")

    # since only one layer is being read at the moment, just check if it's string or not. 
    # in the future, separate into categorical and numerical variables
    if len(vars_columns)>1:
        if (str not in gridded_layer[vars_columns].dtypes.values.tolist()):
            gridded_layer[vars_columns] = np.round(gridded_layer[vars_columns].div(gridded_layer["count"],axis=0),3)
    
    # if len(vars_columns)==1:
    #     if gridded_layer[vars_columns].dtype!=str:
    #         gridded_layer[vars_columns] = np.round(gridded_layer[vars_columns].div(gridded_layer["count"],axis=0),3)
    
    gridded_layer.reset_index(inplace=True,drop=False)
    gridded_layer.dropna(axis=0,subset=["h3_gridded"],inplace=True)

    return gridded_layer
layer_grid = layer_gridded(layer=layer_df,neighbs=neighbs)
Duplicated ratio :  0.013000000000000012
print(layer_grid.shape)
layer_grid.head()
(430149, 8)
h3_id x y band_var col value h3_gridded count
0 8e580a41831bbaf 8.464938 12.017276 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bbaf 18
1 8e580a41831bbaf 8.464938 12.017276 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831b84f 18
2 8e580a41831bbaf 8.464938 12.017276 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bb07 18
3 8e580a41831bbaf 8.464938 12.017276 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bba7 18
4 8e580a41831bbaf 8.464938 12.017276 13 [148, 103, 189, 255] Built spaces, residential building 6m < height... 8e580a41831bb97 18

Mapping gridded

sample_size = np.min([layer_grid.shape[0],30_000])
plotting_layer = (layer_grid
                #   .iloc[0:sample_size]
                  .sample(sample_size))
plotting_layer["color"] = cmap(plotting_layer["band_var"].astype("int",copy=False)
                               ,palette=gdp_pc_pal
                               ,log=False)
Max input : 24.00, palette colors : 10
#####

layer_gridded = pdk.Layer(
    "H3HexagonLayer",
    plotting_layer,
    pickable=True,
    stroked=True,
    filled=True,
    opacity=.5,
    extruded=False,
    # elevation_scale = 1e-4,
    # elevation_range = [0,100],
    get_elevation="band_var",
    get_hexagon= "h3_gridded",
    get_fill_color = "color",
    get_line_color= [0, 0, 0, 100],
    line_width_min_pixels=0,
    line_width_max_pixels=1,
)
# Create mapdeck object
layer_gridded_deck = pdk.Deck(layers=[layer_gridded]
             ,initial_view_state=viewstate
             ,tooltip= {"text": "{band_var} :  {value}"}
             ,map_style="road"
             )

layer_gridded_deck.to_html("../../deck_maps/msz_ghsl_gridded.html",iframe_height=800,open_browser=True)