Constraining layers

Among the layers of data available, some have high granularity and will be used as a rescaling variables to distribute values that would otherwise be uniform in space at a coarser resolution. We to link certain economic activity related variables to some infrastructural and geographic layers, and assuming some form of homogeneity, distribute the coarse resolutions layers proportionally to the the distribution, say, of infrastrucutre in a location. This notebook will lay some basis for this and show the scalenav functionalities developped for that purpose.

import numpy as np

# scalenav modules
import scalenav.scale_nav as sd
import scalenav.data as dt
from scalenav.plotting import cmap
import scalenav.oop as snoo

# data engineering
import pandas as pd
import h3
import ibis as ib
from ibis import _
ib.options.interactive = True

### Visualisation
import pydeck as pdk
import pypalettes as pypal

H3 + DuckDB scale up

Connect to a temporary duckdb database in which we will be running most of the computations. The output of the sn_connect function is a connection object to the duckdb instance with the necessary dependencies installed and ready to use.

# For a detailed explanation of what goes on here, refer to the oop guide. 
conn = snoo.sn_connect()
Connecting to a temporary in-memory DB instance.

H3 and grid parameters

Reading data layers

To read layers into our newly created database, we call the sn_table function, which will receive as parameter the connection object representing the database into which the table should be loaded, the name that we want to assign to the table in the backend and the path ro find the data. It will return an ibis.Table object.

dose_wdi_geo_file = "/Users/cenv1069/Documents/data/datasets/local_data/dose-wdi/0_3/dose_wdi_geo.parquet"
mapspam_file = "/Users/cenv1069/Documents/data/datasets/mapspam/processed/spam_2020_yield_v2.parquet"
dwg = snoo.sn_table(conn=conn,name="dwg",path=dose_wdi_geo_file)
spam = snoo.sn_table(conn,name="spam",path=mapspam_file)
# Have a look at the data:
dwg.head()
┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ gid_0   country  gid_1    grp_usd_2015  services_usd_2015  manufacturing_usd_2015  agriculture_usd_2015  centr                                                            geometry                                                                                                                                                                                                                                     color              radius     x          y         ┃
┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩
│ stringstringstringfloat64float64float64float64binarybinaryarray<int64>float64float64float64   │
├────────┼─────────┼─────────┼──────────────┼───────────────────┼────────────────────────┼──────────────────────┼─────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────────┼───────────┼───────────┼───────────┤
│ ALB   AlbaniaALB.1_14.341915e+082.009631e+089.449606e+078.588401e+07b'\x01\x01\x00\x00\x00@\xa0\x88\x02=\x174@\x9d\xa3\xc5\xb3rPD@'b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xbe\x02\x00\x00\x07h\x11@ a4@\xd3\xa9\x02 \xb47D@\xcd\x15\x10 d`4@\xc4G\xfe?\xd27D@\xa0*\xf0_j_4@\xd8\x98\xf8\x1f\xf07D@$\xb3\xfe\xbf\x7f^4@.:\xf6?\x028D@\x88\xce\xf8'+11165[238, 97, ... +2]19.88899620.09077540.628500 │
│ ALB   AlbaniaALB.2_13.719427e+081.721516e+088.094841e+077.357106e+07b'\x01\x01\x00\x00\x00\xe6M\xcan\xe1?4@\x87\xf1\xed\xcdg\xcbD@'b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xbb\x02\x00\x00\xa0\x16\x07\xc0\rl4@\xf5\x0e\xf9_\x15\xa4D@\xdfC\x15@)l4@u\xe3\xfe\x9f\x1a\xa4D@\x1f\x99\xf5\xff\xfdk4@:\xa9\xf5\x9f1\xa4D@\xb5O\t\x80\xd8k4@\xab\xc4\x02\x80_\xa4D@\x9fY\x12'+11117[238, 97, ... +2]19.73425020.24953441.589105 │
│ ALB   AlbaniaALB.3_11.113524e+095.153885e+082.423439e+082.202575e+08b'\x01\x01\x00\x00\x00\xb5?\xc3\x1e`\xa13@\x14\x15\xcd2B\xb9D@'b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x8f\x0c\x00\x00\xb3@\xfd_\\\xa33@9\xad\x04 \xa8\xabD@\x85\x1a\xf0\xbf\xb6\xa23@\x13U\t`\x14\xabD@~r\x14 \n\xa33@+\xf0\xfc?\xda\xaaD@\xa8O\xf6\xbf\xf1\xa33@\xd1\x9a\xf6\xff7\xaaD@\xc0@\x10'+51373[238, 97, ... +2]20.83079619.63037341.447333 │
│ ALB   AlbaniaALB.4_17.954835e+083.681851e+081.731265e+081.573483e+08b'\x01\x01\x00\x00\x00\x17\xb66{\xbb/4@m\xda[\xa0\x1f\x85D@'b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xc5\x02\x00\x00\xe3\xe0\xf3?\xd0o4@\xe3\xa1\xf7\xdf\xadlD@ti\x15`\xcco4@\xb0\xb3\xf8\x7f\x9blD@A\x93\x0e \x8dp4@\x1f\x03\x05 flD@\xd8E\x13 \xf1p4@\xd5\x0f\x03\xc0\xa5kD@\xa6,\x01'+11277[238, 97, ... +2]20.49446120.18645441.040028 │
│ ALB   AlbaniaALB.5_11.345159e+096.225993e+082.927561e+082.660753e+08b"\x01\x01\x00\x00\x00\xee\x16g\xe7\xda\x9e3@Y\x90\xfc/'cD@"b'\x01\x03\x00\x00\x00\x01\x00\x00\x00,\n\x00\x00x\x06\xf4_s\xbe3@\xfc\x0c\xfb\x7f\xcd9D@+\xb1\x00\xe0\xb7\xbe3@\x8c\xc1\xfd_y:D@\xf2.\x0f\x80U\xbe3@\x159\x02\xc0\xca:D@\xdf\xed\xee\xbf\x9d\xbd3@\xf6\xb7\x04\xe0\x1f;D@,B\x14'+41597[238, 97, ... +2]21.01977819.62052840.774633 │
└────────┴─────────┴─────────┴──────────────┴───────────────────┴────────────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────────┴───────────┴───────────┴───────────┘
spam.head()
┏━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ lon         lat        band_var    ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ float32float32float32     │
├────────────┼───────────┼─────────────┤
│ 120.04166449.375000857.500000 │
│ 119.79166449.2916681027.000000 │
│ 119.87500049.2916681050.800049 │
│ 120.04166449.2916681065.599976 │
│ 120.12500049.2916681005.900024 │
└────────────┴───────────┴─────────────┘
# have a look at the table in the backend : 
conn.list_tables()
['dwg', 'spam']

Data

# known grid parameter of the data.
grid_param = 10_000

Some typical recommended H3 resolution values for projecting raster grids based on the size were precomputed and provided in a dictionary :

h3_res = dt.rast_to_h3[str(grid_param)]["h3_res"]
print(h3_res)
8

Then apply the sn_project function, it will try to automatically detect the coordinates columns, but they can be given as parameter in case it fails.

print("Using base H3 resolution : ", h3_res)
spam_h3 = snoo.sn_project(spam,res=h3_res)
Using base H3 resolution :  8
Assuming coordinates columns ('lon','lat')

This will result in a new table expression containing the newly created h3_id column.

snoo.sn_head(spam_h3)

┌──────────┐
│ 19459073 │
└──────────┘

┏━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ lon         lat        band_var     h3_id           ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ float32float32float32string          │
├────────────┼───────────┼─────────────┼─────────────────┤
│ 120.04166449.375000857.50000088159c7aa3fffff │
│ 119.79166449.2916681027.00000088159c4e03fffff │
│ 119.87500049.2916681050.80004988159c4ea3fffff │
│ 120.04166449.2916681065.59997688159c793bfffff │
│ 120.12500049.2916681005.90002488159c6b4bfffff │
└────────────┴───────────┴─────────────┴─────────────────┘
# check the ids
spam_h3.h3_id.nunique()

┌────────┐
│ 514471 │
└────────┘
spam_h3 = spam_h3.group_by("h3_id").agg(band=_.band_var.sum(),
                                        x=_.lon.first(),
                                        y=_.lat.first(),)
spam_h3.count()

┌────────┐
│ 514471 │
└────────┘
conn.list_tables()
['dwg', 'spam']

One very important aspect of this workflow is that apart from the original data sets that we read into the database, no other data frames exist in the environment. The table expressions that ibis creates in the background and shows us in python are unrealised SQL queries. They only compute the minimum required for us to work with in the environment, say when we call the sn_head(n) function, it only needs to know the first n elements and will not bother computing all of them just to show us how the head of the table looks like. In SQL language, the table expressions correspond to views. The table expressions that we end up working with on the python side a just unrealised queries.

Let us now have a look at the data, for that we execute the table expression and set a limit to keep things smaller. This will result in a traditional pandas.DataFrame.

spam_h3_df = spam_h3.execute(limit=10_000)
spam_h3_df.head()
h3_id band x y
0 886b690a39fffff 5359.599915 34.291668 15.291667
1 88521b18c9fffff 56822.800415 43.791668 15.291667
2 8852e6d965fffff 57313.799438 44.375000 15.291667
3 8852e6c73bfffff 54263.199533 44.708332 15.291667
4 88525228b3fffff 58432.401428 47.958332 15.291667

Coordinates to plot only points

# h3_layer_ghsl_h3 = pdk.Layer(
#     "H3HexagonLayer",
#     spam_h3_df,
#     pickable=True,
#     stroked=True,
#     filled=True,
#     opacity=1,
#     extruded=False,
#     get_hexagon= "h3_id",
#     get_fill_color = [200,200,100,255],
#     get_line_color=[0, 0, 0, 100],
#     line_width_min_pixels=0,
#     line_width_max_pixels=1,
# )


points_layer = pdk.Layer(
    "ScatterplotLayer",
    spam_h3_df,
    pickable=False,
    opacity=0.8,
    stroked=False,
    filled=True,
    radius_scale=1,
    radius_min_pixels=1,
    radius_max_pixels=20,
    line_width_min_pixels=1,
    get_position = ["x","y"],
    get_radius=50,
    get_fill_color=[255, 140, 0, 255],
    get_line_color=[0, 0, 0, 0],
)
# view_state_global = pdk.data_utils.compute_view(spam_h3_df[["x","y"]])

# # Create mapdeck object
# deck_map_ghsl_h3 = pdk.Deck(layers=[
#     points_layer,
#     ]
#              ,initial_view_state=view_state_global
#              ,tooltip= {"text": "Value :  {band}"}
#              ,map_style="road"
#              )

# deck_map_ghsl_h3.to_html("../../deck_maps/deck_ghsl_h3_grid.html",
#                          iframe_height=800,
#                          )

Selecting an area

Let’s now restrict our area of interest :

dwg.filter(_.country.re_search("Nigeria")).select("gid_0","country")
┏━━━━━━━━┳━━━━━━━━━┓
┃ gid_0   country ┃
┡━━━━━━━━╇━━━━━━━━━┩
│ stringstring  │
├────────┼─────────┤
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│ NGA   Nigeria │
│        │
└────────┴─────────┘
selected_country = "NGA"

Rescaling approach to econ data

We will now restrict the area of interest and combine the layers at hand, namely the economic output per regions and the mapspam agricultural yield, into one table. For that, we need to do a bit of sql and create the corresponding table expression.

country = conn.sql(f"""
SELECT sel_country.* EXCLUDE (color,centr,geom,radius,x,y,grp_usd_2015,services_usd_2015,manufacturing_usd_2015),
         spam.*
         FROM spam AS spam
         JOIN (SELECT * EXCLUDE geometry, ST_GeomFromWKB(geometry) as geom FROM dwg where gid_0='{selected_country}') AS sel_country
         ON ST_Intersects(ST_Point(spam.lon,spam.lat),sel_country.geom);
""")
print(country.count())
country.head(3)

┌────────┐
│ 292275 │
└────────┘
┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ gid_0   country  gid_1     agriculture_usd_2015  lon       lat        band_var   ┃
┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ stringstringstringfloat64float32float32float32    │
├────────┼─────────┼──────────┼──────────────────────┼──────────┼───────────┼────────────┤
│ NGA   NigeriaNGA.19_14.318394e+097.45833311.208333269.200012 │
│ NGA   NigeriaNGA.19_14.318394e+097.54166711.208333269.100006 │
│ NGA   NigeriaNGA.19_14.318394e+097.62500011.208333269.200012 │
└────────┴─────────┴──────────┴──────────────────────┴──────────┴───────────┴────────────┘
country = snoo.sn_project(country,h3_res)
Assuming coordinates columns ('lon','lat')

The previous cell runs for a long time compared due to the fact that it has to actually perform the relatively compelex query in order to give us the head of the data. Generally, we will avoid calling this kind of functions in the middle of a workflow. If we want the results of a query to be available right away, we can either ‘cache’ the output, or promote the table expression to an actual backend table.

Summaries

Let’s compute some simple summary statistics on our data, for example the yield per region :

gid_1_band = country.group_by("gid_1").agg(total_band=_.band_var.sum())
gid_1_band
┏━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃ gid_1     total_band   ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│ stringfloat64      │
├──────────┼──────────────┤
│ NGA.31_15.000652e+07 │
│ NGA.12_14.038842e+07 │
│ NGA.19_17.312160e+07 │
│ NGA.23_16.812528e+07 │
│ NGA.13_11.131993e+07 │
│ NGA.37_14.579285e+07 │
│ NGA.16_13.300971e+07 │
│ NGA.30_12.064093e+07 │
│ NGA.9_1 4.657446e+07 │
│ NGA.20_14.123441e+07 │
│  │
└──────────┴──────────────┘

Now that we have restricted our analysis to a smaller region of interest and have significantly decreased the size of the data that we work with, we can switch to python for the rest fo the analysis. Let us export the data for the selected country into a pandas dataFrame. But before, we will perform the rescaling operation with our regional outputs.

country_gdf = (country
           .join(gid_1_band,"gid_1",how="left")
           .mutate(band_frac=_.band_var/_.total_band)
            # .mutate(h3_id_gdp=_.band_frac*_.build_gdp)
           ).to_pandas()

The previous step has added a variable band_frac which corresponds to the fractions of the agricultural yield that each h3 cell contributes to the regional total. In other words we have rescaled the yield variable to be a density other the the raster (or h3) cells whithin each regions.

print(country_gdf.shape)
country_gdf.head()
(292275, 11)
gid_0 country gid_1 agriculture_usd_2015 lon lat band_var h3_id gid_1_right total_band band_frac
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177
1 NGA Nigeria NGA.37_1 1.815483e+09 6.708330 12.791668 8120.799805 88580348abfffff NGA.37_1 4.579285e+07 0.000177
2 NGA Nigeria NGA.37_1 1.815483e+09 6.791664 12.791668 8120.799805 8858034d1bfffff NGA.37_1 4.579285e+07 0.000177
3 NGA Nigeria NGA.18_1 2.143236e+09 8.458330 12.791668 8120.799805 88580a9a1dfffff NGA.18_1 3.557801e+07 0.000228
4 NGA Nigeria NGA.18_1 2.143236e+09 8.541664 12.791668 8120.799805 88580a91abfffff NGA.18_1 3.557801e+07 0.000228

Check that things add up

The previous rescaling operations has resulted in density distributions whithin each region. If we add up the density values for each region, we should get 1. This density has the same behaviour as a standard probability distribution.

country_gdf[["gid_1","band_frac"]].groupby("gid_1").agg("sum")
band_frac
gid_1
NGA.10_1 1.0
NGA.11_1 1.0
NGA.12_1 1.0
NGA.13_1 1.0
NGA.15_1 1.0
NGA.16_1 1.0
NGA.18_1 1.0
NGA.19_1 1.0
NGA.20_1 1.0
NGA.23_1 1.0
NGA.27_1 1.0
NGA.28_1 1.0
NGA.29_1 1.0
NGA.30_1 1.0
NGA.31_1 1.0
NGA.33_1 1.0
NGA.37_1 1.0
NGA.3_1 1.0
NGA.4_1 1.0
NGA.5_1 1.0
NGA.6_1 1.0
NGA.9_1 1.0

Mapping

Let’s have a look at this. To help us, the cmap function from the scalenav plotting module will transform the values into RGB code that pydeck will read :

col_pal = "viridis"
col_pal = pypal.load_cmap(col_pal)
cols =  cmap((country_gdf.band_var),palette=col_pal,log=True) #pd.Series([[255*j for j in x] for x in bmlunge(compact_geo_downscaled["log_value"])])
country_gdf["color"] = cols
country_gdf.head(5)
Max input : 11.06, palette colors : 10
gid_0 country gid_1 agriculture_usd_2015 lon lat band_var h3_id gid_1_right total_band band_frac color
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255]
1 NGA Nigeria NGA.37_1 1.815483e+09 6.708330 12.791668 8120.799805 88580348abfffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255]
2 NGA Nigeria NGA.37_1 1.815483e+09 6.791664 12.791668 8120.799805 8858034d1bfffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255]
3 NGA Nigeria NGA.18_1 2.143236e+09 8.458330 12.791668 8120.799805 88580a9a1dfffff NGA.18_1 3.557801e+07 0.000228 [134, 213, 73, 255]
4 NGA Nigeria NGA.18_1 2.143236e+09 8.541664 12.791668 8120.799805 88580a91abfffff NGA.18_1 3.557801e+07 0.000228 [134, 213, 73, 255]
view_state = pdk.data_utils.compute_view(points=country_gdf[["lon","lat"]])
# # this map is not really usefull at high resolutions

# h3_layer_down = pdk.Layer(
#     "H3HexagonLayer",
#     country_gdf[["gid_0","gid_1","h3_id","total_band","color"]],
#     pickable=True,
#     stroked=True,
#     filled=True,
#     opacity=.8,
#     extruded=False,
#     get_hexagon= "h3_id",
#     get_fill_color = "color",
#     get_line_color= [0, 0, 0, 100],
#     line_width_min_pixels=0,
#     line_width_max_pixels=1,
# )


layer_down = pdk.Layer(
    "ScatterplotLayer",
    country_gdf[["lon","lat"]],
    pickable=False,
    opacity=0.4,
    stroked=False,
    filled=True,
    radius_scale=6,
    radius_min_pixels=1,
    radius_max_pixels=10,
    line_width_min_pixels=1,
    get_position = ["lon","lat"],
    get_radius=50,
    get_fill_color=[255, 140, 0, 255],
    get_line_color=[0, 0, 0, 0],
)
# # Create mapdeck object
# deck_map_down = pdk.Deck(layers=[
#     # h3_layer_down,
#     layer_down
#     ]
#              ,initial_view_state=view_state
#             #  ,tooltip= {"text": "Value :  {h3_id_gdp}"}
#              ,height=800
#              ,map_style="road"
#              )

# deck_map_down.to_html("../../deck_maps/deck_ghsl_h3_down_grid.html")

Gridding

At this point, we come to a limitation. The developed process to convert raster grids into their centroids, and then projecting the centroids into h3 only gives the index values for the centroids. Ideally, we want to have an H3 grid covering the area that was covered by the rater in which the cells have an assigned value from the original. For this, a process referred to as gridding was implemented. It consists in filling the empty space between cell centroids with other cells, and then redistributing the variable on them based on the proximity to the centroid.

The following function assists us with that, it computes the neighbourhood (set of h3 neighbouring cells) needed to fill the space around a given centre cell, covering an area roughly equivalent to the original raster grid cell.

rast_to_h3 = dt.rast_to_h3_map(x=country_gdf["lon"].mean(),y = country_gdf["lat"].mean())
Using angles for meter grid.
# ideally, this is precomputed, but for some reason it has not worked...
neighbs = rast_to_h3[str(grid_param)]["nn"]

country_gdf["h3_gridded"] = pd.Series(country_gdf.apply(lambda row: dt.centre_cell_to_square(h3_cell = row["h3_id"],neighbs=neighbs),axis=1).tolist())
country_gdf.dropna(subset=["h3_gridded"],inplace=True)

As a result of this process, a new variable is added to the dataframe containing a list of cells that are asigned to the centroid. Similarly to a voronoi process.

country_gdf.head()
gid_0 country gid_1 agriculture_usd_2015 lon lat band_var h3_id gid_1_right total_band band_frac color h3_gridded
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] [8858034839fffff, 8858034911fffff, 88581c86c1f...
1 NGA Nigeria NGA.37_1 1.815483e+09 6.708330 12.791668 8120.799805 88580348abfffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] [8858034d4dfffff, 88580348b5fffff, 88581cb265f...
2 NGA Nigeria NGA.37_1 1.815483e+09 6.791664 12.791668 8120.799805 8858034d1bfffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] [8858034cedfffff, 8858034dc5fffff, 88581cb2d5f...
3 NGA Nigeria NGA.18_1 2.143236e+09 8.458330 12.791668 8120.799805 88580a9a1dfffff NGA.18_1 3.557801e+07 0.000228 [134, 213, 73, 255] [88580a9131fffff, 88580a9a17fffff, 88580a9847f...
4 NGA Nigeria NGA.18_1 2.143236e+09 8.541664 12.791668 8120.799805 88580a91abfffff NGA.18_1 3.557801e+07 0.000228 [134, 213, 73, 255] [88580a824dfffff, 88580a91b5fffff, 88580a8365f...

Gridded data set

TO get the full data, we ‘explode’ the variable :

spam_gridded = country_gdf.explode("h3_gridded")
print(spam_gridded.shape)
spam_gridded.head(3)
(44133525, 13)
gid_0 country gid_1 agriculture_usd_2015 lon lat band_var h3_id gid_1_right total_band band_frac color h3_gridded
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034839fffff
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034911fffff
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 88581c86c1fffff

We further remove the potential duplicates that could have been created :

spam_gridded = spam_gridded[~spam_gridded.duplicated(subset=["h3_gridded"])]
print(spam_gridded.shape)
spam_gridded.reset_index(inplace=True,drop=True)
(569505, 13)
spam_gridded.head(3)
gid_0 country gid_1 agriculture_usd_2015 lon lat band_var h3_id gid_1_right total_band band_frac color h3_gridded
0 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034839fffff
1 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034911fffff
2 NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 8858034957fffff NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 88581c86c1fffff
spam_gridded.set_index("h3_id",inplace=True,drop=True)

We count the number of cells that were added to each centroid cell:

spam_gridded["count"] = spam_gridded.groupby("h3_id")["gid_1"].transform("count")

Rescale the values to distribute them across the cells :

# ghsl_gridded["h3_id_gdp_gridded"] = np.round(ghsl_gridded["h3_id_gdp"]/ghsl_gridded["count"])
spam_gridded["h3_id_band_gridded"] = np.round(spam_gridded["band_var"]/spam_gridded["count"],2)
spam_gridded.reset_index(inplace=True,drop=False)
print(spam_gridded.shape)
spam_gridded.head()
(569505, 15)
h3_id gid_0 country gid_1 agriculture_usd_2015 lon lat band_var gid_1_right total_band band_frac color h3_gridded count h3_id_band_gridded
0 8858034957fffff NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034839fffff 151 53.78
1 8858034957fffff NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 8858034911fffff 151 53.78
2 8858034957fffff NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 88581c86c1fffff 151 53.78
3 8858034957fffff NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 885803482dfffff 151 53.78
4 8858034957fffff NGA Nigeria NGA.37_1 1.815483e+09 6.624997 12.791668 8120.799805 NGA.37_1 4.579285e+07 0.000177 [134, 213, 73, 255] 885803480bfffff 151 53.78
# view_state = pdk.data_utils.compute_view(points=ghsl_gridded.sample(10000)["h3_gridded"].apply(lambda cell: h3.cell_to_latlng(cell)[::-1]))

# lagos: latitude = 6.450151,longitude = 3.531193 !!!! DOES NOT WORK BECAUSE IT IS EXCLUDED FROM DOSE!!!
# Abidjan :  latitude = 5.367, longitude = -4.009
# Kano : latitude = 11.985, longitude = 8.533

zoom_lat=11.985
zoom_lon=8.533
zoom_width=70_000

view_state_zoomed = pdk.ViewState(latitude = zoom_lat,longitude = zoom_lon,zoom=11,pitch=30,bearing=0) 
# extracting data for the plotting region : 
plotting_region = dt.square_poly(lat=zoom_lat,lon=zoom_lon,distance=zoom_width)
limits = [np.round(x,3) for x in plotting_region.bounds]
limits
[8.212, 11.664, 8.854, 12.306]

Another beauty of ibis is that we can run sql on top of tables inside out python environment :

# ghsl_gridded
spam_gridded_zoom = conn.sql(
f"""
Select * from 
    (select *, 
    array_extract(h3_cell_to_latlng(h3_gridded),2) as x_h3, 
    array_extract(h3_cell_to_latlng(h3_gridded),1) as y_h3
    from spam_gridded)
where (lon <= {limits[2]}) and (lon >= {limits[0]}) and (lat <= {limits[3]}) and (lat >= {limits[1]});
""").execute()
print(spam_gridded_zoom.shape)
spam_gridded_zoom.head()
(6509, 17)
h3_id gid_0 country gid_1 agriculture_usd_2015 lon lat band_var gid_1_right total_band band_frac color h3_gridded count h3_id_band_gridded x_h3 y_h3
0 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [82, 197, 105, 255] 88580ac8b9fffff 120 40.95 7.870442 12.315062
1 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [82, 197, 105, 255] 88580ac991fffff 120 40.95 7.884474 12.267223
2 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [82, 197, 105, 255] 88580a5341fffff 120 40.95 7.911435 12.251724
3 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [82, 197, 105, 255] 88580ac8adfffff 120 40.95 7.912636 12.273132
4 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [82, 197, 105, 255] 88580ac88bfffff 120 40.95 7.845809 12.322583
spam_gridded_zoom["color"] = cmap(spam_gridded_zoom["h3_id_band_gridded"],palette=col_pal)
Max input : 45.50, palette colors : 10
h3_layer_ghsl_h3_gridded = pdk.Layer(
    "H3HexagonLayer",
    # ghsl_gridded.loc[0:200_000],
    spam_gridded_zoom,
    pickable=True,
    stroked=True,
    filled=True,
    opacity=.5,
    extruded=False,
    get_hexagon= "h3_gridded",
    get_fill_color = "color",
    get_line_color= [0, 0, 0, 100],
    line_width_min_pixels=1,
    line_width_max_pixels=1,
)


layer_down = pdk.Layer(
    "ScatterplotLayer",
    country_gdf[["lon","lat"]],
    pickable=False,
    opacity=0.8,
    stroked=False,
    filled=True,
    radius_scale=10,
    radius_min_pixels=1,
    radius_max_pixels=20,
    line_width_min_pixels=0,
    line_width_max_pixels=0,
    get_position = ["lon","lat"],
    get_radius=25,
    get_fill_color=[255, 140, 0, 255],
    get_line_color=[0, 0, 0, 0],
)

And we can see the result of gridding overlayed with the original centroids of cells. Note that the value of the yield from a raster grid centroid is distributed across all the cells covering the area nearest to it.

# # Create mapdeck object

# deck_map_ghsl_h3_gridded = pdk.Deck(layers=[h3_layer_ghsl_h3_gridded, 
#                                             layer_down
#                                             ]
#              ,initial_view_state=view_state_zoomed
#              ,tooltip= {"text": "Value :  {h3_id_band_gridded}"}
#              ,height=800
#              ,map_style="road"
#              )

# deck_map_ghsl_h3_gridded.to_html("../../deck_maps/deck_ghsl_nres_h3_gridded.html",
#                                 #  open_browser=True,
#                                  iframe_height=800
#                                  )
# spam_rescaled = spam_gridded_zoom.rename(columns={"h3_id_gdp_gridded" : "h3_id_gdp_gridded_var",
#                                     "h3_id_band_gridded" : "h3_id_band_gridded_var",
#                                     "h3_id" : "h3_gridded_parent"
#                                     },inplace=False)
# spam_rescaled.rename(columns={"h3_gridded" : "h3_id"
#                              },inplace=True)
# spam_rescaled.head()
# h3_gridded_new_scale = sd.change_res(spam_rescaled,level=-1)
# print(h3_gridded_new_scale.shape)
# h3_gridded_new_scale.head()
# h3_gridded_new_scale["color"] = cmap((h3_gridded_new_scale.h3_id_band_gridded_var),palette=col_pal)
# #####

# h3_layer_ghsl_h3_gridded_new_scale = pdk.Layer(
#     "H3HexagonLayer",
#     h3_gridded_new_scale,
#     pickable=True,
#     stroked=True,
#     filled=True,
#     opacity=.5,
#     extruded=False,
#     get_hexagon= "h3_id",
#     get_fill_color = "color",
#     get_line_color= [0, 0, 0, 100],
#     line_width_min_pixels=1,
#     line_width_max_pixels=1,
# )
# # Create mapdeck object
# deck_map_ghsl_h3_gridded_rescaled = pdk.Deck(layers=[h3_layer_ghsl_h3_gridded_new_scale]
#              ,initial_view_state=view_state_zoomed
#              ,tooltip= {"text": "Value :  {h3_id_gdp_gridded_var}"}
#              ,map_style="road"
#              )

# deck_map_ghsl_h3_gridded_rescaled.to_html("../../deck_maps/deck_ghsl_nres_h3_gridded_new_scale.html",
#                                           iframe_height=800,
#                                         #   open_browser=True
#                                           )

Adding constraint layer

Let’s now use GHSL Human settlement layer data to hint at where there is no agricultural lands. Start with loading in the ghsl layer just like a base layer

grid_ghsl_param = 100

ghsl_file = f"/Users/cenv1069/Documents/data/datasets/JRC/processed_samples/S_100_R8_C19/*.parquet"
ghsl_file
# tiffs = [x for x in glob.glob(ghsl_file,recursive=True) if re.search(pattern=".tif$",string=x)]
# print(len(tiffs))
# tiffs
'/Users/cenv1069/Documents/data/datasets/JRC/processed_samples/S_100_R8_C19/*.parquet'
raw_grid_constraint = snoo.sn_table(conn=conn,name="raw_grid_constraint",path=ghsl_file,overwrite=True, bbox=limits)
overwriting existing
reading bbox

This has added a new table into the backend :

conn.list_tables()
['dwg', 'raw_grid_constraint', 'spam']
snoo.sn_head(raw_grid_constraint)

┌────────┐
│ 172826 │
└────────┘

┏━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ lon       lat        band_var ┃
┡━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ float32float32uint16   │
├──────────┼───────────┼──────────┤
│ 8.21686512.3054921530 │
│ 8.21778012.305492680 │
│ 8.21961112.30549250 │
│ 8.22052712.30549285 │
│ 8.22510212.305492339 │
└──────────┴───────────┴──────────┘
ghsl_constrain_res = 12
ghsl_constraint = snoo.sn_project(raw_grid_constraint,res=ghsl_constrain_res)
Assuming coordinates columns ('lon','lat')
print(ghsl_constraint.count())
ghsl_constraint.head()

┌────────┐
│ 172826 │
└────────┘
┏━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ lon       lat        band_var  h3_id           ┃
┡━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ float32float32uint16string          │
├──────────┼───────────┼──────────┼─────────────────┤
│ 8.21686512.30549215308c580ac9090edff │
│ 8.21778012.3054926808c580ac90909dff │
│ 8.21961112.305492508c580ac90943dff │
│ 8.22052712.305492858c580ac90948dff │
│ 8.22510212.3054923398c580ac954d43ff │
└──────────┴───────────┴──────────┴─────────────────┘
constraint = ghsl_constraint.execute()

Matching Resolution

bringing the variable layer and the constrain layer to a common resolution

spam_gridded_zoom.head(
    
)
h3_id gid_0 country gid_1 agriculture_usd_2015 lon lat band_var gid_1_right total_band band_frac color h3_gridded count h3_id_band_gridded x_h3 y_h3
0 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 88580ac8b9fffff 120 40.95 7.870442 12.315062
1 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 88580ac991fffff 120 40.95 7.884474 12.267223
2 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 88580a5341fffff 120 40.95 7.911435 12.251724
3 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 88580ac8adfffff 120 40.95 7.912636 12.273132
4 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 4914.5 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 88580ac88bfffff 120 40.95 7.845809 12.322583
constraint_res = h3.get_resolution(constraint.h3_id[0])
layer_res = h3.get_resolution(spam_gridded_zoom.h3_gridded[0])

print("Constraint res : ",constraint_res,", Layer res: ",layer_res)
Constraint res :  12 , Layer res:  8

Let’s meet somewhere in between.

spam_gridded_zoom.rename(columns={"h3_id" : "h3_native"
                                ,"h3_gridded" : "h3_id"},inplace=True)
final_res = 10

Use the scalenav set_res function to set a desired resolution to both layers

constraint_gridded_zoom_rescaled = sd.set_res(constraint,final=final_res)
spam_gridded_zoom_rescaled = sd.set_res(spam_gridded_zoom,final=final_res)
constraint_res = h3.get_resolution(constraint_gridded_zoom_rescaled.h3_id[0])
layer_res = h3.get_resolution(spam_gridded_zoom_rescaled.h3_id[0])

print("Constraint res : ",constraint_res,", Layer res: ",layer_res)
Constraint res :  10 , Layer res:  10

Both resolutions match now. This will allow next to perform operations on the index variables of both layers together.

print(spam_gridded_zoom_rescaled.shape)
spam_gridded_zoom_rescaled.head()
(318941, 17)
h3_native gid_0 country gid_1 agriculture_usd_2015 lon lat band_var gid_1_right total_band band_frac color h3_id count h3_id_band_gridded x_h3 y_h3
0 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b807fff 120 40.95 7.870442 12.315062
1 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b80ffff 120 40.95 7.870442 12.315062
2 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b817fff 120 40.95 7.870442 12.315062
3 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b81ffff 120 40.95 7.870442 12.315062
4 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b827fff 120 40.95 7.870442 12.315062
print(constraint_gridded_zoom_rescaled.shape)
constraint_gridded_zoom_rescaled.head()
(132809, 3)
h3_id child_cells band_var
0 8a580a00a097fff [8b580a00a091fff] 298
1 8a580a00a09ffff [8b580a00a09bfff] 2
2 8a580a00a19ffff [8b580a00a19bfff] 136
3 8a580a00a297fff [8b580a00a296fff] 98
4 8a580a00a2d7fff [8b580a00a2d0fff] 11

Set up a common layer by merging the data and constraint layers.

layer_constrained = spam_gridded_zoom_rescaled.merge(constraint_gridded_zoom_rescaled,on="h3_id",how="left")
print(layer_constrained.shape)
layer_constrained.head()
(318941, 19)
h3_native gid_0 country gid_1 agriculture_usd_2015 lon lat band_var_x gid_1_right total_band band_frac color h3_id count h3_id_band_gridded x_h3 y_h3 child_cells band_var_y
0 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b807fff 120 40.95 7.870442 12.315062 NaN NaN
1 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b80ffff 120 40.95 7.870442 12.315062 NaN NaN
2 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b817fff 120 40.95 7.870442 12.315062 NaN NaN
3 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b81ffff 120 40.95 7.870442 12.315062 NaN NaN
4 88580ac9d7fffff NGA Nigeria NGA.20_1 2.325922e+09 8.291664 12.291668 100.295914 NGA.20_1 4.123441e+07 0.000119 [194, 223, 35, 255] 8a580ac8b827fff 120 40.95 7.870442 12.315062 NaN NaN

Exclude the cells for which the constraint was identified, their ‘child_cell’ variable is not na. To keep things simple, the values of the constraint variable are not accounted for, it is used as a hard, binary constraint. If there is any infrastructure in the cell, it is removed fully, even for small values.

layer_constrained = layer_constrained[layer_constrained.child_cells.isna()]
# layer_constrained.reset_index(drop=False,inplace=True)
layer_constrained["color"] = cmap((layer_constrained.h3_id_band_gridded),palette=col_pal)
Max input : 45.50, palette colors : 10
#####
constrained_layer = pdk.Layer(
    "H3HexagonLayer",
    layer_constrained,
    pickable=True,
    stroked=True,
    filled=True,
    opacity=.5,
    extruded=False,
    get_hexagon= "h3_id",
    get_fill_color = "color",
    get_line_color= [0, 0, 0, 100],
    line_width_min_pixels=0,
    line_width_max_pixels=1,
)
constraint_layer = pdk.Layer(
    "H3HexagonLayer",
    constraint_gridded_zoom_rescaled,
    pickable=True,
    stroked=True,
    filled=True,
    opacity=.5,
    extruded=False,
    get_hexagon= "h3_id",
    get_fill_color = [56, 65, 87, 255],
    get_line_color= [0, 0, 0, 100],
    line_width_min_pixels=0,
    line_width_max_pixels=1,
)
# # Create mapdeck object
# deck_map_agri_layer_constrained = pdk.Deck(layers=[
#     constrained_layer,
#     constraint_layer]
#              ,initial_view_state=view_state_zoomed
#              ,tooltip= {"text": "Value :  {h3_id_band_gridded}"}
#              ,map_style="road"
#              )

# deck_map_agri_layer_constrained.to_html("../../deck_maps/deck_ghsl_agri_constrained.html",
#                                         iframe_height=800,
#                                         # open_browser=True
#                                         )