Data ingestion and processing¶
import rasterio as rs
from rasterio.windows import Window
from rasterio.transform import xy
from rasterio.vrt import WarpedVRT
from rasterio.crs import CRS
from rasterio.windows import Window
from pyproj import Transformer
from pandas import DataFrame
import geopandas as gpd
import numpy as np
from numpy import array,meshgrid,arange,log
import ibis as ib
ib.options.interactive = True
# With dask
import dask as dk
import dask.array as da
import dask.dataframe as dd
# import dask_image as dki
import xarray as xr
import rioxarray as rx
from pyarrow import float32,schema,field,uint16,table,Table
from pyarrow.parquet import ParquetWriter
# big file
# src_file="/Users/cenv1069/Documents/data/datasets/JRC/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"
src_file = "/Users/cenv1069/Documents/data/datasets/JRC/S_10m/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif"
# smaller size file
# src_file="/Users/cenv1069/Documents/data/datasets/JRC/S_1000m/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19.tif"
Working workflows below¶
# inspired by : https://rasterio.readthedocs.io/en/stable/topics/virtual-warping.html
dst_crs = CRS.from_epsg(4326)
print(dst_crs)
EPSG:4326
vrt_options = {
# 'resampling': Resampling.cubic,
'crs': dst_crs,
# 'transform': dst_transform,
# 'height': dst_height,
# 'width': dst_width,
}
out_path = "S_10m_NRES_R8_C19.parquet"
rast_schema = schema([('lon',float32())
,('lat',float32())
,('band_var',float32())
])
rast_schema.with_metadata({
"lon" : "Longitude coordinate",
"lat" : "Latitude coordinate",
"band_var" : "Value associated",
})
lon: float
lat: float
band_var: float
-- schema metadata --
lon: 'Longitude coordinate'
lat: 'Latitude coordinate'
band_var: 'Value associated'
Using a virtual Warper and windows¶
%%time
# with ParquetWriter(out_path, rast_schema) as writer:
# with rs.open(src_file) as src:
# src_crs = src.crs
# if len(src.nodatavals)>1:
# nodata = src.nodatavals[0]
# else :
# nodata = src.nodatavals
# with WarpedVRT(src, **vrt_options) as vrt:
# # At this point 'vrt' is a full dataset with dimensions,
# # CRS, and spatial extent matching 'vrt_options'.
# # Read all data into memory.
# # data = vrt.read()
# # Process the dataset in chunks. Likely not very efficient.
# win_transfrom = vrt.window_transform
# for _, window in vrt.block_windows():
# # print(src.crs)
# band1 = vrt.read(window=window)
# height = band1.shape[1]
# width = band1.shape[2]
# cols, rows = meshgrid(arange(width), arange(height))
# xs, ys = xy(
# transform = win_transfrom(window),
# rows=rows,
# cols=cols)
# lons = array(xs)
# lats = array(ys)
# out = DataFrame({"band_var" : array(band1).flatten()
# ,'lon': lons.flatten()
# ,'lat': lats.flatten()})
# out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)
# out.drop(index=out.loc[out.band_var<=0].index,inplace=True)
# # print(out.shape)
# # print(out.head())
# if out.shape[0]!=0:
# writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))
# # # Dump the aligned data into a new file. A VRT representing
# # # this transformation can also be produced by switching
# # # to the VRT driver.
# # directory, name = os.path.split(path)
# # outfile = os.path.join(directory, 'aligned-{}'.format(name))
# # rio_shutil.copy(vrt, outfile, driver='GTiff')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:1
NameError: name 'ParquetWriter' is not defined
Using a classic window approach¶
# %%time
with ParquetWriter(out_path, rast_schema) as writer:
with rs.open(src_file) as src:
src_crs = src.crs
win_transfrom = src.window_transform
transformer = Transformer.from_crs(str(src_crs), 'EPSG:4326', always_xy=True)
if len(src.nodatavals)>1:
nodata = src.nodatavals[0]
else :
nodata = src.nodatavals
# Process the dataset in chunks. Likely not very efficient.
for ij, window in src.block_windows():
# print(window)
# print(src.crs)
band1 = src.read(window=window)
# print(band1[0])
height = band1.shape[1]
width = band1.shape[2]
cols, rows = meshgrid(arange(width), arange(height))
# print(win_transfrom(window))
xs, ys = xy(
transform = win_transfrom(window),
rows=rows,
cols=cols)
# print(xs,ys)
lons,lats = transformer.transform(array(xs),array(ys))
# print(lons.shape)
# print(lats.shape)
# print(len(array(band1).flatten()))
# print(len(lons.flatten()))
out = DataFrame({'lon': lons.flatten(),
'lat': lats.flatten(),
"band_var" : array(band1[0,:,:]).flatten(),
})
out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)
out.drop(index=out.loc[out.band_var<=0].index,inplace=True)
# print(out.shape)
# print(out.head())
if out.shape[0]!=0:
writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))
test_data = ib.read_parquet(out_path)
test_data.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ lon ┃ lat ┃ band_var ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩ │ float32 │ float32 │ float32 │ ├───────────┼───────────┼──────────┤ │ -0.003478 │ 16.256193 │ 1.0 │ │ -0.003386 │ 16.256193 │ 1.0 │ │ -0.003295 │ 16.256193 │ 1.0 │ │ -0.003203 │ 16.256193 │ 1.0 │ │ -0.003569 │ 16.256102 │ 1.0 │ └───────────┴───────────┴──────────┘
test_data.count()
┌────────┐
│ 243059 │
└────────┘
test_data
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ lon ┃ lat ┃ band_var ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩ │ float32 │ float32 │ float32 │ ├───────────┼───────────┼──────────┤ │ -0.003478 │ 16.256193 │ 1.0 │ │ -0.003386 │ 16.256193 │ 1.0 │ │ -0.003295 │ 16.256193 │ 1.0 │ │ -0.003203 │ 16.256193 │ 1.0 │ │ -0.003569 │ 16.256102 │ 1.0 │ │ -0.003478 │ 16.256102 │ 1.0 │ │ -0.003386 │ 16.256102 │ 1.0 │ │ -0.003295 │ 16.256102 │ 1.0 │ │ -0.003203 │ 16.256102 │ 1.0 │ │ -0.003112 │ 16.256102 │ 1.0 │ │ … │ … │ … │ └───────────┴───────────┴──────────┘
# test_data.select("lon","lat")
# test_xy = test_data.select("lon","lat").to_pandas()
# gpd.GeoSeries(gpd.points_from_xy(test_xy["lon"],test_xy["lat"],crs="epsg:4326")).explore()