{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data ingestion and processing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import rasterio as rs \n", "from rasterio.windows import Window\n", "from rasterio.transform import xy\n", "from rasterio.vrt import WarpedVRT\n", "from rasterio.crs import CRS\n", "from rasterio.windows import Window\n", "\n", "from pyproj import Transformer\n", "\n", "from pandas import DataFrame\n", "import geopandas as gpd\n", "import numpy as np\n", "from numpy import array,meshgrid,arange,log\n", "\n", "import ibis as ib\n", "ib.options.interactive = True" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# With dask\n", "\n", "import dask as dk\n", "import dask.array as da\n", "import dask.dataframe as dd\n", "# import dask_image as dki\n", "import xarray as xr\n", "import rioxarray as rx\n", "\n", "from pyarrow import float32,schema,field,uint16,table,Table\n", "from pyarrow.parquet import ParquetWriter" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# big file\n", "# src_file=\"/Users/cenv1069/Documents/data/datasets/JRC/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif\"\n", "\n", "src_file = \"/Users/cenv1069/Documents/data/datasets/JRC/S_10m/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19/GHS_BUILT_S_NRES_E2018_GLOBE_R2023A_54009_10_V1_0_R8_C19.tif\"\n", "\n", "# smaller size file\n", "# src_file=\"/Users/cenv1069/Documents/data/datasets/JRC/S_1000m/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19/GHS_BUILT_S_NRES_E2020_GLOBE_R2023A_54009_1000_V1_0_R8_C19.tif\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working workflows below" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EPSG:4326\n" ] } ], "source": [ "# inspired by : https://rasterio.readthedocs.io/en/stable/topics/virtual-warping.html\n", "\n", "dst_crs = CRS.from_epsg(4326)\n", "print(dst_crs)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lon: float\n", "lat: float\n", "band_var: float\n", "-- schema metadata --\n", "lon: 'Longitude coordinate'\n", "lat: 'Latitude coordinate'\n", "band_var: 'Value associated'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vrt_options = {\n", " # 'resampling': Resampling.cubic,\n", " 'crs': dst_crs,\n", " # 'transform': dst_transform,\n", " # 'height': dst_height,\n", " # 'width': dst_width,\n", "}\n", "\n", "out_path = \"S_10m_NRES_R8_C19.parquet\"\n", "\n", "rast_schema = schema([('lon',float32())\n", " ,('lat',float32())\n", " ,('band_var',float32())\n", " ])\n", "\n", "rast_schema.with_metadata({\n", " \"lon\" : \"Longitude coordinate\",\n", " \"lat\" : \"Latitude coordinate\",\n", " \"band_var\" : \"Value associated\",\n", " })\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using a virtual Warper and windows" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'ParquetWriter' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m:1\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'ParquetWriter' is not defined" ] } ], "source": [ "%%time\n", "\n", "# with ParquetWriter(out_path, rast_schema) as writer:\n", "# with rs.open(src_file) as src:\n", "# src_crs = src.crs\n", "# if len(src.nodatavals)>1:\n", "# nodata = src.nodatavals[0]\n", "# else :\n", "# nodata = src.nodatavals\n", "\n", "# with WarpedVRT(src, **vrt_options) as vrt:\n", "# # At this point 'vrt' is a full dataset with dimensions,\n", "# # CRS, and spatial extent matching 'vrt_options'.\n", "# # Read all data into memory.\n", "# # data = vrt.read()\n", "# # Process the dataset in chunks. Likely not very efficient.\n", " \n", "# win_transfrom = vrt.window_transform\n", "\n", "# for _, window in vrt.block_windows():\n", "# # print(src.crs)\n", "# band1 = vrt.read(window=window)\n", " \n", "# height = band1.shape[1]\n", "# width = band1.shape[2]\n", "# cols, rows = meshgrid(arange(width), arange(height))\n", "\n", "# xs, ys = xy(\n", "# transform = win_transfrom(window),\n", "# rows=rows,\n", "# cols=cols)\n", "\n", "# lons = array(xs)\n", "# lats = array(ys)\n", " \n", "# out = DataFrame({\"band_var\" : array(band1).flatten()\n", "# ,'lon': lons.flatten()\n", "# ,'lat': lats.flatten()})\n", " \n", "# out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)\n", "# out.drop(index=out.loc[out.band_var<=0].index,inplace=True)\n", "# # print(out.shape)\n", "# # print(out.head())\n", "\n", "# if out.shape[0]!=0:\n", "# writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))\n", "\n", " # # # Dump the aligned data into a new file. A VRT representing\n", " # # # this transformation can also be produced by switching\n", " # # # to the VRT driver.\n", " # # directory, name = os.path.split(path)\n", " # # outfile = os.path.join(directory, 'aligned-{}'.format(name))\n", " # # rio_shutil.copy(vrt, outfile, driver='GTiff')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using a classic window approach" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# %%time\n", "\n", "with ParquetWriter(out_path, rast_schema) as writer:\n", " with rs.open(src_file) as src:\n", " \n", " src_crs = src.crs\n", " win_transfrom = src.window_transform\n", " \n", " transformer = Transformer.from_crs(str(src_crs), 'EPSG:4326', always_xy=True)\n", " \n", " if len(src.nodatavals)>1:\n", " nodata = src.nodatavals[0]\n", " else :\n", " nodata = src.nodatavals\n", " # Process the dataset in chunks. Likely not very efficient.\n", " for ij, window in src.block_windows():\n", " # print(window)\n", " # print(src.crs)\n", " band1 = src.read(window=window)\n", " # print(band1[0])\n", " height = band1.shape[1]\n", " width = band1.shape[2]\n", " cols, rows = meshgrid(arange(width), arange(height))\n", " # print(win_transfrom(window))\n", " xs, ys = xy(\n", " transform = win_transfrom(window),\n", " rows=rows,\n", " cols=cols)\n", " \n", " # print(xs,ys)\n", " \n", " lons,lats = transformer.transform(array(xs),array(ys))\n", " # print(lons.shape)\n", " # print(lats.shape)\n", " # print(len(array(band1).flatten()))\n", " # print(len(lons.flatten()))\n", " \n", " out = DataFrame({'lon': lons.flatten(),\n", " 'lat': lats.flatten(),\n", " \"band_var\" : array(band1[0,:,:]).flatten(),\n", " })\n", " \n", " out.drop(index=out.loc[out.band_var==nodata].index,inplace=True)\n", " out.drop(index=out.loc[out.band_var<=0].index,inplace=True)\n", " \n", " # print(out.shape)\n", " # print(out.head())\n", " \n", " if out.shape[0]!=0:\n", " writer.write_table(Table.from_pandas(df=out,schema = rast_schema,preserve_index=False,safe=True))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "test_data = ib.read_parquet(out_path)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n",
       "┃ lon        lat        band_var ┃\n",
       "┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n",
       "│ float32float32float32  │\n",
       "├───────────┼───────────┼──────────┤\n",
       "│ -0.00347816.2561931.0 │\n",
       "│ -0.00338616.2561931.0 │\n",
       "│ -0.00329516.2561931.0 │\n",
       "│ -0.00320316.2561931.0 │\n",
       "│ -0.00356916.2561021.0 │\n",
       "└───────────┴───────────┴──────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mlon\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlat\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mband_var\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n", "│ \u001b[2mfloat32\u001b[0m │ \u001b[2mfloat32\u001b[0m │ \u001b[2mfloat32\u001b[0m │\n", "├───────────┼───────────┼──────────┤\n", "│ \u001b[1;36m-0.003478\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003386\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003295\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003203\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003569\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "└───────────┴───────────┴──────────┘" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "┌────────┐\n",
       "│ \u001b[1;36m243059\u001b[0m │\n",
       "└────────┘"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n",
       "┃ lon        lat        band_var ┃\n",
       "┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n",
       "│ float32float32float32  │\n",
       "├───────────┼───────────┼──────────┤\n",
       "│ -0.00347816.2561931.0 │\n",
       "│ -0.00338616.2561931.0 │\n",
       "│ -0.00329516.2561931.0 │\n",
       "│ -0.00320316.2561931.0 │\n",
       "│ -0.00356916.2561021.0 │\n",
       "│ -0.00347816.2561021.0 │\n",
       "│ -0.00338616.2561021.0 │\n",
       "│ -0.00329516.2561021.0 │\n",
       "│ -0.00320316.2561021.0 │\n",
       "│ -0.00311216.2561021.0 │\n",
       "│          │\n",
       "└───────────┴───────────┴──────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mlon\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlat\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mband_var\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n", "│ \u001b[2mfloat32\u001b[0m │ \u001b[2mfloat32\u001b[0m │ \u001b[2mfloat32\u001b[0m │\n", "├───────────┼───────────┼──────────┤\n", "│ \u001b[1;36m-0.003478\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003386\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003295\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003203\u001b[0m │ \u001b[1;36m16.256193\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003569\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003478\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003386\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003295\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003203\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[1;36m-0.003112\u001b[0m │ \u001b[1;36m16.256102\u001b[0m │ \u001b[1;36m1.0\u001b[0m │\n", "│ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │\n", "└───────────┴───────────┴──────────┘" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# test_data.select(\"lon\",\"lat\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# test_xy = test_data.select(\"lon\",\"lat\").to_pandas()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# gpd.GeoSeries(gpd.points_from_xy(test_xy[\"lon\"],test_xy[\"lat\"],crs=\"epsg:4326\")).explore()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }