I am trying to follow a tutorial. Basically, I want to run random forest classification on my 31-band Sentinel 1 and Sentinel 2 stacked image. Also, I want to extract raster values to my training-testing shapefiles. This is what I tried:
from osgeo import gdal
from PIL import Image
import numpy as np
import matplotlib as mtp
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import earthpy.plot as ep
import rasterio
from rasterio.plot import reshape_as_raster, reshape_as_image
get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_colwidth = 89
# In[2]:
#setting the path for image
S1_S2_stack = 'S1_S2_stack.tif'
#path to training and validation data
training_points = 'testing.shp'
validation_points = 'training.shp'
# In[3]:
colors = dict ((
(0, (0,76,153,255)), #wheat
(1, (0,153,0,255)), #corn
(2, (255,0,0,255)), #other
(3, (255,153,51,255)),
(4, (255,255,0,255))
))
# In[4]:
for k in colors:
v = colors [k]
_v = [_v / 255.0 for _v in v]
colors[k] = _v
index_colors = [colors[key] if key in colors else (1,1,1,0) for key in range (0,5)]
cmap = plt.matplotlib.colors.ListedColormap(index_colors, 'Classification', 5)
# In[5]:
src = rasterio.open(S1_S2_stack)
bands = src.read()
# In[6]:
stack =src. read()
print(stack.shape)
fig, (ax1, ax2) = plt.subplots(1,2,figsize= (20,10))
ax1 = ep.plot_rgb(arr = stack, rgb =(3, 2, 1), stretch=True, ax = ax1, title = "RGB Composite - Sentinel-2")
stack_s1 =np.stack ((stack[28],stack[29],stack[29]/stack[28]))
ax2 = ep.plot_rgb(arr = stack_s1, rgb = (1,0,2), stretch=True, ax = ax2, title= "RGB Composite - Sentinel-1 (VV, VH, VV/VH)")
plt.tight_layout()
# In[7]:
img = src.read()
profile = src.profile
with rasterio.io.MemoryFile () as memfile:
with memfile.open(**profile) as dst:
for i in range(0, src.count):
dst.write(img[i], i+1)
dataset = memfile.open()
And it works okay from here. But when I run this piece of code:
#read points from shapefile
train_pts = gpd.read_file (training_points)
train_pts = train_pts[[ 'CID','class', 'POINT_X','POINT_Y']] #attribute fields in shapefile
train_pts.index = range(len(train_pts))
coords = [(x,y) for x, y in zip(train_pts.POINT_X, train_pts.POINT_Y)] #create list of point coordinates
#sample each band of raster dataset at each point in the coordinate list
train_pts ['Raster Value'] = [x for x in dataset.sample(coords)] #all band values saved as a list in the Raster Value column
#Unpack the raster value column to separate column for each band - band names retrieved from snappy in the video but I was looking for an option
train_pts[bands] = pd.DataFrame(train_pts['Raster Value'].tolist(), index = train_pts.index)
train_pts = train_pts.drop(['Raster Value'], axis=1) #drop raster value column
#change the values for last three classes
train_pts['CID'] = train_pts['CID'].replace([7,8,15],[5,6,7])
train_pts.to.csv('train_pts.csv') #save as csv
train_pts.head () #see first column
I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [9], in <cell line: 10>()
8 train_pts ['Raster Value'] = [x for x in dataset.sample(coords)] #all band values saved as a list in the Raster Value column
9 #Unpack the raster value column to separate column for each band - band names retrieved from snappy in the video but I was looking for an option
---> 10 train_pts[src1] = pd.DataFrame(train_pts['Raster Value'].tolist(), index = train_pts.index)
11 train_pts = train_pts.drop(['Raster Value'], axis=1) #drop raster value column
12 #change the values for last three classes
File ~\.conda\envs\geocomp3_clone\lib\site-packages\pandas\core\frame.py:3643, in DataFrame.__setitem__(self, key, value)
3641 self._setitem_frame(key, value)
3642 elif isinstance(key, (Series, np.ndarray, list, Index)):
-> 3643 self._setitem_array(key, value)
3644 elif isinstance(value, DataFrame):
3645 self._set_item_frame_value(key, value)
File ~\.conda\envs\geocomp3_clone\lib\site-packages\pandas\core\frame.py:3685, in DataFrame._setitem_array(self, key, value)
3680 else:
3681 # Note: unlike self.iloc[:, indexer] = value, this will
3682 # never try to overwrite values inplace
3684 if isinstance(value, DataFrame):
-> 3685 check_key_length(self.columns, key, value)
3686 for k1, k2 in zip(key, value.columns):
3687 self[k1] = value[k2]
File ~\.conda\envs\geocomp3_clone\lib\site-packages\pandas\core\indexers\utils.py:428, in check_key_length(columns, key, value)
426 if columns.is_unique:
427 if len(value.columns) != len(key):
--> 428 raise ValueError("Columns must be same length as key")
429 else:
430 # Missing keys in columns are represented as -1
431 if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns):
ValueError: Columns must be same length as key
So my questions are as follows:
If you say
src = rasterio.open(S1_S2_stack)
bands = src.read()
then bands
is a 3-dimensional Numpy ndarray
(a series of raster frames). It makes no sense to use it for indexing like train_pts[bands]
, where train_pts
is a data frame. But I assume you want to refer to the band names. If so, try src.descriptions
instead:
band_names = [*src.descriptions]
train_pts[band_names] = pd.DataFrame(train_pts['Raster Value'].tolist(), index = train_pts.index)
band_names
should be a list so we can use them as indexes in train_pts[band_names]
. As far as src.descriptions
is a tuple, we have to transform it into a list
.band_names = [f'{src.descriptions[i]}, band {i}' for i in range(1, src.count + 1)]
band_names = [f'Band {i}' for i in range(1, src.count + 1)]