forked from Abdallah-M-Ali/Mineral-Prospectivity-Mapping-ML
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData_preprocessing.py
More file actions
173 lines (141 loc) · 6.16 KB
/
Copy pathData_preprocessing.py
File metadata and controls
173 lines (141 loc) · 6.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# This the basic code for remote sensing data preprocessing, where the stacked data is opened and prepared as feature
# predictors (x, x_test). the samples of ore deposit are processed as target variables (y, y_test)
# The code also contain how data is preprocessed to fit different ML models requirement.
# The code is mainly several function which can be called in others python codes within the same directory
from osgeo import gdal
from osgeo import ogr
import tensorflow as tf
import numpy as np
import geopandas as gpd
import os
import random
driverTiff = gdal.GetDriverByName('GTiff')
# this function for opening the stacked data to extract the x_train and x_test
# as well and reshaping the image to become as np array
def rs_preprocessing (data, reshape=True):
rs_ds = gdal.Open(data)
nbands = rs_ds.RasterCount
band_data = []
print('bands', rs_ds.RasterCount, 'rows', rs_ds.RasterYSize, 'columns',
rs_ds.RasterXSize)
for i in range(1, nbands + 1):
band = rs_ds.GetRasterBand(i).ReadAsArray()
band_data.append(band)
band_data = np.dstack(band_data)
print(band_data.shape)
if reshape == True:
new_shape = (band_data.shape[0] * band_data.shape[1], band_data.shape[2])
img_as_array = band_data[:, :, :np.int(band_data.shape[2])].reshape(new_shape)
print('Reshaped from {o} to {n}'.format(o=band_data.shape, n=img_as_array.shape))
img_as_array = np.nan_to_num(img_as_array)
return band_data, img_as_array
else:
return band_data
# the next function accept shapefile data contains points as target variables samples
# the target value must be in attribute called value and values are binary (0,1)
# 1 represents that the target mineral exist and vice versa
# the function splits the data into train and test datasets and save them separately in two files
def target_variable (data, trainDirectory, testDirectory, tarinPercent=0.8):
gdf = gpd.read_file(data)
# def condition(dataframe):
# if dataframe['value'] == 1:
# value = 3
# elif dataframe['value'] == 0.5:
# value = 2
# else:
# value = 1
# return value
# gdf['raster'] = gdf.apply(condition, axis=1)
gdf['raster'] = np.where(gdf['Value'] == 0, 1, 2)
print(gdf.head)
gdf_train = gdf.sample(frac=tarinPercent)
gdf_test = gdf.drop(gdf_train.index)
print('gdf shape', gdf.shape, 'training', gdf_train.shape, 'test', gdf_test.shape)
gdf_train.to_file(trainDirectory)
gdf_test.to_file(testDirectory)
print('train data saved to: {}'.format(trainDirectory))
print('test data saved to: {}'.format(testDirectory))
# Data rasterization and extraction of training and testing dataset
# The folowing function accept
# (1) The RS data directory
# (2) The processed RS data
# (3) the directory of the training or testing dataset
# function return x (variable features) and y (target variables)
def dataFitting (RSData, band_data, SHfile):
RS_ds = gdal.Open(RSData)
train_ds = ogr.Open(SHfile)
lyr = train_ds.GetLayer()
driver = gdal.GetDriverByName('MEM')
target_ds = driver.Create('', RS_ds.RasterXSize, RS_ds.RasterYSize, 1, gdal.GDT_UInt16)
target_ds.SetGeoTransform(RS_ds.GetGeoTransform())
target_ds.SetProjection(RS_ds.GetProjectionRef())
options = ['ATTRIBUTE=raster']
gdal.RasterizeLayer(target_ds, [1], lyr, options=options)
data = target_ds.GetRasterBand(1).ReadAsArray()
print('min', data.min(), 'max', data.max(), 'mean', data.mean())
truth = target_ds.GetRasterBand(1).ReadAsArray()
classes = np.unique(truth)[1:]
print('class values', classes)
n_samples = (data > 0).sum()
print('{n} training samples'.format(n=n_samples))
idx = np.nonzero(truth)
x = band_data[idx]
y = truth[idx] - 1
# def condition(dataframe):
# if dataframe == 3:
# value = 1
# elif dataframe == 2:
# value = 0.5
# else:
# value = 0
# return value
# y = list(map(condition, truth[idx]))
print('Our X matrix is sized: {sz}'.format(sz=x.shape))
print('Our y array is sized: {sz}'.format(sz=np.shape(y)))
return x, y
# def get_map_prediction()
def tfPipline (feature, label, shuffle=True, repeat=False, BUFFER_SIZE=10000, BATCH_SIZE=64):
if label == '':
dataset = tf.data.Dataset.from_tensor_slices(feature)
else:
dataset = tf.data.Dataset.from_tensor_slices((feature, label))
if repeat==True:
dataset = dataset.repeat()
if shuffle:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
else:
dataset = dataset.batch(BATCH_SIZE)
return dataset
def cnn_input(dataset):
sample_size = dataset.shape[0]
time_steps = dataset.shape[1]
input_dimension = 1
dataset = dataset.reshape(sample_size, time_steps, input_dimension)
return dataset
def reset_random_seeds():
os.environ['PYTHONHASHSEED']=str(1)
tf.random.set_seed(1)
np.random.seed(1)
random.seed(1)
def write_raster(RSData, modelPrediction, band_data, savedDirectory):
RS_ds = gdal.Open(RSData)
cols = band_data.shape[1]
rows = band_data.shape[0]
modelPrediction.astype(np.float16)
driver = gdal.GetDriverByName("gtiff")
outdata = driver.Create(savedDirectory, cols, rows, 1, gdal.GDT_Float32)
outdata.SetGeoTransform(RS_ds.GetGeoTransform()) ##sets same geotransform as input
outdata.SetProjection(RS_ds.GetProjection()) ##sets same projection as input
outdata.GetRasterBand(1).WriteArray(modelPrediction)
outdata.FlushCache() ##saves to disk!!
print('Image saved to: {}'.format(savedDirectory))
def main():
print("This is the main code to test above functions")
landsat = 'D:/programes/dataset/aster-finalstack2.tif'
band_data1, img_as_array1 = rs_preprocessing(landsat, reshape=True)
train_ds = 'D:/programes/qgis/train_reg.shp'
x_train, y_train = dataFitting(landsat, band_data1, train_ds)
print(x_train)
print(y_train)
if __name__ == '__main__':
main()