MB2_Project/distrAssessment.R at main · corneliazy/MB2_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# MB2 Project (training data distribution)
# Cornelia Zygar, 2582296

###################################################################
# This is a script for assessing the influence of the distribution#
# of training data on the accuracy of a supervised random forest  #
# classification within the RStoolbox superClass() function.      #
###################################################################

# loading packages
library("ggplot2")
library("rgdal")
library("RStoolbox")
library("here")
library("raster")

###################################################################
# Basic idea                                                      #
###################################################################

# superClass() function has parameter nSamples.
# from the documentation of superClass():
# nSamples: "Integer. Number of samples per land cover class."
# "Sample training coordinates. If trainData(and valData if present) are SpatialPolygons-
# DataFrames superClass will calculate the area per polygon and sample nSamples locations
# per class within these polygons.  The number of samples per individual polygon scales
# with the polygon area, i.e. the bigger the polygon, the more samples."

# Prerequisite: constant number of training pixels (nSamples)
# The more Polygons within the shapefile, the more distributed the training data
# will be.

###################################################################
# functions                                                       #
###################################################################

# function needs to be applied on a list of SpatialPolygonDataframes
# @param trainingDataset list of SpatialPolygonDataframes
# @param img Rasterbrick or Stack
# @param validationData set of validation data (polygons)
# @param numSamples desired number of samples
# @return list containing different accuracy measures
distrAssessment <- function(trainingDataset, img, validationData, numSamples){
  result <- getValidation(
    superClass(
      img,
      trainingDataset,
      valData=validationData,
      responseCol = "class",
      predict=FALSE,
      nSamples = numSamples
      ),
    metrics="caret"
    )

  # classwise accuracies
  sensitivity <- result$byClass[,"Sensitivity"]
  specificity <- result$byClass[,"Specificity"]

  # overall accuracy
  overallAcc <- result$overall["Accuracy"]

  return (list(sensitivity, specificity, overallAcc))
}


# parameter feature: feature that should be plotted
# @param feature feature that should be plotted ("sensitivity"/ "specificity"/ "accuracy")
# @param resultList List that was returned by the distrAssessment() function
# @return line plot of accuracy values in relation to increasing degree of sampling distribution
plotDistrAssessment <- function(feature, resultList){

  # help function to select feature from list
  helpf <- function(x,n){
    return(x[[n]])
  }

  # defining list containing number of polygons
  # hard coded..not optimal
  numList <- list(1,2,4,8,16)

  if (feature == "sensitivity"){
    data <- lapply(resultList, helpf, n=1)
    df <- do.call(rbind, Map(data.frame, nPolygons = numList, accuracy = data))
    df$class <- c(rep(1:3, times = length(numList)))
  } else if (feature == "specificity"){
    data <- lapply(resultList, helpf, n=2)
    df <- do.call(rbind, Map(data.frame, nPolygons = numList, accuracy = data))
    df$class <- c(rep(1:3, times = length(numList)))
  } else if (feature == "overall"){
    data <- lapply(resultList, helpf, n=3)
    df <- do.call(rbind, Map(data.frame, nPolygons = numList, accuracy = data))
  } else{
    print("please provide valid feature argument!")
    return (NULL)
  }

  # adjust ggplot depending on selected attribute
  if("class" %in% colnames(df)){
    plot <- ggplot(data = df, aes(x=nPolygons, y=accuracy, group = class, col=factor(class)))+
      geom_line()+
      geom_point(aes(x=nPolygons,y=accuracy))+
      ylab(feature)+
      xlab("number polygons")+
      labs(color="Class")
  } else{
    plot <- ggplot(data = df, aes(x=nPolygons, y=accuracy))+
      geom_line()+
      geom_point(aes(x=nPolygons,y=accuracy))+
      ylab(feature)+
      xlab("number polygons")
  }

  return (plot)
}

###################################################################
# loading data                                                    #
###################################################################

# loading List of SpatialPolygonDataframes
# every SpatialPolygonDataframe has the same polygons as the one added before plus the same number of polygons
# added at other locations of the area.
# goal: with increasing number, the samples are increasingly evenly distributed over the image.
trainingList <- list(
  readOGR(here("training_distrAssessment/training_bayern_1_small_1.shp")),
  readOGR(here("training_distrAssessment/training_bayern_1_small_2.shp")),
  readOGR(here("training_distrAssessment/training_bayern_1_small_4.shp")),
  readOGR(here("training_distrAssessment/training_bayern_1_small_8.shp")),
  readOGR(here("training_distrAssessment/training_bayern_1_small_16.shp"))
)

# validation data
valData <- readOGR(here("validation_data/validation_bayern_1_small.shp"))

# the image which will be classified
image <- brick(here("img_data/S2Stack_20190704_bayern_1_small.tif"))

###################################################################
# running the functions                                           #
###################################################################

# applying distrAssessment function over all elements of trainingList
# the higher the numSamples value, the longer the computation will take!
accuracyResult <- lapply(trainingList, distrAssessment, img = image, numSamples=200, validationData=valData)
accuracyResult

# plotting different accuracy measures contained in accuracyResult
# overall accuracy
overallResult <- plotDistrAssessment("overall", accuracyResult)
overallResult

# sensitivity
sensitivityResult <- plotDistrAssessment("sensitivity", accuracyResult)
sensitivityResult

# specificity
specificityResult <- plotDistrAssessment("specificity", accuracyResult)
specificityResult