@dataknut
) & Emily JiangThis is where we do setup stuff like attaching libraries etc.
# Set start time ----
startTime <- proc.time()
# Libraries ----
library(here)
projRoot <- here::here()
# Emily's libraries and functions
source(paste0(projRoot, "/R/libraries.R")) # Emily's recommended libraries
source(paste0(projRoot, "/R/functions.R")) # Emily's functions
# Local libraries ----
library(kableExtra) # for fancy tables
library(skimr) # for data descriptors (skim)
# any extras?
# Local functions ----
Session info (for reference)
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bookdown_0.9 rmarkdown_1.11 skimr_1.0.5
## [4] kableExtra_1.0.1 ggforce_0.2.1 viridis_0.5.1
## [7] viridisLite_0.3.0 GREENGridData_1.0 data.table_1.12.0
## [10] lubridate_1.7.4 gridExtra_2.3 ggplot2_3.1.0
## [13] dplyr_0.8.0.1 readr_1.3.1 usethis_1.4.0
## [16] devtools_2.0.1 here_0.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.0 tidyr_0.8.3 pkgload_1.0.2
## [4] splines_3.5.2 R.utils_2.8.0 assertthat_0.2.0
## [7] highr_0.7 cellranger_1.1.0 yaml_2.2.0
## [10] remotes_2.0.2 progress_1.2.0 sessioninfo_1.1.1
## [13] lattice_0.20-38 pillar_1.3.1 backports_1.1.3
## [16] glue_1.3.0 digest_0.6.18 polyclip_1.10-0
## [19] rvest_0.3.2 colorspace_1.4-0 Matrix_1.2-16
## [22] htmltools_0.3.6 R.oo_1.22.0 plyr_1.8.4
## [25] pkgconfig_2.0.2 purrr_0.3.1 scales_1.0.0
## [28] webshot_0.5.1 processx_3.3.0 tweenr_1.0.1
## [31] tibble_2.0.1 mgcv_1.8-27 farver_1.1.0
## [34] withr_2.1.2 lazyeval_0.2.1 cli_1.0.1
## [37] magrittr_1.5 crayon_1.3.4 readxl_1.3.0
## [40] memoise_1.1.0 evaluate_0.13 ps_1.3.0
## [43] R.methodsS3_1.7.1 fs_1.2.6 nlme_3.1-137
## [46] MASS_7.3-51.1 xml2_1.2.0 pkgbuild_1.0.2
## [49] tools_3.5.2 prettyunits_1.0.2 hms_0.4.2
## [52] formatR_1.6 stringr_1.4.0 munsell_0.5.0
## [55] callr_3.1.1 compiler_3.5.2 rlang_0.3.1
## [58] grid_3.5.2 rstudioapi_0.9.0 labeling_0.3
## [61] testthat_2.0.1 gtable_0.2.0 reshape2_1.4.3
## [64] R6_2.4.0 knitr_1.22 rprojroot_1.3-2
## [67] desc_1.2.0 stringi_1.3.1 Rcpp_1.0.0
## [70] tidyselect_0.2.5 xfun_0.5
First we load the selected pre-extracted heat pump data and run some basic tests.
message("Loading", params$dataFile)
## Loading/Users/ben/Data/NZ_GREENGrid/safe/emilyJiang/allHouseholds_hourly.csv.gz
isTesting <- 0
# for testing ---- isTesting <- 1
if (isTesting) {
params <- list() # gets set by knit
# params$dataFile <- paste0(myParams$dPath, 'allHouseholds_hourly.csv.gz')
params$dataFile <- "~/Data/NZ_GREENGrid/safe/gridSpy/1min/dataExtracts/Heat Pump_2015-04-01_2016-03-31_observations.csv.gz"
params$dataSource <- "benData"
}
hpDT_orig <- data.table::fread(params$dataFile)
hpDT <- hpDT_orig
if (params$dataSource == "emilyData") {
# drop V1 (purpose?)
hpDT$V1 <- NULL
# fix the date - remember it's stored as UTC
hpDT <- hpDT[, `:=`(r_dateTimeChar, r_dateTime)]
hpDT <- hpDT[, `:=`(r_dateTime, lubridate::as_datetime(r_dateTime))]
hpDT <- hpDT[, `:=`(r_dateTimeLocal, lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland"))] # convert to local time from UTC
}
if (params$dataSource == "benData") {
# minute level observations, needs processing & summarising calculate hourly
# mean
hpDT <- hpDT[, `:=`(r_dateTime, lubridate::as_datetime(r_dateTime))] # make sure date set to POSIXct
hpDT <- hpDT[, `:=`(r_dateTimeHour, lubridate::floor_date(r_dateTime, "hour"))] # cuts back to hour at the start. I assume this is what Emily did
hpDTh <- hpDT[, .(powerW = mean(powerW)), keyby = .(r_dateTime = r_dateTimeHour,
circuit, linkID)] # calculate hourly mean power (temperature data is hourly means)
# proceed as above
hpDTh <- hpDTh[, `:=`(r_dateTimeLocal, lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland"))] # convert to local time from UTC
# merge location table
locationDT <- data.table::fread(paste0(myParams$dPath, "locationTable.csv"))
setkey(locationDT, linkID)
setkey(hpDTh, linkID)
hpDT <- hpDTh[locationDT[, .(linkID, location)]]
hpDT$id <- hpDT$linkID
}
# location is constant by id so we could just create a table and drop that
# columhn too but...
hpDT <- hpDT[, `:=`(hms, hms::as.hms(r_dateTimeLocal))] # useful for plots and for checking we have the right timezone!
hpDT <- GREENGridData::addNZSeason(hpDT, r_dateTime = r_dateTimeLocal)
If any of the r_dateTimes failed to parse they will have the local time set to NA so we can see what happened (if anything) in Table 2.1. If there are no rows in this table then all r_dateTimes parsed.
kableExtra::kable(head(hpDT[is.na(r_dateTime)], 10), caption = "First 10 rows of Heat Pump data where r_dateTime failed to parse (could be none at all)") %>%
kable_styling()
r_dateTime | powerW | circuit | id | location | r_dateTimeChar | r_dateTimeLocal | hms | season |
---|---|---|---|---|---|---|---|---|
Table 2.2 shows the first 10 rows of slighty processed data. Note that local time ias set to NZ time while the original r_dateTime is in UTC. This is important for linking the temperature data correctly!
kableExtra::kable(head(hpDT, 10), caption = "First 10 rows of (slightly) processed Heat Pump data") %>%
kable_styling()
r_dateTime | powerW | circuit | id | location | r_dateTimeChar | r_dateTimeLocal | hms | season |
---|---|---|---|---|---|---|---|---|
2015-03-26 04:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 04:00:00 | 2015-03-26 17:00:00 | 17:00:00 | Autumn |
2015-03-26 05:00:00 | 33.68717 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 05:00:00 | 2015-03-26 18:00:00 | 18:00:00 | Autumn |
2015-03-26 06:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 06:00:00 | 2015-03-26 19:00:00 | 19:00:00 | Autumn |
2015-03-26 07:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 07:00:00 | 2015-03-26 20:00:00 | 20:00:00 | Autumn |
2015-03-26 08:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 08:00:00 | 2015-03-26 21:00:00 | 21:00:00 | Autumn |
2015-03-26 09:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 09:00:00 | 2015-03-26 22:00:00 | 22:00:00 | Autumn |
2015-03-26 10:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 10:00:00 | 2015-03-26 23:00:00 | 23:00:00 | Autumn |
2015-03-26 11:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 11:00:00 | 2015-03-27 00:00:00 | 00:00:00 | Autumn |
2015-03-26 12:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 12:00:00 | 2015-03-27 01:00:00 | 01:00:00 | Autumn |
2015-03-26 13:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 15876 | 2015-03-26 13:00:00 | 2015-03-27 02:00:00 | 02:00:00 | Autumn |
Table 2.3 gives a list of all circuit labels seen - as a check on what we think they are measuring.
In addition, Emily’s analysis excludes the following circuits:
Downstairs (inc 1 Heat Pump)$2212
- this may well include a number of other appliances;Bedroom & Lounge Heat Pumps$2741
- appears to have many zero values in winter at low temperatures but so do others (see Figure 8.1);Theatre Heat Pump$2740
- generally low level of use (see Figure 8.1);How many of them are there? Here we create two filters - one based on Emily’s suggestions (3 exclusions as above) and one which allows Bedroom & Lounge Heat Pumps$2741
.
# flag the circuits we don't want
hpDT <- hpDT[, ej_includeCircuit := ifelse(circuit == "Downstairs (inc 1 Heat Pump)$2212" |
circuit == "Bedroom & Lounge Heat Pumps$2741" | # why exclude this one?
circuit == "Theatre Heat Pump$2740",
0, # exclude
1)] # sets all others to 1
# flag the circuits we don't want
hpDT <- hpDT[, ba_includeCircuit := ifelse(circuit == "Downstairs (inc 1 Heat Pump)$2212" |
circuit == "Theatre Heat Pump$2740",
0, # exclude
1)] # sets all others to 1
t <- hpDT[, .(nObs = .N), keyby = .(ej_includeCircuit, ba_includeCircuit, circuit)]
kableExtra::kable(t, caption = "Counts of circuit labels for those we exclude (0) and those we include (1)") %>%
kable_styling()
ej_includeCircuit | ba_includeCircuit | circuit | nObs |
---|---|---|---|
0 | 0 | Downstairs (inc 1 Heat Pump)$2212 | 18752 |
0 | 0 | Theatre Heat Pump$2740 | 35181 |
0 | 1 | Bedroom & Lounge Heat Pumps$2741 | 35181 |
1 | 1 | Heat Pump$2092 | 25831 |
1 | 1 | Heat Pump$2148 | 16012 |
1 | 1 | Heat Pump$2598 | 30378 |
1 | 1 | Heat Pump$2758 | 11800 |
1 | 1 | Heat Pump$2826 | 15161 |
1 | 1 | Heat Pump$4124 | 13045 |
1 | 1 | Heat Pump$4130 | 16273 |
1 | 1 | Heat Pump$4134 | 29398 |
1 | 1 | Heat Pump$4150 | 26788 |
1 | 1 | Heat Pump$4154 | 29379 |
1 | 1 | Heat Pump$4160 | 13669 |
1 | 1 | Heat Pump$4175 | 14779 |
1 | 1 | Heat Pump$4190 | 22913 |
1 | 1 | Heat Pump$4196 | 9023 |
1 | 1 | Heat Pump$4204 | 29351 |
1 | 1 | Heat Pump$4223 | 12190 |
1 | 1 | Upstairs Heat Pumps$2211 | 36275 |
Table 2.4 gives summary statistics for heat pump demand for all circuits and all households (id).
t <- hpDT[, .(nObs = .N, meanW = mean(powerW), minW = min(powerW), maxW = max(powerW)),
keyby = .(id, circuit, location)]
kableExtra::kable(t, caption = "Heat pump statistics by household") %>% kable_styling()
id | circuit | location | nObs | meanW | minW | maxW |
---|---|---|---|---|---|---|
rf_08 | Heat Pump$2092 | 23872 | 25831 | 67.119591 | 0 | 1692.450 |
rf_10 | Heat Pump$2598 | 23872 | 30378 | 48.317166 | 0 | 2245.535 |
rf_13 | Downstairs (inc 1 Heat Pump)$2212 | 23872 | 18752 | 276.478907 | 0 | 2919.731 |
rf_13 | Upstairs Heat Pumps$2211 | 23872 | 36275 | 148.184983 | 0 | 2692.076 |
rf_17a | Heat Pump$2148 | 23872 | 16012 | 25.986817 | 0 | 3145.554 |
rf_19 | Bedroom & Lounge Heat Pumps$2741 | 23872 | 35181 | 90.488668 | 0 | 2167.546 |
rf_19 | Theatre Heat Pump$2740 | 23872 | 35181 | 3.030243 | 0 | 800.684 |
rf_25 | Heat Pump$2758 | 23872 | 11800 | 89.182960 | 0 | 1726.893 |
rf_27 | Heat Pump$2826 | 23872 | 15161 | 138.368410 | 0 | 2382.799 |
rf_31 | Heat Pump$4204 | 15876 | 29351 | 127.532600 | 0 | 2109.801 |
rf_32 | Heat Pump$4196 | 15876 | 9023 | 67.796572 | 0 | 1860.604 |
rf_34 | Heat Pump$4223 | 15876 | 12190 | 196.538713 | 0 | 4342.474 |
rf_35 | Heat Pump$4124 | 15876 | 13045 | 58.915105 | 0 | 1127.889 |
rf_36 | Heat Pump$4150 | 15876 | 26788 | 104.854347 | 0 | 2365.296 |
rf_37 | Heat Pump$4134 | 15876 | 29398 | 49.805530 | 0 | 1752.853 |
rf_38 | Heat Pump$4175 | 15876 | 14779 | 247.926515 | 0 | 2431.999 |
rf_41 | Heat Pump$4190 | 15876 | 22913 | 51.088287 | 0 | 3528.475 |
rf_42 | Heat Pump$4130 | 15876 | 16273 | 44.588199 | 0 | 2113.905 |
rf_44 | Heat Pump$4154 | 15876 | 29379 | 97.935001 | 0 | 1998.174 |
rf_45 | Heat Pump$4160 | 15876 | 13669 | 94.366494 | 0 | 2284.892 |
Table 2.5 gives summary statistics for heat pump demand for all circuits in each location.
t <- hpDT[, .(nHouseholds = uniqueN(id), nObs = .N, meanW = mean(powerW), minW = min(powerW),
maxW = max(powerW)), keyby = .(location, season)]
kableExtra::kable(t, caption = "Heat pump statistics by household") %>% kable_styling()
location | season | nHouseholds | nObs | meanW | minW | maxW |
---|---|---|---|---|---|---|
15876 | Spring | 11 | 50789 | 52.30310 | 0 | 3598.496 |
15876 | Summer | 11 | 44567 | 11.23806 | 0 | 3653.164 |
15876 | Autumn | 11 | 59071 | 58.86289 | 0 | 3376.472 |
15876 | Winter | 11 | 62381 | 238.62007 | 0 | 4342.474 |
23872 | Spring | 7 | 56916 | 70.98425 | 0 | 2635.756 |
23872 | Summer | 7 | 54393 | 35.20986 | 0 | 2382.799 |
23872 | Autumn | 7 | 52697 | 54.59329 | 0 | 2451.487 |
23872 | Winter | 7 | 60565 | 194.59470 | 0 | 3145.554 |
Table 2.6 and Figure 2.1 gives the number of households observed for all circuits in each year and season. Small numbers in any given season/year will affect the reliability of the subsequent results. On the basis of this table/plot we may wish to select specific time periods.
t <- hpDT[, .(nHouseholds = uniqueN(id), nObs = .N), keyby = .(year = as.factor(lubridate::year(r_dateTimeLocal)),
month = lubridate::month(r_dateTimeLocal, label = TRUE))]
kableExtra::kable(t, caption = "Household counts by year and season") %>% kable_styling()
year | month | nHouseholds | nObs |
---|---|---|---|
2014 | May | 2 | 96 |
2014 | Jun | 3 | 2620 |
2014 | Jul | 6 | 4373 |
2014 | Aug | 6 | 5952 |
2014 | Sep | 6 | 5740 |
2014 | Oct | 6 | 5949 |
2014 | Nov | 7 | 5739 |
2014 | Dec | 6 | 5944 |
2015 | Jan | 6 | 5952 |
2015 | Feb | 6 | 5376 |
2015 | Mar | 17 | 7567 |
2015 | Apr | 17 | 13613 |
2015 | May | 18 | 13954 |
2015 | Jun | 18 | 14400 |
2015 | Jul | 18 | 14726 |
2015 | Aug | 18 | 14254 |
2015 | Sep | 18 | 13978 |
2015 | Oct | 18 | 14414 |
2015 | Nov | 18 | 13497 |
2015 | Dec | 16 | 12913 |
2016 | Jan | 15 | 12443 |
2016 | Feb | 15 | 11678 |
2016 | Mar | 15 | 12541 |
2016 | Apr | 14 | 9704 |
2016 | May | 14 | 10624 |
2016 | Jun | 14 | 11334 |
2016 | Jul | 15 | 11726 |
2016 | Aug | 14 | 10709 |
2016 | Sep | 13 | 9944 |
2016 | Oct | 14 | 10134 |
2016 | Nov | 12 | 9360 |
2016 | Dec | 12 | 9672 |
2017 | Jan | 12 | 9634 |
2017 | Feb | 12 | 8461 |
2017 | Mar | 11 | 8921 |
2017 | Apr | 11 | 8625 |
2017 | May | 11 | 8187 |
2017 | Jun | 9 | 7200 |
2017 | Jul | 9 | 7440 |
2017 | Aug | 9 | 7213 |
2017 | Sep | 8 | 6462 |
2017 | Oct | 8 | 6696 |
2017 | Nov | 8 | 5769 |
2017 | Dec | 7 | 5301 |
2018 | Jan | 7 | 5626 |
2018 | Feb | 8 | 5973 |
2018 | Mar | 8 | 6647 |
2018 | Apr | 7 | 5475 |
2018 | May | 7 | 5752 |
2018 | Jun | 7 | 5707 |
2018 | Jul | 7 | 5196 |
2018 | Aug | 6 | 168 |
ggplot2::ggplot(t, aes(x = year, y = month, fill = nHouseholds)) + geom_tile()
Figure 2.1: Count of observed households by year and month
Figure 2.2 shows the mean heat pump demand by time of day by season. Does this look right?
hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(hpDT$id),
"\n Time period: ", min(lubridate::date(hpDT$r_dateTimeLocal)), " to ",
max(lubridate::date(hpDT$r_dateTimeLocal)))
ggplot2::ggplot(hpDT, aes(x = powerW)) + geom_histogram() + facet_grid(location ~
season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(hpDT$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Figure 2.2: Histogram mean hourly power demand for Heat Pumps by season and location
Figure 2.3 shows same plot but only for power > 0 W (i.e. on standby or heating) and only for the circuits BA suggests we include (all but 2 - see Table 2.3). Does this look right?
dt <- hpDT[powerW > 0 & ba_includeCircuit == 1]
message("Check min power: ", min(dt$powerW))
## Check min power: 0.0166666666666667
ggplot2::ggplot(dt, aes(x = powerW)) + geom_histogram() + facet_grid(location ~
season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(dt$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Figure 2.3: Histogram mean hourly power demand for Heat Pumps by season and location
Figure 2.4 shows same plot but only for power > 40 W and only for the circuits BA suggests we include. Does this look right?
dt <- hpDT[powerW > 40 & ba_includeCircuit == 1]
message("Check min power: ", min(dt$powerW))
## Check min power: 40.0005
ggplot2::ggplot(dt, aes(x = powerW)) + geom_histogram() + facet_grid(location ~
season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(dt$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Figure 2.4: Histogram mean hourly power demand for Heat Pumps by season and location
Figure 2.5 shows the mean heat pump demand (no power value exclusions) by time of day by season for the circuits BA suggests we include. Does this look right?
Figure 2.5 is important because:
r_dateTimeLocal
r_dateTimeLocal
and used that to make this plotr_dateTime
dt <- hpDT[ba_includeCircuit == 1]
plotDT <- dt[, .(meanW = mean(powerW)), keyby = .(hms, season, location)]
ggplot2::ggplot(plotDT, aes(x = hms, y = meanW, colour = season)) + geom_line() +
facet_grid(location ~ .) + labs(caption = paste0(hpCap, "\n n circuits: ",
uniqueN(dt$circuit), "\n n observations: ", nrow(dt[!is.na(powerW)])))
Figure 2.5: Line plot of mean power demand for Heat Pumps by season and location
Are we seeing some cooling in summer?
We can repeat this plot including only powerW values > 40 W (Figure 2.6). This shows the level of demand when they are on. This plot shows some distinct signs of heat pump use in early evenings in summer producing a slightly earlier ‘peak’ than is the case for winter heating. It has to be said however that the plots are quite messy, no doubt due to the small numbers involved.
dt <- hpDT[powerW > 40 & ba_includeCircuit == 1]
plotDT <- dt[, .(meanW = mean(powerW)), keyby = .(hms, season, location)]
ggplot2::ggplot(plotDT, aes(x = hms, y = meanW, colour = season)) + geom_line() +
facet_grid(location ~ .) + labs(caption = paste0(hpCap, "\n Values > 40W only",
"\n n circuits: ", uniqueN(dt$circuit), "\n n observations: ", nrow(dt[!is.na(powerW)])))
Figure 2.6: Line plot of mean power demand for Heat Pumps by season and location where power > 40W
First we load this using the two seperate files.
message("Loading 15876_hourly_all.txt.gz")
## Loading 15876_hourly_all.txt.gz
temp15876DT <- data.table::fread(paste0(myParams$dPath, "15876_hourly_all.txt.gz"))
message("Loading 23872_hourly_all.txt.gz")
## Loading 23872_hourly_all.txt.gz
temp23872DT <- data.table::fread(paste0(myParams$dPath, "23872_hourly_all.txt.gz"))
fixTempData <- function(dt){
# bunch of stuff we want to do to each dataset
# fix datetime so it matches the hp data
dt <- dt[, r_dateTime := lubridate::dmy_hm(`Date(NZST)`, # so we know it's NZT - shame it had to have () in it!
tz = "Pacific/Auckland")] # just to be sure (if you ran this code in the UK it would set it to UTC by default)
dt <- dt[, hms := hms::as.hms(r_dateTime)]
dt <- GREENGridData::addNZSeason(dt)
dt <- dt[, r_dateTimeLocal := r_dateTime] # for matching
dt <- dt[, location := Station] # for matching
return(dt)
}
temp15876DT <- fixTempData(temp15876DT)
## Warning: 21 failed to parse.
message("Some dates failed to parse - which ones?")
## Some dates failed to parse - which ones?
kableExtra::kable(head(temp15876DT[is.na(r_dateTimeLocal), .(Station, `Date(NZST)`, r_dateTimeLocal)]),
caption = "Non-parsed dates for location = 15876")
Station | Date(NZST) | r_dateTimeLocal |
---|---|---|
15876 | 05/10/1997 02:00 | NA |
15876 | 04/10/1998 02:00 | NA |
15876 | 03/10/1999 02:00 | NA |
15876 | 01/10/2000 02:00 | NA |
15876 | 07/10/2001 02:00 | NA |
15876 | 06/10/2002 02:00 | NA |
temp23872DT <- fixTempData(temp23872DT)
## Warning: 16 failed to parse.
message("Some dates failed to parse - which ones?")
## Some dates failed to parse - which ones?
kableExtra::kable(head(temp23872DT[is.na(r_dateTimeLocal), .(Station, `Date(NZST)`, r_dateTimeLocal)]),
caption = "Non-parsed dates for location = 23872")
Station | Date(NZST) | r_dateTimeLocal |
---|---|---|
23872 | 05/10/2003 02:00 | NA |
23872 | 03/10/2004 02:00 | NA |
23872 | 02/10/2005 02:00 | NA |
23872 | 01/10/2006 02:00 | NA |
23872 | 30/09/2007 02:00 | NA |
23872 | 28/09/2008 02:00 | NA |
message("Looks like the ones around the DST switches. We hate DST almost as much as we hate timezones.")
## Looks like the ones around the DST switches. We hate DST almost as much as we hate timezones.
Convert the incoming character values to numeric as needed.
# combine the temperature data
tempDT <- rbind(temp15876DT, temp23872DT)
tempDT <- tempDT[, `:=`(meanTemp, as.numeric(`Tmean(C)`))]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
message("Note that ", nrow(tempDT[is.na(meanTemp)]), " observations have no Tmean(C) values...")
## Note that 645 observations have no Tmean(C) values...
tempDT <- tempDT[, `:=`(year, lubridate::year(r_dateTime))]
t <- addmargins(with(tempDT[is.na(meanTemp)], table(Station, year)))
kableExtra::kable(t, caption = "Counts of missing temperature observations by location and year (source: 2 files)") %>%
kable_styling()
1998 | 2003 | 2004 | 2006 | 2007 | 2008 | 2010 | 2012 | 2014 | 2015 | 2016 | 2018 | Sum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15876 | 2 | 53 | 491 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 551 |
23872 | 0 | 0 | 0 | 12 | 22 | 0 | 31 | 1 | 1 | 1 | 0 | 26 | 94 |
Sum | 2 | 53 | 491 | 12 | 22 | 1 | 31 | 2 | 1 | 1 | 3 | 26 | 645 |
message("But it's mostly in years we don't have HP data for?")
## But it's mostly in years we don't have HP data for?
Figure 2.7 shows the mean temperature by time of day by season. Does this look right? The alpha values are suggesting it’s getting warmer…
As above this plot is important. If the shape looks right then we have made the correct assumptions about our incoming dateTime values.
plotDT <- tempDT[, .(meanT = mean(meanTemp, na.rm = TRUE)), # crucial otherwise any NA causes the calculation to be NA
keyby = .(hms, season, Station, year)]
tempCap <- paste0("Temperature data source: ?? NIWA ??",
"\n Time period: ", min(lubridate::date(tempDT$r_dateTimeLocal), na.rm = TRUE),
" to ",
max(lubridate::date(tempDT$r_dateTimeLocal), na.rm = TRUE)
)
ggplot2::ggplot(plotDT, aes(x = hms, y = meanT, colour = season, alpha = year)) +
geom_point() +
facet_grid(Station ~ .) +
labs(caption = tempCap)
## Warning: Removed 2 rows containing missing values (geom_point).
Figure 2.7: Line plot of mean temperature by season and location
Now test the single temperature file which contains both.
message("Loading alltemp_hourly.csv.gz")
## Loading alltemp_hourly.csv.gz
tempBothDT <- data.table::fread(paste0(myParams$dPath, "alltemp_hourly.csv.gz"))
kableExtra::kable(head(tempBothDT),
caption = "Head: alltemp_hourly.csv.gz before processing")
V1 | location | r_dateTime | Tmax_C | Tmin_C | Tmean_C | location |
---|---|---|---|---|---|---|
1 | 15876 | 1997-09-11 17:00:00 | 14.4 | 13.8 | 14.2 | 15876 |
2 | 15876 | 1997-09-11 18:00:00 | 14.3 | 12.8 | 13.8 | 15876 |
3 | 15876 | 1997-09-11 19:00:00 | 12.9 | 12.1 | 12.5 | 15876 |
4 | 15876 | 1997-09-11 20:00:00 | 12.4 | 11.9 | 12.2 | 15876 |
5 | 15876 | 1997-09-11 21:00:00 | 11.9 | 11.0 | 11.5 | 15876 |
6 | 15876 | 1997-09-11 22:00:00 | 11.7 | 11.1 | 11.4 | 15876 |
tempBothDT <- tempBothDT[, r_dateTimeLocal := lubridate::ymd_hms(r_dateTime, # looks like NZT as the first rows it match the first few rows of temp15876DT
tz = "Pacific/Auckland")] # just to be sure (if you ran this code in the UK it would set it to UTC by default)
tempBothDT <- tempBothDT[, hms := hms::as.hms(r_dateTimeLocal)]
tempBothDT <- GREENGridData::addNZSeason(tempBothDT)
tempBothDT$V1 <- NULL
tempBothDT <- tempBothDT[, meanTemp := Tmean_C]
kableExtra::kable(head(tempBothDT),
caption = "Head: alltemp_hourly.csv.gz after processing")
location | r_dateTime | Tmax_C | Tmin_C | Tmean_C | location | r_dateTimeLocal | hms | season | meanTemp |
---|---|---|---|---|---|---|---|---|---|
15876 | 1997-09-11 17:00:00 | 14.4 | 13.8 | 14.2 | 15876 | 1997-09-11 17:00:00 | 17:00:00 | Spring | 14.2 |
15876 | 1997-09-11 18:00:00 | 14.3 | 12.8 | 13.8 | 15876 | 1997-09-11 18:00:00 | 18:00:00 | Spring | 13.8 |
15876 | 1997-09-11 19:00:00 | 12.9 | 12.1 | 12.5 | 15876 | 1997-09-11 19:00:00 | 19:00:00 | Spring | 12.5 |
15876 | 1997-09-11 20:00:00 | 12.4 | 11.9 | 12.2 | 15876 | 1997-09-11 20:00:00 | 20:00:00 | Spring | 12.2 |
15876 | 1997-09-11 21:00:00 | 11.9 | 11.0 | 11.5 | 15876 | 1997-09-11 21:00:00 | 21:00:00 | Spring | 11.5 |
15876 | 1997-09-11 22:00:00 | 11.7 | 11.1 | 11.4 | 15876 | 1997-09-11 22:00:00 | 22:00:00 | Spring | 11.4 |
Figure 2.8 shows the mean temperature by time of day by season for the data from . Does this look right? The alpha values are suggesting it’s getting warmer…
Again, this plot tells us if our dateTime assumptions are correct.
# combine the temperature data
message("Note that ", nrow(tempBothDT[is.na(meanTemp)]), " observations have no Tmean(C) values...")
## Note that 634 observations have no Tmean(C) values...
tempBothDT <- tempBothDT[, year := lubridate::year(r_dateTime)]
t <- addmargins(with(tempBothDT[is.na(meanTemp)], table(location,year)))
kableExtra::kable(t, caption = "Counts of missing temperature observations by location and year (source: 1 file)") %>%
kable_styling()
1998 | 2003 | 2004 | 2006 | 2007 | 2008 | 2010 | 2012 | 2014 | 2015 | 2016 | 2018 | Sum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15876 | 2 | 53 | 491 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 551 |
23872 | 0 | 0 | 0 | 12 | 11 | 0 | 31 | 1 | 1 | 1 | 0 | 26 | 83 |
Sum | 2 | 53 | 491 | 12 | 11 | 1 | 31 | 2 | 1 | 1 | 3 | 26 | 634 |
message("But it's mostly in years we don't have HP data for?")
## But it's mostly in years we don't have HP data for?
plotDT <- tempBothDT[, .(meanT = mean(meanTemp, na.rm = TRUE)), # crucial otherwise any NA causes the calculation to be NA
keyby = .(hms, season, location, year)]
tempCap <- paste0("Temperature data source: ?? NIWA ??",
"\n Time period: ", min(lubridate::date(tempBothDT$r_dateTimeLocal), na.rm = TRUE),
" to ",
max(lubridate::date(tempBothDT$r_dateTimeLocal), na.rm = TRUE)
)
ggplot2::ggplot(plotDT, aes(x = hms, y = meanT, colour = season, alpha = year)) +
geom_point() +
facet_grid(location ~ .) +
labs(caption = tempCap)
## Warning: Removed 2 rows containing missing values (geom_point).
Figure 2.8: Line plot of mean temperature by season and location
Now we can directly compare the two sources of temperature data to see if they match. Figure 2.9 does this by linking the two files on r_dateTimeLocal
and location
and then plotting the temperature values from each source.
message("N rows of 2 files merged: ", nrow(tempDT))
## N rows of 2 files merged: 328719
message("N rows of alltemp_hourly.csv.gz : ", nrow(tempBothDT))
## N rows of alltemp_hourly.csv.gz : 324465
tempDT <- tempDT[, `:=`(source, "Separate files")]
tempBothDT <- tempBothDT[, `:=`(source, "alltemp_hourly.csv.gz")]
# let's try just comparing dates
t2fdt <- tempDT[, .(dateTime_char_2f = `Date(NZST)`, r_dateTimeLocal, meanTemp_2files = meanTemp,
location, hms_2files = hms)]
setkey(t2fdt, r_dateTimeLocal, location)
t1fdt <- tempBothDT[, .(dateTime_char_1f = r_dateTime, r_dateTimeLocal, meanTemp_1file = meanTemp,
location, hms_1file = hms)]
setkey(t1fdt, r_dateTimeLocal, location)
dt <- t2fdt[t1fdt] # merge on dateTimeLocal & location
ggplot2::ggplot(dt[!is.na(r_dateTimeLocal)], aes(x = meanTemp_2files, y = meanTemp_1file,
colour = lubridate::hour(r_dateTimeLocal))) + geom_point() + facet_grid(. ~
location) + labs(caption = "Comparison of temperature data from each source by location")
## Warning: Removed 645 rows containing missing values (geom_point).
Figure 2.9: Comparison of temperature data from each source by location
# so some of them match but not all
dt <- dt[, `:=`(diffTemp, meanTemp_2files - meanTemp_1file)]
Bang on. As we can see, it makes no differnce which temperture data is used.
So it looks like the date times are OK for both the HP and temperature data and either version of the temperature data is OK too. Now let’s try to link the temperature data derivd from the 2 files to the heat pump data using station and r_dateTimeLocal.
tempDT <- data.table::setkey(tempDT, location, r_dateTimeLocal) # set key to link
tempDT <- tempDT[, `:=`(hour, lubridate::hour(r_dateTimeLocal))]
hpDT <- data.table::setkey(hpDT, location, r_dateTimeLocal)
hpDT <- hpDT[, `:=`(hour, lubridate::hour(r_dateTimeLocal))]
# remove the NA dateTimes before we do
tempDTclean <- tempDT[!is.na(r_dateTimeLocal)]
linkedDT <- tempDTclean[hpDT] # this will add each row of tempDT to the end of each relevant row of hpDT
Now, because we did not match on hour
we can see if anything failed to match up in terms of time of day. There are (Table 2.11) but it looks like they are to do with the missing temperature data as the NAs in the temperature columns (see Table 2.12).
t <- addmargins(with(linkedDT, table(hour, i.hour, useNA = "always")))
kableExtra::kable(t, caption = "Counts of hours from each source") %>% kable_styling()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | NA | Sum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18180 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18180 |
1 | 0 | 18183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18183 |
2 | 0 | 0 | 18133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18133 |
3 | 0 | 0 | 0 | 18127 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18127 |
4 | 0 | 0 | 0 | 0 | 18177 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18177 |
5 | 0 | 0 | 0 | 0 | 0 | 18188 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18188 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 18186 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18186 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18198 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18198 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18203 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18203 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18179 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18179 |
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18194 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18194 |
11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18158 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18158 |
12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18171 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18171 |
13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18186 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18186 |
14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18193 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18193 |
15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18207 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18207 |
16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18220 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18233 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18233 |
18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18234 | 0 | 0 | 0 | 0 | 0 | 0 | 18234 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18235 | 0 | 0 | 0 | 0 | 0 | 18235 |
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18229 | 0 | 0 | 0 | 0 | 18229 |
21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18227 | 0 | 0 | 0 | 18227 |
22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18209 | 0 | 0 | 18209 |
23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18141 | 0 | 18141 |
NA | 192 | 198 | 244 | 198 | 198 | 198 | 211 | 200 | 197 | 221 | 209 | 228 | 219 | 209 | 204 | 191 | 184 | 184 | 184 | 188 | 187 | 187 | 185 | 189 | 0 | 4805 |
Sum | 18372 | 18381 | 18377 | 18325 | 18375 | 18386 | 18397 | 18398 | 18400 | 18400 | 18403 | 18386 | 18390 | 18395 | 18397 | 18398 | 18404 | 18417 | 18418 | 18423 | 18416 | 18414 | 18394 | 18330 | 0 | 441396 |
t <- head(linkedDT[is.na(hour)])
kableExtra::kable(t, caption = "Rows of data where hour (from temperature data file) = NA") %>%
kable_styling()
Station | Date(NZST) | Tmax(C) | Period(Hrs) | Period(Hrs).1 | Period(Hrs).2 | Period(Hrs).3 | Tmin(C) | Tgmin(C) | Tmean(C) | RHmean(%) | Freq | r_dateTime | hms | season | r_dateTimeLocal | location | meanTemp | year | source | hour | i.r_dateTime | powerW | circuit | id | r_dateTimeChar | i.hms | i.season | ej_includeCircuit | ba_includeCircuit | i.hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-04-05 02:00:00 | 15876 | NA | NA | NA | NA | 2015-04-04 13:00:00 | 0.00000 | Heat Pump$4204 | rf_31 | 2015-04-04 13:00:00 | 02:00:00 | Autumn | 1 | 1 | 2 |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-04-05 02:00:00 | 15876 | NA | NA | NA | NA | 2015-04-04 13:00:00 | 0.00000 | Heat Pump$4223 | rf_34 | 2015-04-04 13:00:00 | 02:00:00 | Autumn | 1 | 1 | 2 |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-04-05 02:00:00 | 15876 | NA | NA | NA | NA | 2015-04-04 13:00:00 | 0.20000 | Heat Pump$4124 | rf_35 | 2015-04-04 13:00:00 | 02:00:00 | Autumn | 1 | 1 | 2 |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-04-05 02:00:00 | 15876 | NA | NA | NA | NA | 2015-04-04 13:00:00 | 0.00000 | Heat Pump$4154 | rf_44 | 2015-04-04 13:00:00 | 02:00:00 | Autumn | 1 | 1 | 2 |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-05-29 09:00:00 | 15876 | NA | NA | NA | NA | 2015-05-28 21:00:00 | 225.59650 | Heat Pump$4204 | rf_31 | 2015-05-28 21:00:00 | 09:00:00 | Autumn | 1 | 1 | 9 |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2015-05-29 09:00:00 | 15876 | NA | NA | NA | NA | 2015-05-28 21:00:00 | 31.88967 | Heat Pump$4196 | rf_32 | 2015-05-28 21:00:00 | 09:00:00 | Autumn | 1 | 1 | 9 |
So now we think we’ve linked the data correctly, let’s do some exploratory analysis. In this section we keep the observations where power < 40 for the circuits defined in ba_includeCircuit
(see Table 2.3).
First we just plot all the hourly data for demand vs mean temperature (Figure 3.1).
baFilteredDT <- linkedDT[ba_includeCircuit == 1 & !is.na(meanTemp)]
t <- baFilteredDT[, .(meanW = mean(powerW), minW = min(powerW), maxW = max(powerW),
nObs = .N, nCircuits = uniqueN(circuit), nHouseholds = uniqueN(id)), keyby = .(location)]
kableExtra::kable(t, caption = "Summary power demand statistics (all data)") %>%
kable_styling()
location | meanW | minW | maxW | nObs | nCircuits | nHouseholds |
---|---|---|---|---|---|---|
15876 | 99.47134 | 0 | 4342.474 | 216263 | 11 | 11 |
23872 | 90.76957 | 0 | 3145.554 | 167444 | 7 | 7 |
hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(baFilteredDT$id),
"\n Time period: ", min(lubridate::date(baFilteredDT$r_dateTimeLocal)),
" to ", max(lubridate::date(baFilteredDT$r_dateTimeLocal)), "\n n circuits: ",
uniqueN(baFilteredDT$circuit))
tempCap <- paste0("Temperature data source: ?? NIWA ??")
combinedCaption <- paste0(hpCap, "\n", tempCap)
ggplot2::ggplot(baFilteredDT[!is.na(meanTemp)], aes(x = meanTemp, y = powerW/1000,
colour = id)) + geom_point() + facet_grid(season ~ location) + labs(caption = combinedCaption,
x = "Mean temperature", y = "Mean half-hourly kW")
Figure 3.1: Plots of temperature vs demand by season and location
That doesn’t tell us a lot - although these is indeed a suggestion of their use for cooling in summer in both locations by some households. This should remind us that this analysis is based on a very small sample of households. So let’s try to re-create the binned box plots. In this case it is not helpful to distinguish between households but we should be aware that the patterns we see may be driven by just one or two households’ behaviour.
ggplot2::ggplot(linkedDT[!is.na(meanTemp)],
mapping = aes(x = meanTemp, y = powerW/1000)) +
geom_boxplot(aes(cut_width(meanTemp,
width = 2, # change the width to get 'wider' bins
boundary = 0))) +
facet_grid(season ~ location) +
labs(caption = combinedCaption,
x = "Mean temperature",
y = "Mean half-hourly kW")
Figure 3.2: Box plots of temperature vs demand by season and location
Figure 3.2 shows binned box plots of power against demand and suggest more clearly that there is a cooling effect for households in summer - especially in 15876. Or perhaps we should more accurately say there is evidence of a summer cooling effect in the 11 (!) households who live in 15876. And who would base a claim on that many households?
Anyway, just for good measure lets try to re-create the U shaped plots Emily produced by ignoring season.
ggplot2::ggplot(linkedDT[!is.na(meanTemp)], mapping = aes(x = meanTemp, y = powerW/1000)) +
geom_boxplot(aes(cut_width(meanTemp,
width = 2, # change the width to get 'wider' bins
boundary = 0))) +
facet_grid(. ~ location) +
labs(caption = combinedCaption,
x = "Mean temperature",
y = "Mean half-hourly kW")
Figure 3.3: Box plots of temperature vs demand by season and location
Although we can clearly see the increased demand for heat as temperatures decline below 0, the increase in the upper temperature ranges is probably obscured by the large number of zero values. It is arguable whether leaving the zeros out is valid since they could be interpreted as reflecting a household’s lack of need for cooling. Were we to only include values > 40 W (to exclude 0 and standby power values) we would be analysing (and subsequently modelling) only those households who actually responded to the temperature change by demanding cooling - which will not be the case for all households in the population.
Notwithstanding the previous argument re data exclusion, in this section we test Emily’s suggestion to only keep heat pump data where power > 40 for the circuits defined in ēj_includeCircuit
(see Table 2.3).
First we just plot all the hourly data for demand vs mean temperature (Figure 4.1).
ejFilteredDT <- linkedDT[powerW > 40 & ej_includeCircuit == 1 & !is.na(meanTemp)]
t <- ejFilteredDT[, .(meanW = mean(powerW), minW = min(powerW), maxW = max(powerW),
nObs = .N, nCircuits = uniqueN(circuit), nHouseholds = uniqueN(id)), keyby = .(location)]
kableExtra::kable(t, caption = "Summary power demand staistics (filtered)") %>%
kable_styling()
location | meanW | minW | maxW | nObs | nCircuits | nHouseholds |
---|---|---|---|---|---|---|
15876 | 532.1126 | 40.0005 | 4342.474 | 39840 | 11 | 11 |
23872 | 608.7093 | 40.0305 | 3145.554 | 19053 | 6 | 6 |
hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(ejFilteredDT$id),
"\n Time period: ", min(lubridate::date(ejFilteredDT$r_dateTimeLocal)),
" to ", max(lubridate::date(ejFilteredDT$r_dateTimeLocal)), "\n n circuits: ",
uniqueN(ejFilteredDT$circuit))
tempCap <- paste0("Temperature data source: ?? NIWA ??")
combinedCaption <- paste0(hpCap, "\n", tempCap)
ggplot2::ggplot(ejFilteredDT, aes(x = meanTemp, y = powerW/1000, colour = id)) +
geom_point() + facet_grid(season ~ location) + labs(caption = combinedCaption,
x = "Mean temperature", y = "Mean half-hourly kW")
Figure 4.1: Plots of temperature vs demand by season and location
Again, that doesn’t tell us a lot - although these is indeed a suggestion of their use for cooling in summer in both locations. So let’s re-try to create the binned box plots.
ggplot2::ggplot(ejFilteredDT, mapping = aes(x = meanTemp, y = powerW/1000)) +
geom_boxplot(aes(cut_width(meanTemp,
width = 2, # change the width to get 'wider' bins
boundary = 0))) +
facet_grid(season ~ location) +
labs(caption = combinedCaption,
x = "Mean temperature",
y = "Mean half-hourly kW")
Figure 4.2: Box plots of temperature vs demand by season and location
Figure 4.2 shows binned box plots of power against demand. As we would expect, the plot suggests more clearly that there is a cooling effect for households in summer when we remove 0 and power < 40 W values.
Anyway, lets again try to re-create the U shaped plots Emily produced by ignoring season.
ggplot2::ggplot(ejFilteredDT, mapping = aes(x = meanTemp, y = powerW/1000)) +
geom_boxplot(aes(cut_width(meanTemp,
width = 2, # change the width to get 'wider' bins
boundary = 0))) +
facet_grid(. ~ location) +
labs(caption = combinedCaption,
x = "Mean temperature",
y = "Mean half-hourly kW")
Figure 4.3: Box plots of temperature vs demand by season and location
The increase in demand for power as the temperature rises is now clearer.
But this is fairly obvious since we are just looking at the households who responded to the temperature changes…
tbc
Needs to include the 0 values (see discussion above)?
If the regression model assumes independence of observations then this is violated if we have multiple observations for each house (which we do whether we just use the ‘best’ year or not…). The short-term fix would be to use robust standard errors in model reporting or maybe a fixed-effects model (?) but that might be well beyond scope of the report!
t <- proc.time() - startTime
elapsed <- t[[3]]
Analysis completed in 122.95 seconds ( 2.05 minutes) using knitr in RStudio with R version 3.5.2 (2018-12-20) running on x86_64-apple-darwin15.6.0.
list packages used & cite
Figure 8.1 shows the relationship between mean temperature and mean power for all hourly observations for all circuits colour coded by season. Smoothed fit lines have been added using mgcv:gam()
curves due to the size of the dataset (see ?geom_smooth
and [@ggplot]).
ggplot2::ggplot(linkedDT, aes(x = meanTemp, y = powerW, colour = season)) +
geom_point(alpha = 0.5) + geom_smooth() + facet_wrap(. ~ circuit)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4939 rows containing non-finite values (stat_smooth).
## Warning: Removed 4939 rows containing missing values (geom_point).
Figure 8.1: Pairs plot of temperature vs heat pump power demand for all households
Depending on input files choice this will vary.
head(hpDT_orig)
## V1 r_dateTime powerW circuit id location
## 1: 1 2015-03-26 04:00:00 0.00000 Heat Pump$4204 rf_31 15876
## 2: 2 2015-03-26 05:00:00 33.68717 Heat Pump$4204 rf_31 15876
## 3: 3 2015-03-26 06:00:00 0.00000 Heat Pump$4204 rf_31 15876
## 4: 4 2015-03-26 07:00:00 0.00000 Heat Pump$4204 rf_31 15876
## 5: 5 2015-03-26 08:00:00 0.00000 Heat Pump$4204 rf_31 15876
## 6: 6 2015-03-26 09:00:00 0.00000 Heat Pump$4204 rf_31 15876
skimr::skim(hpDT_orig)
## Skim summary statistics
## n obs: 441379
## n variables: 6
##
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## circuit 0 441379 441379 14 33 0 20
## id 0 441379 441379 5 6 0 18
## r_dateTime 0 441379 441379 19 19 0 36583
##
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## location 0 441379 441379 19944.32 3997.39 15876 15876
## V1 0 441379 441379 220690 127415.29 1 110345.5
## p50 p75 p100 hist
## 23872 23872 23872 ▇▁▁▁▁▁▁▇
## 220690 331034.5 441379 ▇▇▇▇▇▇▇▇
##
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## powerW 0 441379 441379 95.47 271.51 0 0 0 19.02 4342.47
## hist
## ▇▁▁▁▁▁▁▁
skimr::skim(hpDT)
## Skim summary statistics
## n obs: 441379
## n variables: 12
##
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## circuit 0 441379 441379 14 33 0 20
## id 0 441379 441379 5 6 0 18
## r_dateTimeChar 0 441379 441379 19 19 0 36583
##
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median n_unique
## hms 0 441379 441379 0 secs 82800 secs 12:00:00 24
##
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## season 0 441379 441379 4
## top_counts ordered
## Win: 122946, Aut: 111768, Spr: 107705, Sum: 98960 FALSE
##
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75
## hour 0 441379 441379 11.5 6.92 0 6 12 18
## location 0 441379 441379 19944.32 3997.39 15876 15876 23872 23872
## p100 hist
## 23 ▇▇▇▇▇▇▇▇
## 23872 ▇▁▁▁▁▁▁▇
##
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75
## ba_includeCircuit 0 441379 441379 0.88 0.33 0 1 1 1
## ej_includeCircuit 0 441379 441379 0.8 0.4 0 1 1 1
## powerW 0 441379 441379 95.47 271.51 0 0 0 19.02
## p100 hist
## 1 ▁▁▁▁▁▁▁▇
## 1 ▂▁▁▁▁▁▁▇
## 4342.47 ▇▁▁▁▁▁▁▁
##
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## r_dateTime 0 441379 441379 2014-05-28 2018-08-01 2016-04-08
## r_dateTimeLocal 0 441379 441379 2014-05-29 2018-08-01 2016-04-09
## n_unique
## 36583
## 36583
File 1:
skimr::skim(temp15876DT)
## Skim summary statistics
## n obs: 185459
## n variables: 17
##
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## Date(NZST) 0 185459 185459 16 16 0 185455
## Freq 0 185459 185459 1 1 0 1
## Period(Hrs) 0 185459 185459 1 1 0 2
## Period(Hrs).1 0 185459 185459 1 1 0 2
## Period(Hrs).2 0 185459 185459 1 1 0 2
## Period(Hrs).3 0 185459 185459 1 1 0 2
## RHmean(%) 0 185459 185459 1 3 0 88
## Tgmin(C) 0 185459 185459 1 4 0 569
## Tmax(C) 0 185459 185459 1 4 0 385
## Tmean(C) 0 185459 185459 1 5 0 382
## Tmin(C) 0 185459 185459 1 5 0 378
##
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median n_unique
## hms 21 185438 185459 0 secs 82800 secs 43200 secs 24
##
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## season 0 185459 185459 4
## top_counts ordered
## Spr: 47102, Win: 46360, Aut: 46082, Sum: 45915 FALSE
##
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## location 0 185459 185459 15876 0 15876 15876 15876 15876 15876
## Station 0 185459 185459 15876 0 15876 15876 15876 15876 15876
## hist
## ▁▁▁▇▁▁▁▁
## ▁▁▁▇▁▁▁▁
##
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## r_dateTime 21 185438 185459 1997-09-11 2019-01-06 2008-05-19
## r_dateTimeLocal 21 185438 185459 1997-09-11 2019-01-06 2008-05-19
## n_unique
## 185434
## 185434
File 2:
skimr::skim(temp23872DT)
## Skim summary statistics
## n obs: 143260
## n variables: 17
##
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## Date(NZST) 0 143260 143260 16 16 0 139010
## Freq 0 143260 143260 1 1 0 1
## Period(Hrs) 0 143260 143260 1 1 0 2
## Period(Hrs).1 0 143260 143260 1 1 0 2
## Period(Hrs).2 0 143260 143260 1 1 0 2
## Period(Hrs).3 0 143260 143260 1 1 0 2
## RHmean(%) 0 143260 143260 1 3 0 85
## Tgmin(C) 0 143260 143260 1 4 0 511
## Tmax(C) 0 143260 143260 1 4 0 307
## Tmean(C) 0 143260 143260 1 4 0 303
## Tmin(C) 0 143260 143260 1 4 0 302
##
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median n_unique
## hms 16 143244 143260 0 secs 82800 secs 43200 secs 24
##
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## season 0 143260 143260 4
## top_counts ordered
## Aut: 36423, Sum: 36244, Win: 36023, Spr: 34570 FALSE
##
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## location 0 143260 143260 23872 0 23872 23872 23872 23872 23872
## Station 0 143260 143260 23872 0 23872 23872 23872 23872 23872
## hist
## ▁▁▁▇▁▁▁▁
## ▁▁▁▇▁▁▁▁
##
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## r_dateTime 16 143244 143260 2002-12-06 2019-01-06 2010-09-02
## r_dateTimeLocal 16 143244 143260 2002-12-06 2019-01-06 2010-09-02
## n_unique
## 138994
## 138994
Combined file:
skimr::skim(tempBothDT)
## Skim summary statistics
## n obs: 324465
## n variables: 12
##
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## r_dateTime 37 324428 324465 19 19 0 186732
## source 0 324465 324465 21 21 0 1
##
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median n_unique
## hms 37 324428 324465 0 secs 82800 secs 43200 secs 24
##
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n n_unique
## season 0 324465 324465 4
## top_counts ordered
## Spr: 81672, Win: 81298, Sum: 81200, Aut: 80295 FALSE
##
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## location 0 324465 324465 19301.71 3956.83 15876 15876 15876
## location.1 0 324465 324465 19301.71 3956.83 15876 15876 15876
## p75 p100 hist
## 23872 23872 ▇▁▁▁▁▁▁▆
## 23872 23872 ▇▁▁▁▁▁▁▆
##
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75
## meanTemp 634 323831 324465 12.63 5.3 -10 9.1 12.6 16.3
## Tmax_C 503 323962 324465 13.26 5.33 -9.5 9.7 13.2 16.9
## Tmean_C 634 323831 324465 12.63 5.3 -10 9.1 12.6 16.3
## Tmin_C 577 323888 324465 12.02 5.29 -12.2 8.5 12 15.7
## year 37 324428 324465 2008.96 5.69 1997 2004 2009 2014
## p100 hist
## 35.6 ▁▁▃▇▇▂▁▁
## 36.4 ▁▁▃▇▇▃▁▁
## 35.6 ▁▁▃▇▇▂▁▁
## 34.2 ▁▁▂▇▇▃▁▁
## 2019 ▃▅▇▇▆▇▇▅
##
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
## variable missing complete n min max median
## r_dateTimeLocal 37 324428 324465 1997-09-11 2019-01-06 2009-08-24
## n_unique
## 186732