1 Setup

This is where we do setup stuff like attaching libraries etc.

# Set start time ----
startTime <- proc.time()

# Libraries ----
library(here)
projRoot <- here::here()

# Emily's libraries and functions
source(paste0(projRoot, "/R/libraries.R"))  # Emily's recommended libraries
source(paste0(projRoot, "/R/functions.R"))  # Emily's functions

# Local libraries ----
library(kableExtra)  # for fancy tables
library(skimr)  # for data descriptors (skim)
# any extras?

# Local functions ----

Session info (for reference)

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bookdown_0.9      rmarkdown_1.11    skimr_1.0.5      
##  [4] kableExtra_1.0.1  ggforce_0.2.1     viridis_0.5.1    
##  [7] viridisLite_0.3.0 GREENGridData_1.0 data.table_1.12.0
## [10] lubridate_1.7.4   gridExtra_2.3     ggplot2_3.1.0    
## [13] dplyr_0.8.0.1     readr_1.3.1       usethis_1.4.0    
## [16] devtools_2.0.1    here_0.1         
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0        tidyr_0.8.3       pkgload_1.0.2    
##  [4] splines_3.5.2     R.utils_2.8.0     assertthat_0.2.0 
##  [7] highr_0.7         cellranger_1.1.0  yaml_2.2.0       
## [10] remotes_2.0.2     progress_1.2.0    sessioninfo_1.1.1
## [13] lattice_0.20-38   pillar_1.3.1      backports_1.1.3  
## [16] glue_1.3.0        digest_0.6.18     polyclip_1.10-0  
## [19] rvest_0.3.2       colorspace_1.4-0  Matrix_1.2-16    
## [22] htmltools_0.3.6   R.oo_1.22.0       plyr_1.8.4       
## [25] pkgconfig_2.0.2   purrr_0.3.1       scales_1.0.0     
## [28] webshot_0.5.1     processx_3.3.0    tweenr_1.0.1     
## [31] tibble_2.0.1      mgcv_1.8-27       farver_1.1.0     
## [34] withr_2.1.2       lazyeval_0.2.1    cli_1.0.1        
## [37] magrittr_1.5      crayon_1.3.4      readxl_1.3.0     
## [40] memoise_1.1.0     evaluate_0.13     ps_1.3.0         
## [43] R.methodsS3_1.7.1 fs_1.2.6          nlme_3.1-137     
## [46] MASS_7.3-51.1     xml2_1.2.0        pkgbuild_1.0.2   
## [49] tools_3.5.2       prettyunits_1.0.2 hms_0.4.2        
## [52] formatR_1.6       stringr_1.4.0     munsell_0.5.0    
## [55] callr_3.1.1       compiler_3.5.2    rlang_0.3.1      
## [58] grid_3.5.2        rstudioapi_0.9.0  labeling_0.3     
## [61] testthat_2.0.1    gtable_0.2.0      reshape2_1.4.3   
## [64] R6_2.4.0          knitr_1.22        rprojroot_1.3-2  
## [67] desc_1.2.0        stringi_1.3.1     Rcpp_1.0.0       
## [70] tidyselect_0.2.5  xfun_0.5

2 Load final data

2.1 Heat pumps

First we load the selected pre-extracted heat pump data and run some basic tests.

message("Loading", params$dataFile)
## Loading/Users/ben/Data/NZ_GREENGrid/safe/emilyJiang/allHouseholds_hourly.csv.gz
isTesting <- 0

# for testing ---- isTesting <- 1

if (isTesting) {
    params <- list()  # gets set by knit
    # params$dataFile <- paste0(myParams$dPath, 'allHouseholds_hourly.csv.gz')
    params$dataFile <- "~/Data/NZ_GREENGrid/safe/gridSpy/1min/dataExtracts/Heat Pump_2015-04-01_2016-03-31_observations.csv.gz"
    params$dataSource <- "benData"
}

hpDT_orig <- data.table::fread(params$dataFile)

hpDT <- hpDT_orig

if (params$dataSource == "emilyData") {
    # drop V1 (purpose?)
    hpDT$V1 <- NULL
    # fix the date - remember it's stored as UTC
    hpDT <- hpDT[, `:=`(r_dateTimeChar, r_dateTime)]
    hpDT <- hpDT[, `:=`(r_dateTime, lubridate::as_datetime(r_dateTime))]
    hpDT <- hpDT[, `:=`(r_dateTimeLocal, lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland"))]  # convert to local time from UTC
}

if (params$dataSource == "benData") {
    # minute level observations, needs processing & summarising calculate hourly
    # mean
    hpDT <- hpDT[, `:=`(r_dateTime, lubridate::as_datetime(r_dateTime))]  # make sure date set to POSIXct
    hpDT <- hpDT[, `:=`(r_dateTimeHour, lubridate::floor_date(r_dateTime, "hour"))]  # cuts back to hour at the start. I assume this is what Emily did
    hpDTh <- hpDT[, .(powerW = mean(powerW)), keyby = .(r_dateTime = r_dateTimeHour, 
        circuit, linkID)]  # calculate hourly mean power (temperature data is hourly means)
    # proceed as above
    hpDTh <- hpDTh[, `:=`(r_dateTimeLocal, lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland"))]  # convert to local time from UTC
    # merge location table
    locationDT <- data.table::fread(paste0(myParams$dPath, "locationTable.csv"))
    setkey(locationDT, linkID)
    setkey(hpDTh, linkID)
    hpDT <- hpDTh[locationDT[, .(linkID, location)]]
    hpDT$id <- hpDT$linkID
}


# location is constant by id so we could just create a table and drop that
# columhn too but...

hpDT <- hpDT[, `:=`(hms, hms::as.hms(r_dateTimeLocal))]  # useful for plots and for checking we have the right timezone!

hpDT <- GREENGridData::addNZSeason(hpDT, r_dateTime = r_dateTimeLocal)

If any of the r_dateTimes failed to parse they will have the local time set to NA so we can see what happened (if anything) in Table 2.1. If there are no rows in this table then all r_dateTimes parsed.

kableExtra::kable(head(hpDT[is.na(r_dateTime)], 10), caption = "First 10 rows of Heat Pump data where r_dateTime failed to parse (could be none at all)") %>% 
    kable_styling()
Table 2.1: First 10 rows of Heat Pump data where r_dateTime failed to parse (could be none at all)
r_dateTime powerW circuit id location r_dateTimeChar r_dateTimeLocal hms season

Table 2.2 shows the first 10 rows of slighty processed data. Note that local time ias set to NZ time while the original r_dateTime is in UTC. This is important for linking the temperature data correctly!

kableExtra::kable(head(hpDT, 10), caption = "First 10 rows of (slightly) processed Heat Pump data") %>% 
    kable_styling()
Table 2.2: First 10 rows of (slightly) processed Heat Pump data
r_dateTime powerW circuit id location r_dateTimeChar r_dateTimeLocal hms season
2015-03-26 04:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 04:00:00 2015-03-26 17:00:00 17:00:00 Autumn
2015-03-26 05:00:00 33.68717 Heat Pump$4204 rf_31 15876 2015-03-26 05:00:00 2015-03-26 18:00:00 18:00:00 Autumn
2015-03-26 06:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 06:00:00 2015-03-26 19:00:00 19:00:00 Autumn
2015-03-26 07:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 07:00:00 2015-03-26 20:00:00 20:00:00 Autumn
2015-03-26 08:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 08:00:00 2015-03-26 21:00:00 21:00:00 Autumn
2015-03-26 09:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 09:00:00 2015-03-26 22:00:00 22:00:00 Autumn
2015-03-26 10:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 10:00:00 2015-03-26 23:00:00 23:00:00 Autumn
2015-03-26 11:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 11:00:00 2015-03-27 00:00:00 00:00:00 Autumn
2015-03-26 12:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 12:00:00 2015-03-27 01:00:00 01:00:00 Autumn
2015-03-26 13:00:00 0.00000 Heat Pump$4204 rf_31 15876 2015-03-26 13:00:00 2015-03-27 02:00:00 02:00:00 Autumn

Table 2.3 gives a list of all circuit labels seen - as a check on what we think they are measuring.

In addition, Emily’s analysis excludes the following circuits:

  • Downstairs (inc 1 Heat Pump)$2212 - this may well include a number of other appliances;
  • Bedroom & Lounge Heat Pumps$2741 - appears to have many zero values in winter at low temperatures but so do others (see Figure 8.1);
  • Theatre Heat Pump$2740 - generally low level of use (see Figure 8.1);

How many of them are there? Here we create two filters - one based on Emily’s suggestions (3 exclusions as above) and one which allows Bedroom & Lounge Heat Pumps$2741.

# flag the circuits we don't want
hpDT <- hpDT[, ej_includeCircuit := ifelse(circuit == "Downstairs (inc 1 Heat Pump)$2212" |
                                          circuit == "Bedroom & Lounge Heat Pumps$2741" | # why exclude this one?
                                          circuit == "Theatre Heat Pump$2740",
                                        0, # exclude 
                                        1)] # sets all others to 1

# flag the circuits we don't want
hpDT <- hpDT[, ba_includeCircuit := ifelse(circuit == "Downstairs (inc 1 Heat Pump)$2212" |
                                          circuit == "Theatre Heat Pump$2740",
                                        0, # exclude 
                                        1)] # sets all others to 1


t <- hpDT[, .(nObs = .N), keyby = .(ej_includeCircuit, ba_includeCircuit, circuit)]

kableExtra::kable(t, caption = "Counts of circuit labels for those we exclude (0) and those we include (1)") %>% 
  kable_styling()
Table 2.3: Counts of circuit labels for those we exclude (0) and those we include (1)
ej_includeCircuit ba_includeCircuit circuit nObs
0 0 Downstairs (inc 1 Heat Pump)$2212 18752
0 0 Theatre Heat Pump$2740 35181
0 1 Bedroom & Lounge Heat Pumps$2741 35181
1 1 Heat Pump$2092 25831
1 1 Heat Pump$2148 16012
1 1 Heat Pump$2598 30378
1 1 Heat Pump$2758 11800
1 1 Heat Pump$2826 15161
1 1 Heat Pump$4124 13045
1 1 Heat Pump$4130 16273
1 1 Heat Pump$4134 29398
1 1 Heat Pump$4150 26788
1 1 Heat Pump$4154 29379
1 1 Heat Pump$4160 13669
1 1 Heat Pump$4175 14779
1 1 Heat Pump$4190 22913
1 1 Heat Pump$4196 9023
1 1 Heat Pump$4204 29351
1 1 Heat Pump$4223 12190
1 1 Upstairs Heat Pumps$2211 36275

Table 2.4 gives summary statistics for heat pump demand for all circuits and all households (id).

t <- hpDT[, .(nObs = .N, meanW = mean(powerW), minW = min(powerW), maxW = max(powerW)), 
    keyby = .(id, circuit, location)]

kableExtra::kable(t, caption = "Heat pump statistics by household") %>% kable_styling()
Table 2.4: Heat pump statistics by household
id circuit location nObs meanW minW maxW
rf_08 Heat Pump$2092 23872 25831 67.119591 0 1692.450
rf_10 Heat Pump$2598 23872 30378 48.317166 0 2245.535
rf_13 Downstairs (inc 1 Heat Pump)$2212 23872 18752 276.478907 0 2919.731
rf_13 Upstairs Heat Pumps$2211 23872 36275 148.184983 0 2692.076
rf_17a Heat Pump$2148 23872 16012 25.986817 0 3145.554
rf_19 Bedroom & Lounge Heat Pumps$2741 23872 35181 90.488668 0 2167.546
rf_19 Theatre Heat Pump$2740 23872 35181 3.030243 0 800.684
rf_25 Heat Pump$2758 23872 11800 89.182960 0 1726.893
rf_27 Heat Pump$2826 23872 15161 138.368410 0 2382.799
rf_31 Heat Pump$4204 15876 29351 127.532600 0 2109.801
rf_32 Heat Pump$4196 15876 9023 67.796572 0 1860.604
rf_34 Heat Pump$4223 15876 12190 196.538713 0 4342.474
rf_35 Heat Pump$4124 15876 13045 58.915105 0 1127.889
rf_36 Heat Pump$4150 15876 26788 104.854347 0 2365.296
rf_37 Heat Pump$4134 15876 29398 49.805530 0 1752.853
rf_38 Heat Pump$4175 15876 14779 247.926515 0 2431.999
rf_41 Heat Pump$4190 15876 22913 51.088287 0 3528.475
rf_42 Heat Pump$4130 15876 16273 44.588199 0 2113.905
rf_44 Heat Pump$4154 15876 29379 97.935001 0 1998.174
rf_45 Heat Pump$4160 15876 13669 94.366494 0 2284.892

Table 2.5 gives summary statistics for heat pump demand for all circuits in each location.

t <- hpDT[, .(nHouseholds = uniqueN(id), nObs = .N, meanW = mean(powerW), minW = min(powerW), 
    maxW = max(powerW)), keyby = .(location, season)]

kableExtra::kable(t, caption = "Heat pump statistics by household") %>% kable_styling()
Table 2.5: Heat pump statistics by household
location season nHouseholds nObs meanW minW maxW
15876 Spring 11 50789 52.30310 0 3598.496
15876 Summer 11 44567 11.23806 0 3653.164
15876 Autumn 11 59071 58.86289 0 3376.472
15876 Winter 11 62381 238.62007 0 4342.474
23872 Spring 7 56916 70.98425 0 2635.756
23872 Summer 7 54393 35.20986 0 2382.799
23872 Autumn 7 52697 54.59329 0 2451.487
23872 Winter 7 60565 194.59470 0 3145.554

Table 2.6 and Figure 2.1 gives the number of households observed for all circuits in each year and season. Small numbers in any given season/year will affect the reliability of the subsequent results. On the basis of this table/plot we may wish to select specific time periods.

t <- hpDT[, .(nHouseholds = uniqueN(id), nObs = .N), keyby = .(year = as.factor(lubridate::year(r_dateTimeLocal)), 
    month = lubridate::month(r_dateTimeLocal, label = TRUE))]

kableExtra::kable(t, caption = "Household counts by year and season") %>% kable_styling()
Table 2.6: Household counts by year and season
year month nHouseholds nObs
2014 May 2 96
2014 Jun 3 2620
2014 Jul 6 4373
2014 Aug 6 5952
2014 Sep 6 5740
2014 Oct 6 5949
2014 Nov 7 5739
2014 Dec 6 5944
2015 Jan 6 5952
2015 Feb 6 5376
2015 Mar 17 7567
2015 Apr 17 13613
2015 May 18 13954
2015 Jun 18 14400
2015 Jul 18 14726
2015 Aug 18 14254
2015 Sep 18 13978
2015 Oct 18 14414
2015 Nov 18 13497
2015 Dec 16 12913
2016 Jan 15 12443
2016 Feb 15 11678
2016 Mar 15 12541
2016 Apr 14 9704
2016 May 14 10624
2016 Jun 14 11334
2016 Jul 15 11726
2016 Aug 14 10709
2016 Sep 13 9944
2016 Oct 14 10134
2016 Nov 12 9360
2016 Dec 12 9672
2017 Jan 12 9634
2017 Feb 12 8461
2017 Mar 11 8921
2017 Apr 11 8625
2017 May 11 8187
2017 Jun 9 7200
2017 Jul 9 7440
2017 Aug 9 7213
2017 Sep 8 6462
2017 Oct 8 6696
2017 Nov 8 5769
2017 Dec 7 5301
2018 Jan 7 5626
2018 Feb 8 5973
2018 Mar 8 6647
2018 Apr 7 5475
2018 May 7 5752
2018 Jun 7 5707
2018 Jul 7 5196
2018 Aug 6 168
ggplot2::ggplot(t, aes(x = year, y = month, fill = nHouseholds)) + geom_tile()
Count of observed households by year and month

Figure 2.1: Count of observed households by year and month

Figure 2.2 shows the mean heat pump demand by time of day by season. Does this look right?

hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(hpDT$id), 
    "\n Time period: ", min(lubridate::date(hpDT$r_dateTimeLocal)), " to ", 
    max(lubridate::date(hpDT$r_dateTimeLocal)))

ggplot2::ggplot(hpDT, aes(x = powerW)) + geom_histogram() + facet_grid(location ~ 
    season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(hpDT$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.2: Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.3 shows same plot but only for power > 0 W (i.e. on standby or heating) and only for the circuits BA suggests we include (all but 2 - see Table 2.3). Does this look right?

dt <- hpDT[powerW > 0 & ba_includeCircuit == 1]
message("Check min power: ", min(dt$powerW))
## Check min power: 0.0166666666666667
ggplot2::ggplot(dt, aes(x = powerW)) + geom_histogram() + facet_grid(location ~ 
    season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(dt$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.3: Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.4 shows same plot but only for power > 40 W and only for the circuits BA suggests we include. Does this look right?

dt <- hpDT[powerW > 40 & ba_includeCircuit == 1]
message("Check min power: ", min(dt$powerW))
## Check min power: 40.0005
ggplot2::ggplot(dt, aes(x = powerW)) + geom_histogram() + facet_grid(location ~ 
    season) + labs(caption = paste0(hpCap, "\n n circuits: ", uniqueN(dt$circuit)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.4: Histogram mean hourly power demand for Heat Pumps by season and location

Figure 2.5 shows the mean heat pump demand (no power value exclusions) by time of day by season for the circuits BA suggests we include. Does this look right?

Figure 2.5 is important because:

  • we converted the incoming UTC dateTime to local (NZT) time -> r_dateTimeLocal
  • we then created a hh:mm variable from r_dateTimeLocal and used that to make this plot
  • so if this looks like the right shape for Heat Pump demand then we have made the correct assumptions about the incoming r_dateTime
dt <- hpDT[ba_includeCircuit == 1]

plotDT <- dt[, .(meanW = mean(powerW)), keyby = .(hms, season, location)]

ggplot2::ggplot(plotDT, aes(x = hms, y = meanW, colour = season)) + geom_line() + 
    facet_grid(location ~ .) + labs(caption = paste0(hpCap, "\n n circuits: ", 
    uniqueN(dt$circuit), "\n n observations: ", nrow(dt[!is.na(powerW)])))
Line plot of mean power demand for Heat Pumps by season and location

Figure 2.5: Line plot of mean power demand for Heat Pumps by season and location

Are we seeing some cooling in summer?

We can repeat this plot including only powerW values > 40 W (Figure 2.6). This shows the level of demand when they are on. This plot shows some distinct signs of heat pump use in early evenings in summer producing a slightly earlier ‘peak’ than is the case for winter heating. It has to be said however that the plots are quite messy, no doubt due to the small numbers involved.

dt <- hpDT[powerW > 40 & ba_includeCircuit == 1]

plotDT <- dt[, .(meanW = mean(powerW)), keyby = .(hms, season, location)]

ggplot2::ggplot(plotDT, aes(x = hms, y = meanW, colour = season)) + geom_line() + 
    facet_grid(location ~ .) + labs(caption = paste0(hpCap, "\n Values > 40W only", 
    "\n n circuits: ", uniqueN(dt$circuit), "\n n observations: ", nrow(dt[!is.na(powerW)])))
Line plot of mean power demand for Heat Pumps by season and location where power > 40W

Figure 2.6: Line plot of mean power demand for Heat Pumps by season and location where power > 40W

2.2 Temperature

2.2.1 Data in 2 files

First we load this using the two seperate files.

message("Loading 15876_hourly_all.txt.gz")
## Loading 15876_hourly_all.txt.gz
temp15876DT <- data.table::fread(paste0(myParams$dPath, "15876_hourly_all.txt.gz"))
message("Loading 23872_hourly_all.txt.gz")
## Loading 23872_hourly_all.txt.gz
temp23872DT <- data.table::fread(paste0(myParams$dPath, "23872_hourly_all.txt.gz"))

fixTempData <- function(dt){
  # bunch of stuff we want to do to each dataset
  # fix datetime so it matches the hp data
  dt <- dt[, r_dateTime := lubridate::dmy_hm(`Date(NZST)`, # so we know it's NZT - shame it had to have () in it!
                                                  tz = "Pacific/Auckland")] # just to be sure (if you ran this code in the UK it would set it to UTC by default)
  
  
  dt <- dt[, hms := hms::as.hms(r_dateTime)]
  
  dt <- GREENGridData::addNZSeason(dt)
  
  dt <- dt[, r_dateTimeLocal := r_dateTime] # for matching
  dt <- dt[, location := Station] # for matching
  
  return(dt)
}

temp15876DT <- fixTempData(temp15876DT)
## Warning: 21 failed to parse.
message("Some dates failed to parse - which ones?")
## Some dates failed to parse - which ones?
kableExtra::kable(head(temp15876DT[is.na(r_dateTimeLocal), .(Station, `Date(NZST)`, r_dateTimeLocal)]), 
                  caption = "Non-parsed dates for location =  15876")
Table 2.7: Non-parsed dates for location = 15876
Station Date(NZST) r_dateTimeLocal
15876 05/10/1997 02:00 NA
15876 04/10/1998 02:00 NA
15876 03/10/1999 02:00 NA
15876 01/10/2000 02:00 NA
15876 07/10/2001 02:00 NA
15876 06/10/2002 02:00 NA
temp23872DT <- fixTempData(temp23872DT)
## Warning: 16 failed to parse.
message("Some dates failed to parse - which ones?")
## Some dates failed to parse - which ones?
kableExtra::kable(head(temp23872DT[is.na(r_dateTimeLocal), .(Station, `Date(NZST)`, r_dateTimeLocal)]), 
                  caption = "Non-parsed dates for location =  23872")
Table 2.7: Non-parsed dates for location = 23872
Station Date(NZST) r_dateTimeLocal
23872 05/10/2003 02:00 NA
23872 03/10/2004 02:00 NA
23872 02/10/2005 02:00 NA
23872 01/10/2006 02:00 NA
23872 30/09/2007 02:00 NA
23872 28/09/2008 02:00 NA
message("Looks like the ones around the DST switches. We hate DST almost as much as we hate timezones.")
## Looks like the ones around the DST switches. We hate DST almost as much as we hate timezones.

Convert the incoming character values to numeric as needed.

# combine the temperature data
tempDT <- rbind(temp15876DT, temp23872DT)

tempDT <- tempDT[, `:=`(meanTemp, as.numeric(`Tmean(C)`))]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
message("Note that ", nrow(tempDT[is.na(meanTemp)]), " observations have no Tmean(C) values...")
## Note that 645 observations have no Tmean(C) values...
tempDT <- tempDT[, `:=`(year, lubridate::year(r_dateTime))]
t <- addmargins(with(tempDT[is.na(meanTemp)], table(Station, year)))

kableExtra::kable(t, caption = "Counts of missing temperature observations by location and year (source: 2 files)") %>% 
    kable_styling()
Table 2.8: Counts of missing temperature observations by location and year (source: 2 files)
1998 2003 2004 2006 2007 2008 2010 2012 2014 2015 2016 2018 Sum
15876 2 53 491 0 0 1 0 1 0 0 3 0 551
23872 0 0 0 12 22 0 31 1 1 1 0 26 94
Sum 2 53 491 12 22 1 31 2 1 1 3 26 645
message("But it's mostly in years we don't have HP data for?")
## But it's mostly in years we don't have HP data for?

Figure 2.7 shows the mean temperature by time of day by season. Does this look right? The alpha values are suggesting it’s getting warmer…

As above this plot is important. If the shape looks right then we have made the correct assumptions about our incoming dateTime values.

plotDT <- tempDT[, .(meanT = mean(meanTemp, na.rm = TRUE)), # crucial otherwise any NA causes the calculation to be NA
                 keyby = .(hms, season, Station, year)]

tempCap <- paste0("Temperature data source: ?? NIWA ??",
                        "\n Time period: ", min(lubridate::date(tempDT$r_dateTimeLocal), na.rm = TRUE),
                        " to ",
                         max(lubridate::date(tempDT$r_dateTimeLocal), na.rm = TRUE)
                  )
  
ggplot2::ggplot(plotDT, aes(x = hms, y = meanT, colour = season, alpha = year)) +
  geom_point() +
  facet_grid(Station ~ .) +
  labs(caption = tempCap)
## Warning: Removed 2 rows containing missing values (geom_point).
Line plot of mean temperature by season and location

Figure 2.7: Line plot of mean temperature by season and location

2.2.2 Both datasets in 1 file

Now test the single temperature file which contains both.

message("Loading alltemp_hourly.csv.gz")
## Loading alltemp_hourly.csv.gz
tempBothDT <- data.table::fread(paste0(myParams$dPath, "alltemp_hourly.csv.gz"))

kableExtra::kable(head(tempBothDT), 
                  caption = "Head:  alltemp_hourly.csv.gz before processing")
Table 2.9: Head: alltemp_hourly.csv.gz before processing
V1 location r_dateTime Tmax_C Tmin_C Tmean_C location
1 15876 1997-09-11 17:00:00 14.4 13.8 14.2 15876
2 15876 1997-09-11 18:00:00 14.3 12.8 13.8 15876
3 15876 1997-09-11 19:00:00 12.9 12.1 12.5 15876
4 15876 1997-09-11 20:00:00 12.4 11.9 12.2 15876
5 15876 1997-09-11 21:00:00 11.9 11.0 11.5 15876
6 15876 1997-09-11 22:00:00 11.7 11.1 11.4 15876
tempBothDT <- tempBothDT[, r_dateTimeLocal := lubridate::ymd_hms(r_dateTime, # looks like NZT as the first rows it match the first few rows of temp15876DT
                                                  tz = "Pacific/Auckland")] # just to be sure (if you ran this code in the UK it would set it to UTC by default)
  
  
  tempBothDT <- tempBothDT[, hms := hms::as.hms(r_dateTimeLocal)]
  
  tempBothDT <- GREENGridData::addNZSeason(tempBothDT)
  
tempBothDT$V1 <- NULL
tempBothDT <- tempBothDT[, meanTemp := Tmean_C]


kableExtra::kable(head(tempBothDT), 
                  caption = "Head:  alltemp_hourly.csv.gz after processing")
Table 2.9: Head: alltemp_hourly.csv.gz after processing
location r_dateTime Tmax_C Tmin_C Tmean_C location r_dateTimeLocal hms season meanTemp
15876 1997-09-11 17:00:00 14.4 13.8 14.2 15876 1997-09-11 17:00:00 17:00:00 Spring 14.2
15876 1997-09-11 18:00:00 14.3 12.8 13.8 15876 1997-09-11 18:00:00 18:00:00 Spring 13.8
15876 1997-09-11 19:00:00 12.9 12.1 12.5 15876 1997-09-11 19:00:00 19:00:00 Spring 12.5
15876 1997-09-11 20:00:00 12.4 11.9 12.2 15876 1997-09-11 20:00:00 20:00:00 Spring 12.2
15876 1997-09-11 21:00:00 11.9 11.0 11.5 15876 1997-09-11 21:00:00 21:00:00 Spring 11.5
15876 1997-09-11 22:00:00 11.7 11.1 11.4 15876 1997-09-11 22:00:00 22:00:00 Spring 11.4

Figure 2.8 shows the mean temperature by time of day by season for the data from . Does this look right? The alpha values are suggesting it’s getting warmer…

Again, this plot tells us if our dateTime assumptions are correct.

# combine the temperature data

message("Note that ", nrow(tempBothDT[is.na(meanTemp)]), " observations have no Tmean(C) values...")
## Note that 634 observations have no Tmean(C) values...
tempBothDT <- tempBothDT[, year := lubridate::year(r_dateTime)]
t <- addmargins(with(tempBothDT[is.na(meanTemp)], table(location,year)))

kableExtra::kable(t, caption = "Counts of missing temperature observations by location and year (source: 1 file)") %>%
  kable_styling()
Table 2.10: Counts of missing temperature observations by location and year (source: 1 file)
1998 2003 2004 2006 2007 2008 2010 2012 2014 2015 2016 2018 Sum
15876 2 53 491 0 0 1 0 1 0 0 3 0 551
23872 0 0 0 12 11 0 31 1 1 1 0 26 83
Sum 2 53 491 12 11 1 31 2 1 1 3 26 634
message("But it's mostly in years we don't have HP data for?")
## But it's mostly in years we don't have HP data for?
plotDT <- tempBothDT[, .(meanT = mean(meanTemp, na.rm = TRUE)), # crucial otherwise any NA causes the calculation to be NA
                 keyby = .(hms, season, location, year)]

tempCap <- paste0("Temperature data source: ?? NIWA ??",
                        "\n Time period: ", min(lubridate::date(tempBothDT$r_dateTimeLocal), na.rm = TRUE),
                        " to ",
                         max(lubridate::date(tempBothDT$r_dateTimeLocal), na.rm = TRUE)
                  )
  
ggplot2::ggplot(plotDT, aes(x = hms, y = meanT, colour = season, alpha = year)) +
  geom_point() +
  facet_grid(location ~ .) +
  labs(caption = tempCap)
## Warning: Removed 2 rows containing missing values (geom_point).
Line plot of mean temperature by season and location

Figure 2.8: Line plot of mean temperature by season and location

Now we can directly compare the two sources of temperature data to see if they match. Figure 2.9 does this by linking the two files on r_dateTimeLocal and location and then plotting the temperature values from each source.

message("N rows of 2 files merged: ", nrow(tempDT))
## N rows of 2 files merged: 328719
message("N rows of alltemp_hourly.csv.gz : ", nrow(tempBothDT))
## N rows of alltemp_hourly.csv.gz : 324465
tempDT <- tempDT[, `:=`(source, "Separate files")]
tempBothDT <- tempBothDT[, `:=`(source, "alltemp_hourly.csv.gz")]

# let's try just comparing dates

t2fdt <- tempDT[, .(dateTime_char_2f = `Date(NZST)`, r_dateTimeLocal, meanTemp_2files = meanTemp, 
    location, hms_2files = hms)]
setkey(t2fdt, r_dateTimeLocal, location)

t1fdt <- tempBothDT[, .(dateTime_char_1f = r_dateTime, r_dateTimeLocal, meanTemp_1file = meanTemp, 
    location, hms_1file = hms)]
setkey(t1fdt, r_dateTimeLocal, location)

dt <- t2fdt[t1fdt]  # merge on dateTimeLocal & location

ggplot2::ggplot(dt[!is.na(r_dateTimeLocal)], aes(x = meanTemp_2files, y = meanTemp_1file, 
    colour = lubridate::hour(r_dateTimeLocal))) + geom_point() + facet_grid(. ~ 
    location) + labs(caption = "Comparison of temperature data from each source by location")
## Warning: Removed 645 rows containing missing values (geom_point).
Comparison of temperature data from each source by location

Figure 2.9: Comparison of temperature data from each source by location

# so some of them match but not all
dt <- dt[, `:=`(diffTemp, meanTemp_2files - meanTemp_1file)]

Bang on. As we can see, it makes no differnce which temperture data is used.

3 Test relationship between temperature and heat pump demand (BA filtered data)

So now we think we’ve linked the data correctly, let’s do some exploratory analysis. In this section we keep the observations where power < 40 for the circuits defined in ba_includeCircuit (see Table 2.3).

First we just plot all the hourly data for demand vs mean temperature (Figure 3.1).

baFilteredDT <- linkedDT[ba_includeCircuit == 1 & !is.na(meanTemp)]


t <- baFilteredDT[, .(meanW = mean(powerW), minW = min(powerW), maxW = max(powerW), 
    nObs = .N, nCircuits = uniqueN(circuit), nHouseholds = uniqueN(id)), keyby = .(location)]

kableExtra::kable(t, caption = "Summary power demand statistics (all data)") %>% 
    kable_styling()
Table 3.1: Summary power demand statistics (all data)
location meanW minW maxW nObs nCircuits nHouseholds
15876 99.47134 0 4342.474 216263 11 11
23872 90.76957 0 3145.554 167444 7 7
hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(baFilteredDT$id), 
    "\n Time period: ", min(lubridate::date(baFilteredDT$r_dateTimeLocal)), 
    " to ", max(lubridate::date(baFilteredDT$r_dateTimeLocal)), "\n n circuits: ", 
    uniqueN(baFilteredDT$circuit))

tempCap <- paste0("Temperature data source: ?? NIWA ??")

combinedCaption <- paste0(hpCap, "\n", tempCap)

ggplot2::ggplot(baFilteredDT[!is.na(meanTemp)], aes(x = meanTemp, y = powerW/1000, 
    colour = id)) + geom_point() + facet_grid(season ~ location) + labs(caption = combinedCaption, 
    x = "Mean temperature", y = "Mean half-hourly kW")
Plots of temperature vs demand by season and location

Figure 3.1: Plots of temperature vs demand by season and location

That doesn’t tell us a lot - although these is indeed a suggestion of their use for cooling in summer in both locations by some households. This should remind us that this analysis is based on a very small sample of households. So let’s try to re-create the binned box plots. In this case it is not helpful to distinguish between households but we should be aware that the patterns we see may be driven by just one or two households’ behaviour.

ggplot2::ggplot(linkedDT[!is.na(meanTemp)],
                         mapping = aes(x = meanTemp, y = powerW/1000)) + 
  geom_boxplot(aes(cut_width(meanTemp,
                             width = 2, # change the width to get 'wider' bins
                             boundary = 0)))  + 
  facet_grid(season ~ location) +
  labs(caption = combinedCaption,
       x = "Mean temperature",
       y = "Mean half-hourly kW")
Box plots of temperature vs demand by season and location

Figure 3.2: Box plots of temperature vs demand by season and location

Figure 3.2 shows binned box plots of power against demand and suggest more clearly that there is a cooling effect for households in summer - especially in 15876. Or perhaps we should more accurately say there is evidence of a summer cooling effect in the 11 (!) households who live in 15876. And who would base a claim on that many households?

Anyway, just for good measure lets try to re-create the U shaped plots Emily produced by ignoring season.

ggplot2::ggplot(linkedDT[!is.na(meanTemp)], mapping = aes(x = meanTemp, y = powerW/1000)) + 
  geom_boxplot(aes(cut_width(meanTemp,
                             width = 2, # change the width to get 'wider' bins
                             boundary = 0)))  + 
  facet_grid(. ~ location) +
  labs(caption = combinedCaption,
       x = "Mean temperature",
       y = "Mean half-hourly kW")
Box plots of temperature vs demand by season and location

Figure 3.3: Box plots of temperature vs demand by season and location

Although we can clearly see the increased demand for heat as temperatures decline below 0, the increase in the upper temperature ranges is probably obscured by the large number of zero values. It is arguable whether leaving the zeros out is valid since they could be interpreted as reflecting a household’s lack of need for cooling. Were we to only include values > 40 W (to exclude 0 and standby power values) we would be analysing (and subsequently modelling) only those households who actually responded to the temperature change by demanding cooling - which will not be the case for all households in the population.

4 Test relationship between temperature and heat pump demand (EJ filtered data)

Notwithstanding the previous argument re data exclusion, in this section we test Emily’s suggestion to only keep heat pump data where power > 40 for the circuits defined in ēj_includeCircuit (see Table 2.3).

First we just plot all the hourly data for demand vs mean temperature (Figure 4.1).

ejFilteredDT <- linkedDT[powerW > 40 & ej_includeCircuit == 1 & !is.na(meanTemp)]

t <- ejFilteredDT[, .(meanW = mean(powerW), minW = min(powerW), maxW = max(powerW), 
    nObs = .N, nCircuits = uniqueN(circuit), nHouseholds = uniqueN(id)), keyby = .(location)]

kableExtra::kable(t, caption = "Summary power demand staistics (filtered)") %>% 
    kable_styling()
Table 4.1: Summary power demand staistics (filtered)
location meanW minW maxW nObs nCircuits nHouseholds
15876 532.1126 40.0005 4342.474 39840 11 11
23872 608.7093 40.0305 3145.554 19053 6 6
hpCap <- paste0("GREEN Grid Heat Pump circuits, n households = ", uniqueN(ejFilteredDT$id), 
    "\n Time period: ", min(lubridate::date(ejFilteredDT$r_dateTimeLocal)), 
    " to ", max(lubridate::date(ejFilteredDT$r_dateTimeLocal)), "\n n circuits: ", 
    uniqueN(ejFilteredDT$circuit))

tempCap <- paste0("Temperature data source: ?? NIWA ??")

combinedCaption <- paste0(hpCap, "\n", tempCap)

ggplot2::ggplot(ejFilteredDT, aes(x = meanTemp, y = powerW/1000, colour = id)) + 
    geom_point() + facet_grid(season ~ location) + labs(caption = combinedCaption, 
    x = "Mean temperature", y = "Mean half-hourly kW")
Plots of temperature vs demand by season and location

Figure 4.1: Plots of temperature vs demand by season and location

Again, that doesn’t tell us a lot - although these is indeed a suggestion of their use for cooling in summer in both locations. So let’s re-try to create the binned box plots.

ggplot2::ggplot(ejFilteredDT, mapping = aes(x = meanTemp, y = powerW/1000)) + 
  geom_boxplot(aes(cut_width(meanTemp,
                             width = 2, # change the width to get 'wider' bins
                             boundary = 0)))  + 
  facet_grid(season ~ location) +
  labs(caption = combinedCaption,
       x = "Mean temperature",
       y = "Mean half-hourly kW")
Box plots of temperature vs demand by season and location

Figure 4.2: Box plots of temperature vs demand by season and location

Figure 4.2 shows binned box plots of power against demand. As we would expect, the plot suggests more clearly that there is a cooling effect for households in summer when we remove 0 and power < 40 W values.

Anyway, lets again try to re-create the U shaped plots Emily produced by ignoring season.

ggplot2::ggplot(ejFilteredDT, mapping = aes(x = meanTemp, y = powerW/1000)) + 
  geom_boxplot(aes(cut_width(meanTemp,
                             width = 2, # change the width to get 'wider' bins
                             boundary = 0)))  + 
  facet_grid(. ~ location) +
  labs(caption = combinedCaption,
       x = "Mean temperature",
       y = "Mean half-hourly kW")
Box plots of temperature vs demand by season and location

Figure 4.3: Box plots of temperature vs demand by season and location

The increase in demand for power as the temperature rises is now clearer.

But this is fairly obvious since we are just looking at the households who responded to the temperature changes…

5 Modelling

tbc

Needs to include the 0 values (see discussion above)?

If the regression model assumes independence of observations then this is violated if we have multiple observations for each house (which we do whether we just use the ‘best’ year or not…). The short-term fix would be to use robust standard errors in model reporting or maybe a fixed-effects model (?) but that might be well beyond scope of the report!

6 Runtime

t <- proc.time() - startTime

elapsed <- t[[3]]

Analysis completed in 122.95 seconds ( 2.05 minutes) using knitr in RStudio with R version 3.5.2 (2018-12-20) running on x86_64-apple-darwin15.6.0.

7 R packages used

list packages used & cite

8 Annexes

8.1 Detailed statistics

Figure 8.1 shows the relationship between mean temperature and mean power for all hourly observations for all circuits colour coded by season. Smoothed fit lines have been added using mgcv:gam() curves due to the size of the dataset (see ?geom_smooth and [@ggplot]).

ggplot2::ggplot(linkedDT, aes(x = meanTemp, y = powerW, colour = season)) + 
    geom_point(alpha = 0.5) + geom_smooth() + facet_wrap(. ~ circuit)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4939 rows containing non-finite values (stat_smooth).
## Warning: Removed 4939 rows containing missing values (geom_point).
Pairs plot of temperature vs heat pump power demand for all households

Figure 8.1: Pairs plot of temperature vs heat pump power demand for all households

8.2 Data codebooks

8.2.1 Original power data (before processing)

Depending on input files choice this will vary.

  • data source: emilyData
  • data file: `/Users/ben/Data/NZ_GREENGrid/safe/emilyJiang/allHouseholds_hourly.csv.gz
head(hpDT_orig)
##    V1          r_dateTime   powerW        circuit    id location
## 1:  1 2015-03-26 04:00:00  0.00000 Heat Pump$4204 rf_31    15876
## 2:  2 2015-03-26 05:00:00 33.68717 Heat Pump$4204 rf_31    15876
## 3:  3 2015-03-26 06:00:00  0.00000 Heat Pump$4204 rf_31    15876
## 4:  4 2015-03-26 07:00:00  0.00000 Heat Pump$4204 rf_31    15876
## 5:  5 2015-03-26 08:00:00  0.00000 Heat Pump$4204 rf_31    15876
## 6:  6 2015-03-26 09:00:00  0.00000 Heat Pump$4204 rf_31    15876
skimr::skim(hpDT_orig)
## Skim summary statistics
##  n obs: 441379 
##  n variables: 6 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
##    variable missing complete      n min max empty n_unique
##     circuit       0   441379 441379  14  33     0       20
##          id       0   441379 441379   5   6     0       18
##  r_dateTime       0   441379 441379  19  19     0    36583
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n      mean        sd    p0      p25
##  location       0   441379 441379  19944.32   3997.39 15876  15876  
##        V1       0   441379 441379 220690    127415.29     1 110345.5
##     p50      p75   p100     hist
##   23872  23872    23872 ▇▁▁▁▁▁▁▇
##  220690 331034.5 441379 ▇▇▇▇▇▇▇▇
## 
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n  mean     sd p0 p25 p50   p75    p100
##    powerW       0   441379 441379 95.47 271.51  0   0   0 19.02 4342.47
##      hist
##  ▇▁▁▁▁▁▁▁

8.2.2 Power data (after processing)

skimr::skim(hpDT)
## Skim summary statistics
##  n obs: 441379 
##  n variables: 12 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete      n min max empty n_unique
##         circuit       0   441379 441379  14  33     0       20
##              id       0   441379 441379   5   6     0       18
##  r_dateTimeChar       0   441379 441379  19  19     0    36583
## 
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n    min        max   median n_unique
##       hms       0   441379 441379 0 secs 82800 secs 12:00:00       24
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n n_unique
##    season       0   441379 441379        4
##                                         top_counts ordered
##  Win: 122946, Aut: 111768, Spr: 107705, Sum: 98960   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n     mean      sd    p0   p25   p50   p75
##      hour       0   441379 441379    11.5     6.92     0     6    12    18
##  location       0   441379 441379 19944.32 3997.39 15876 15876 23872 23872
##   p100     hist
##     23 ▇▇▇▇▇▇▇▇
##  23872 ▇▁▁▁▁▁▁▇
## 
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
##           variable missing complete      n  mean     sd p0 p25 p50   p75
##  ba_includeCircuit       0   441379 441379  0.88   0.33  0   1   1  1   
##  ej_includeCircuit       0   441379 441379  0.8    0.4   0   1   1  1   
##             powerW       0   441379 441379 95.47 271.51  0   0   0 19.02
##     p100     hist
##     1    ▁▁▁▁▁▁▁▇
##     1    ▂▁▁▁▁▁▁▇
##  4342.47 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
##         variable missing complete      n        min        max     median
##       r_dateTime       0   441379 441379 2014-05-28 2018-08-01 2016-04-08
##  r_dateTimeLocal       0   441379 441379 2014-05-29 2018-08-01 2016-04-09
##  n_unique
##     36583
##     36583

8.2.3 Original temperature file(s)

File 1:

skimr::skim(temp15876DT)
## Skim summary statistics
##  n obs: 185459 
##  n variables: 17 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
##       variable missing complete      n min max empty n_unique
##     Date(NZST)       0   185459 185459  16  16     0   185455
##           Freq       0   185459 185459   1   1     0        1
##    Period(Hrs)       0   185459 185459   1   1     0        2
##  Period(Hrs).1       0   185459 185459   1   1     0        2
##  Period(Hrs).2       0   185459 185459   1   1     0        2
##  Period(Hrs).3       0   185459 185459   1   1     0        2
##      RHmean(%)       0   185459 185459   1   3     0       88
##       Tgmin(C)       0   185459 185459   1   4     0      569
##        Tmax(C)       0   185459 185459   1   4     0      385
##       Tmean(C)       0   185459 185459   1   5     0      382
##        Tmin(C)       0   185459 185459   1   5     0      378
## 
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n    min        max     median n_unique
##       hms      21   185438 185459 0 secs 82800 secs 43200 secs       24
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n n_unique
##    season       0   185459 185459        4
##                                      top_counts ordered
##  Spr: 47102, Win: 46360, Aut: 46082, Sum: 45915   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n  mean sd    p0   p25   p50   p75  p100
##  location       0   185459 185459 15876  0 15876 15876 15876 15876 15876
##   Station       0   185459 185459 15876  0 15876 15876 15876 15876 15876
##      hist
##  ▁▁▁▇▁▁▁▁
##  ▁▁▁▇▁▁▁▁
## 
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
##         variable missing complete      n        min        max     median
##       r_dateTime      21   185438 185459 1997-09-11 2019-01-06 2008-05-19
##  r_dateTimeLocal      21   185438 185459 1997-09-11 2019-01-06 2008-05-19
##  n_unique
##    185434
##    185434

File 2:

skimr::skim(temp23872DT)
## Skim summary statistics
##  n obs: 143260 
##  n variables: 17 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
##       variable missing complete      n min max empty n_unique
##     Date(NZST)       0   143260 143260  16  16     0   139010
##           Freq       0   143260 143260   1   1     0        1
##    Period(Hrs)       0   143260 143260   1   1     0        2
##  Period(Hrs).1       0   143260 143260   1   1     0        2
##  Period(Hrs).2       0   143260 143260   1   1     0        2
##  Period(Hrs).3       0   143260 143260   1   1     0        2
##      RHmean(%)       0   143260 143260   1   3     0       85
##       Tgmin(C)       0   143260 143260   1   4     0      511
##        Tmax(C)       0   143260 143260   1   4     0      307
##       Tmean(C)       0   143260 143260   1   4     0      303
##        Tmin(C)       0   143260 143260   1   4     0      302
## 
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n    min        max     median n_unique
##       hms      16   143244 143260 0 secs 82800 secs 43200 secs       24
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n n_unique
##    season       0   143260 143260        4
##                                      top_counts ordered
##  Aut: 36423, Sum: 36244, Win: 36023, Spr: 34570   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n  mean sd    p0   p25   p50   p75  p100
##  location       0   143260 143260 23872  0 23872 23872 23872 23872 23872
##   Station       0   143260 143260 23872  0 23872 23872 23872 23872 23872
##      hist
##  ▁▁▁▇▁▁▁▁
##  ▁▁▁▇▁▁▁▁
## 
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
##         variable missing complete      n        min        max     median
##       r_dateTime      16   143244 143260 2002-12-06 2019-01-06 2010-09-02
##  r_dateTimeLocal      16   143244 143260 2002-12-06 2019-01-06 2010-09-02
##  n_unique
##    138994
##    138994

Combined file:

skimr::skim(tempBothDT)
## Skim summary statistics
##  n obs: 324465 
##  n variables: 12 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────
##    variable missing complete      n min max empty n_unique
##  r_dateTime      37   324428 324465  19  19     0   186732
##      source       0   324465 324465  21  21     0        1
## 
## ── Variable type:difftime ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n    min        max     median n_unique
##       hms      37   324428 324465 0 secs 82800 secs 43200 secs       24
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n n_unique
##    season       0   324465 324465        4
##                                      top_counts ordered
##  Spr: 81672, Win: 81298, Sum: 81200, Aut: 80295   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────
##    variable missing complete      n     mean      sd    p0   p25   p50
##    location       0   324465 324465 19301.71 3956.83 15876 15876 15876
##  location.1       0   324465 324465 19301.71 3956.83 15876 15876 15876
##    p75  p100     hist
##  23872 23872 ▇▁▁▁▁▁▁▆
##  23872 23872 ▇▁▁▁▁▁▁▆
## 
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete      n    mean   sd     p0    p25    p50    p75
##  meanTemp     634   323831 324465   12.63 5.3   -10      9.1   12.6   16.3
##    Tmax_C     503   323962 324465   13.26 5.33   -9.5    9.7   13.2   16.9
##   Tmean_C     634   323831 324465   12.63 5.3   -10      9.1   12.6   16.3
##    Tmin_C     577   323888 324465   12.02 5.29  -12.2    8.5   12     15.7
##      year      37   324428 324465 2008.96 5.69 1997   2004   2009   2014  
##    p100     hist
##    35.6 ▁▁▃▇▇▂▁▁
##    36.4 ▁▁▃▇▇▃▁▁
##    35.6 ▁▁▃▇▇▂▁▁
##    34.2 ▁▁▂▇▇▃▁▁
##  2019   ▃▅▇▇▆▇▇▅
## 
## ── Variable type:POSIXct ─────────────────────────────────────────────────────────────────────────────────────────────────
##         variable missing complete      n        min        max     median
##  r_dateTimeLocal      37   324428 324465 1997-09-11 2019-01-06 2009-08-24
##  n_unique
##    186732

9 References