c-ml-demand-eulp/viz_obtain_temperatures.py at main · DeltaE/c-ml-demand-eulp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 30 18:19:36 2025

@author: luisfernando
"""

import os
import sys
import pandas as pd
import time
import copy
import numpy as np

'''
Start the process below. The order of the functions is not chronological.
'''
START_PROCESS = time.time()


def extract_rounded_temperatures(frame):
    """Return the id, timestamp and rounded outdoor-temperature columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing at least the columns 'bldg_id', 'timestamp' and
        'out.outdoor_air_dryblub_temp.c'.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only those three columns, with the
        temperature rounded to one decimal place and stored as float32.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``frame``.
    """
    temperature_column = 'out.outdoor_air_dryblub_temp.c'
    # Explicit copy: the subset must not be treated as a slice of `frame`,
    # otherwise the write below can trigger SettingWithCopyWarning.
    subset = frame[['bldg_id', 'timestamp', temperature_column]].copy()
    # Replace the column by label rather than through `.loc[:, col] = ...`:
    # an in-place `.loc` write keeps the existing float64 dtype and would
    # silently discard the float32 downcast.
    subset[temperature_column] = np.round(
        subset[temperature_column], decimals=1).astype(np.float32)
    return subset


# Sorted so the row order of the concatenated outputs does not depend on the
# arbitrary ordering returned by os.listdir().
list_parquets_os = sorted(
    i for i in os.listdir() if '_1.parquet' in i and 'simple_parquet' in i)

list_temperatures = []
list_temperatures_long = []

# Iterate across the parquet files found in the working directory:
for l_parquet in list_parquets_os:
    read_local_parquet = pd.read_parquet(l_parquet)
    # The state identifier is taken as the third '_'-separated token of the
    # file name.
    state_id = l_parquet.split('_')[2]

    # Temperatures rounded to 1 decimal and stored as float32 to save memory.
    local_temperatures = extract_rounded_temperatures(read_local_parquet)
    list_temperatures.append(local_temperatures)

    # Long-format variant: same rows plus a categorical STATE column.
    local_temperatures_long = local_temperatures.copy()
    local_temperatures_long['STATE'] = pd.Categorical(
        [state_id] * len(local_temperatures_long))
    list_temperatures_long.append(local_temperatures_long)

# Concatenate the per-file frames collected above.
PRINT_ALL_TEMPS_LONG_BOOL = True

# Bind both names up front. Code further down reads `all_temperatures.empty`
# unconditionally, so leaving the name unbound when nothing is concatenated
# would raise NameError instead of reporting an empty table.
all_temperatures = pd.DataFrame()
all_temperatures_long = pd.DataFrame()

if list_temperatures and PRINT_ALL_TEMPS_LONG_BOOL:
    all_temperatures = pd.concat(list_temperatures, ignore_index=True)
    all_temperatures_long = pd.concat(list_temperatures_long, ignore_index=True)
    print("Concatenated DataFrame:")
    print(all_temperatures.head())  # Print the head to verify

    # Save the long-format table (with the STATE column) without the index.
    all_temperatures_long.to_parquet("all_temperatures_long.parquet", index=False)
elif not list_temperatures:
    print("No parquet files found matching the criteria.  No concatenation performed.")
else:
    # Files were loaded, but the flag above switches the concatenation off.
    print("PRINT_ALL_TEMPS_LONG_BOOL is False.  No concatenation performed.")


# Reshape the concatenated temperatures: one row per timestamp and one column
# per building id, averaging any repeated (timestamp, bldg_id) pairs.
if all_temperatures.empty:
    print("all_temperatures dataframe is empty. No pivoting can be done.")
else:
    try:
        pivoted_temperatures = all_temperatures.pivot_table(
            values='out.outdoor_air_dryblub_temp.c',
            index='timestamp',
            columns='bldg_id',
            aggfunc='mean',
        )

        # Make sure the row labels are datetimes before exporting.
        if not isinstance(pivoted_temperatures.index, pd.DatetimeIndex):
            pivoted_temperatures.index = pd.to_datetime(
                pivoted_temperatures.index)

        print("Pivoted DataFrame:")
        print(pivoted_temperatures.head())

        # Export in both formats; the timestamp index is written as well.
        pivoted_temperatures.to_csv("pivoted_temperatures.csv", index=True)
        pivoted_temperatures.to_parquet(
            "pivoted_temperatures.parquet", index=True)

    except ValueError as err:
        # Any ValueError raised inside the try body (pivoting, datetime
        # conversion or the two exports) is reported here rather than
        # stopping the script.
        print(f"Error during pivoting: {err}")
        print("Check for duplicate combinations of 'timestamp' and 'bldg_id'.")


# Report the wall-clock duration of the whole run, in seconds and in minutes.
END_PROCESS = time.time()
TIME_ELAPSED = END_PROCESS - START_PROCESS
print('\n TIME ELAPSED:')
print(
    str(TIME_ELAPSED) + ' seconds /',
    str(TIME_ELAPSED / 60) + ' minutes.',
)