clusterPage.py
import streamlit as st
import json
import pandas as pd
# NOTE: the import below has caused a segmentation fault on some local environments
from st_aggrid import AgGrid, GridOptionsBuilder, ColumnsAutoSizeMode
from streamlit_extras.stylable_container import stylable_container
import altair as alt
from vega_datasets import data
from shapely.geometry import shape, MultiPolygon, MultiLineString
@st.cache_data
def load_basemap() :
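    """Return the US states TopoJSON layer (vega_datasets us_10m) used as the map background."""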
basemap = alt.topo_feature(data.us_10m.url, 'states')
return basemap
#@st.cache_data
def load_geojson(file_path):
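    """Read a GeoJSON file from disk and return it as a dict."""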
with open(file_path, 'r') as file:
geojson_data = json.load(file)
return geojson_data
@st.cache_data
def load_csv(path):
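    """Read a CSV file into a pandas DataFrame (cached by Streamlit)."""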
data = pd.read_csv(path)
return data
# @st.cache_data  # caching currently disabled for this loader
def load_excel(path, sheetname):
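    """Read the given sheet of an Excel workbook into a pandas DataFrame."""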
data = pd.read_excel(path, sheetname)
return data
@st.cache_data # This function will be cached
def load_json(path):
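    """Read a JSON file into a pandas DataFrame (cached by Streamlit)."""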
return pd.read_json(path)
def safe_round(val, precision = 4):
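    """Round val to the given precision; return val unchanged if it is not numeric."""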
try:
return round(val, precision)
    except (TypeError, ValueError):
        # non-numeric values (e.g. strings, None) are returned unchanged
        return val
@st.cache_data # This function will be cached
def get_cluster(cluster_df, project_head, vis_df, weight_list, threshold = 0.6):
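    """Look up the pre-computed cluster for project_head and return two DataFrames:

    - a one-row cluster summary (Cluster Strength, Likelihood of Approval, Net Transmission Capacity)
    - the associated projects with their similarity scores, re-weighted by weight_list and
      filtered to rows whose weighted Overall score exceeds threshold.

    Returns two empty DataFrames when project_head has no pre-computed cluster.
    """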
if project_head in cluster_df["ProjectHead"].values:
new_df = pd.DataFrame.from_dict(cluster_df[cluster_df["ProjectHead"] == project_head]["Cluster"].iloc[0])
# add lat/lon to displayed table
new_df = pd.merge(new_df, vis_df[['Project Name', 'GIS Lat', 'GIS Long', 'Type-1', 'Net MWs to Grid', 'Remain_MW']], left_on='Project', right_on='Project Name', how='left')
new_df = new_df.drop(columns=['Project Name'])
#st.write(new_df)
# only select the first three columns in the summary, ignore the geolocations
cluster_data_df = pd.json_normalize(cluster_df[cluster_df["ProjectHead"] == project_head]["Summary"].iloc[0]).iloc[:,[0,1,2]]
cluster_data_df.fillna(value=0, inplace=True)
associated_projects_df = new_df[["Project",'Net MWs to Grid', "Likelihood of Approval", "Location", "Process", "Infrastructure", "Overall","GIS Lat", "GIS Long", 'Type-1', 'Remain_MW']]
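        # fill missing values with the cluster's mean Likelihood of Approval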
associated_projects_df.fillna(value=associated_projects_df['Likelihood of Approval'].mean(), inplace=True)
# Scale values
scaled_weights = [x/sum(weight_list) for x in weight_list]
associated_projects_df['Overall'] = associated_projects_df[["Likelihood of Approval", "Location", "Process", "Infrastructure"]].mul(scaled_weights, axis=1).sum(axis=1)
# Filter above threshold
associated_projects_df = associated_projects_df[associated_projects_df['Overall'] > threshold].sort_values(by=['Overall', 'Likelihood of Approval'], ascending=False)
cluster_data_df["Cluster Strength"] = associated_projects_df['Overall'].mean()
cluster_data_df["Likelihood of Approval"] = associated_projects_df['Likelihood of Approval'].mean()
cluster_data_df['Net Transmission Capacity'] = associated_projects_df['Net MWs to Grid'].sum() - associated_projects_df['Remain_MW'].sum()
        # round Likelihood of Approval, Location, Process, and Overall for display
associated_projects_df['Likelihood of Approval'] = associated_projects_df['Likelihood of Approval'].round(4)
associated_projects_df['Location'] = associated_projects_df['Location'].round(4)
associated_projects_df['Process'] = associated_projects_df['Process'].round(4)
associated_projects_df['Overall'] = associated_projects_df['Overall'].round(4)
        # round Cluster Strength, Net Transmission Capacity, and Likelihood of Approval for display
cluster_data_df['Cluster Strength'] = safe_round(cluster_data_df['Cluster Strength'])
cluster_data_df['Net Transmission Capacity'] = safe_round(cluster_data_df['Net Transmission Capacity'], 1)
cluster_data_df['Likelihood of Approval'] = safe_round(cluster_data_df['Likelihood of Approval'])
else:
cluster_data_df = pd.DataFrame({})
associated_projects_df = pd.DataFrame({})
return cluster_data_df, associated_projects_df
def set_selection_cb(selected_rows_in, cluster_df, vis_df, weight_list = [0.25,0.25,0.25,0.25]):
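    """Callback for the Go/Rerun buttons: store the selected grid row in session state
    and compute its cluster summary and associated-projects tables.
    """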
    # Depending on the environment/st_aggrid version, selected_rows_in is either a
    # list of dicts (local runs) or a pandas DataFrame (Streamlit deployment).
if isinstance(selected_rows_in, list) :
# for list
if selected_rows_in:
st.session_state.selected_rows = selected_rows_in
st.session_state.cluster_summary_df, st.session_state.associated_projects_df = get_cluster(cluster_df, st.session_state.selected_rows[0]["Project Name"], vis_df, weight_list)
elif isinstance(selected_rows_in, pd.DataFrame) :
# for pandas dataframe
if not selected_rows_in.empty:
st.session_state.selected_rows = selected_rows_in
st.session_state.cluster_summary_df, st.session_state.associated_projects_df = get_cluster(cluster_df, st.session_state.selected_rows["Project Name"].iloc[0], vis_df, weight_list)
else:
st.session_state.selected_rows = None
st.session_state.cluster_summary_df = pd.DataFrame({})
st.session_state.associated_projects_df = pd.DataFrame({})
def reset_selection_cb():
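    """Callback for the Clear button: reset the selection and cluster DataFrames in session state."""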
st.session_state.selected_rows = None
st.session_state.cluster_summary_df = pd.DataFrame({})
st.session_state.associated_projects_df = pd.DataFrame({})
def get_points_centroid(in_df) :
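    """Return the (longitude, latitude) centroid of the projects in in_df."""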
return in_df['GIS Long'].mean(), in_df['GIS Lat'].mean()
def check_list_or_df_empty(in_var) :
    """Return True if in_var is None, an empty list, or an empty DataFrame."""
# for list
if isinstance(in_var, list) :
return not in_var
# for None value
elif in_var is None :
return in_var is None
# for pandas data frame
else :
return in_var.empty
def create_altair_charts_main2(basemap, data_tmp, in_center, in_scale, project_head) :
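    """Build the layered Altair map: a US basemap plus the cluster projects and the
    project head, centered on in_center with a Mercator projection at zoom scale in_scale.
    """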
# prepare for altair display
#ca_counties = alt.Data(values=county_geojson)
# define map center and zoom scale
center = in_center
scale = in_scale
width = 300
height = 600
# Layering and configuring the components
base = alt.layer(
alt.Chart(basemap).mark_geoshape(fill='lightgray', stroke='gray').encode(tooltip=alt.value(None))
#alt.Chart(ca_counties).mark_geoshape(fill='yellow', stroke='gray')
).properties(width=width, height=height)
projections = {
"Mercator": {
"type": "mercator",
"center": center,
"rotate": [0,0,0],
"translate": [width/2, height/2],
"scale": scale,
"precision": 0.1
},
}
geo_chart = base.properties(projection=projections['Mercator'])
alt_chart = geo_chart
    # combine the project head with its cluster so both sets of points can be plotted
map_points_df = data_tmp
map_points_df['is_project_head'] = 0
project_head = project_head.rename(columns=dict(zip(project_head.columns, [c.title() for c in project_head.columns])))
project_head['is_project_head'] = 1
project_head.rename(columns={"Project Name": "Project", "Gis Long": "GIS Long", "Gis Lat": "GIS Lat"}, inplace=True)
map_points_df = pd.concat([map_points_df[['Project', 'GIS Long', 'GIS Lat', 'Type-1', 'is_project_head']], project_head[['Project', 'GIS Long', 'GIS Lat', 'Type-1', 'is_project_head']]])
if map_points_df is not None:
extra_points = alt.Chart(map_points_df).mark_point(
filled = True,
opacity = 0.5
).encode(
longitude='GIS Long:Q',
latitude='GIS Lat:Q',
tooltip='Project:N',
size = alt.Size('is_project_head:N', scale=alt.Scale(domain=[0, 1], range=[100, 400])),
shape = alt.Shape('is_project_head:N', legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'cross'])),
color= alt.Color('Type-1:N', scale = alt.Scale(domain=['Photovoltaic', 'Wind Turbine', 'Storage', 'Steam Turbine', 'Hydro', 'Gas Turbine', 'Solar Thermal', 'Combined Cycle'], range=['gold', '#0FD00B', '#252523', 'red', 'blue', '#D52EEE', '#0884C6', 'green']))
)
alt_chart = geo_chart + extra_points
return alt_chart
def main2():
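    """Render the clustering page: weight inputs and the CAISO queue grid, then, once a
    project is selected, the cluster summary cards, details table, and map.
    """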
## Load Data
# US states map
basemap = load_basemap()
# CA counties map
#california_counties_geojson = load_geojson('data/California_County_Boundaries.geojson')
    # load the CAISO queue CSV, which includes lat/lon coordinates for each project
full_queue_df = load_csv('data/new_caiso_queue_MW.csv')
column_ixs_to_keep = [0, 1, 2, 5, 6, 8, 14, 18, 22, 24, 26, 28, 30, 31, 32, 33, 34, 37, 38, 50]
visible_df = full_queue_df.iloc[:, column_ixs_to_keep]
options_builder = GridOptionsBuilder.from_dataframe(visible_df)
options_builder.configure_selection('single')
options_builder.configure_pagination(paginationPageSize=10, paginationAutoPageSize=False)
grid_options = options_builder.build()
cluster_df = load_json("final_clusters_still_nan.json")
# st.write(alt.__version__)
if check_list_or_df_empty(st.session_state.selected_rows) :
st.subheader("Let’s get to clustering!")
st.markdown(
"""
Studying a single application at a time makes for slow going. Overpowered’s clustering tool helps you determine which projects make sense to study together. This tool focuses on the California grid operator (CAISO) Interconnection Queue.
"""
)
st.subheader("Set Custom Weights")
st.markdown(
"""
Overpowered provides a structured scoring mechanism to determine the best groups of applicants to study together. We also recognize that expert energy users have their own insight into what makes Queue applications successful in their unique operating regions. Feel free to start with the default equal split, or configure the weighting parameters as you see fit!
"""
)
c1, c2 = st.columns(2)
with c1:
with st.expander("Set Parameters"):
st.write('***Assign relative weights***')
col1, col2 = st.columns(2)
with col1:
w1 = st.number_input(label="Location", value = st.session_state.w1)
w2 = st.number_input(label="Process", value = st.session_state.w2)
with col2:
w3 = st.number_input(label="Infrastructure", value = st.session_state.w3)
w4 = st.number_input(label="Likelihood of Approval", value = st.session_state.w4)
st.subheader("Pick a Project")
st.markdown(
"""
Now that you’ve set your weights, click a base project in the CAISO Queue below to generate cluster recommendations.
"""
)
grid_return = AgGrid(visible_df, grid_options)
selected_rows = grid_return["selected_rows"]
# write out selected rows to check its format
#st.write(type(selected_rows[0]))
with stylable_container(
key="go_button",
css_styles= """
button {
background-color: green;
color: white;
border-color: green;
}
"""
):
#st.write(type(selected_rows))
if isinstance(selected_rows, list):
# for list type
                # pass the callback and its arguments separately so it only runs when Go is clicked
                go_button = st.button('Go', on_click=set_selection_cb, args=(selected_rows, cluster_df, visible_df, [st.session_state.w4, st.session_state.w1, st.session_state.w2, st.session_state.w3]), disabled=not selected_rows)
if go_button:
st.session_state.w1 = w1
st.session_state.w2 = w2
st.session_state.w3 = w3
st.session_state.w4 = w4
elif isinstance(selected_rows, pd.DataFrame):
# for pandas data frame type
                # pass the callback and its arguments separately so it only runs when Go is clicked
                go_button = st.button('Go', on_click=set_selection_cb, args=(selected_rows, cluster_df, visible_df, [st.session_state.w4, st.session_state.w1, st.session_state.w2, st.session_state.w3]), disabled=selected_rows.empty)
if go_button:
st.session_state.w1 = w1
st.session_state.w2 = w2
st.session_state.w3 = w3
st.session_state.w4 = w4
else:
go_button = st.button('Go', disabled= True)
else:
# when it is a list of dict
if isinstance(st.session_state.selected_rows,list) :
st.subheader(st.session_state.selected_rows[0]["Project Name"] + " Suggested Cluster")
# when it is a pandas data frame
else :
st.subheader(st.session_state.selected_rows["Project Name"].iloc[0] + " Suggested Cluster")
if st.session_state.associated_projects_df.empty:
st.markdown(''':red[No Cluster found]''', unsafe_allow_html=True)
else:
col1, col2 = st.columns([3, 2])
with col1:
with st.expander("Set Parameters"):
st.write('***Assign relative weights***')
c1, c2 = st.columns(2)
with c1:
w1 = st.number_input(label="Location", value = st.session_state.w1)
w2 = st.number_input(label="Process", value = st.session_state.w2)
with c2:
w3 = st.number_input(label="Infrastructure", value = st.session_state.w3)
w4 = st.number_input(label="Likelihood of Approval", value = st.session_state.w4)
with stylable_container(
key="rerun_button_ct",
css_styles= """
button {
background-color: green;
color: white;
border-color: green;
}
"""
):
st.button(label= "Rerun", key="rerun_btn")
if st.session_state.get("rerun_btn"):
st.session_state.w1 = w1
st.session_state.w2 = w2
st.session_state.w3 = w3
st.session_state.w4 = w4
set_selection_cb(st.session_state.selected_rows, cluster_df, visible_df, [st.session_state.w4, st.session_state.w1, st.session_state.w2, st.session_state.w3])
row = st.columns(len(list(st.session_state.cluster_summary_df)))
for ix, col in enumerate(row):
with col:
with stylable_container(
key = "cluster_summary_" + str(ix),
css_styles='''
{
border: 2px solid;
border-radius: 10px;
text-align: center
}
'''
):
st.subheader(str(st.session_state.cluster_summary_df.iat[0, ix]))
st.write(list(st.session_state.cluster_summary_df)[ix])
cluster_grid_return = AgGrid(st.session_state.associated_projects_df.iloc[:,0:7], columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
with col2:
#st.markdown('''
# :rainbow[Map Placeholder]''')
scale_list = list(range(1000,25000,1000))
index_scale = scale_list.index(2000)
selectedScale = st.selectbox("Choose a Zoom-In Scale to Display: ", scale_list, index_scale, key='scale2')
# get centroid coordinates
centroid = get_points_centroid(st.session_state.associated_projects_df)
# display cluster in the map
if isinstance(st.session_state.selected_rows,list) :
alt_chart = create_altair_charts_main2(basemap,st.session_state.associated_projects_df,centroid, selectedScale, pd.DataFrame(st.session_state.selected_rows))
else:
alt_chart = create_altair_charts_main2(basemap,st.session_state.associated_projects_df,centroid, selectedScale, st.session_state.selected_rows)
with st.container(border=True):
st.altair_chart(alt_chart, use_container_width=True)
with stylable_container(
key="clear_button",
css_styles= """
button {
color: red;
border-color: red;
}
"""
):
clear_button = st.button('Clear', on_click=reset_selection_cb)
st.subheader("Variable Definitions")
st.markdown(
"""
Here we can see the high-level details of the cluster recommendation.
- **Cluster Strength**: This tells you the strength of the cluster as a whole. It’s the average similarity score between the base project you selected above and all the other projects in the recommended cluster.
- **Net Transmission Capacity**: This metric gives us insight into the amount of infrastructure (transmission lines and/or storage) that would need to be built to accommodate this cluster.
- (Sum of MWs supplied to the grid for all projects in the cluster) - (Sum of available transmission capacity at each project’s proposed interconnect point)
- **Likelihood of Approval**: The likelihood of approval is calculated for each project using features learned from historical data. The cluster-level score shown here is the average of the per-project likelihoods across the recommended cluster.
The table above allows us to dig into the details of each project in the cluster. Similarity scores are calculated for different categories between the base project and all other projects. The most similar projects are included in the cluster. A higher number indicates a better similarity score. (See “Details - Overpowered’s Scoring Mechanism” for more information.)
- **Likelihood of Approval**: This is the likelihood that a given project would succeed independent of the rest of the cluster, based on past project applications.
- **Location**: This measures the geospatial proximity between two projects.
- **Process**: This summarizes the readiness of each project. It includes operational variables, such as the project’s position in the Queue, the date it's expected to go online, and its permit status. We want to discourage “line skipping” by grouping projects that are closer together in the Queue. We also want to encourage ease of construction and real-life operations by having projects in the same geography go online at similar times.
- **Infrastructure**: This captures the similarity of the project build types. For example, two solar projects can be studied under the same set of assumptions, which is more efficient than two projects of different types.
- **Overall**: The overall similarity score between the base project and the given project.
"""
)